Hi, I got this code from the web. Currently it fetches the URL and company information for the tickers below. Is there any way to update this code so that it shows the sector and industry information instead of the URL and company info? New to coding, so any help would be appreciated :)

The code below:
import bs4 as BeautifulSoup
from bs4 import SoupStrainer
import re
import urllib.request
import pandas as pd
import requests
symbols = ['SBUX', 'MET', 'CAT', 'JNJ', 'ORCL']
headers = {'User-agent': 'Mozilla/5.0'}
mySymbols = {}
for s in symbols:
    vals = {}
    url = ("https://finance.yahoo.com/quote/{}/profile?p={}".format(s,s))
    webpage = requests.get(url, headers=headers)
    soup = BeautifulSoup.BeautifulSoup(webpage.content)
    title = soup.find("title")
    tmp = title.get_text()
    rxTitle = re.compile(r'\(.*$')
    coName = rxTitle.sub("", tmp)
    for link in soup.find_all('a', href=True):
        try:
            if link['target'] and "" == link['title']:
                m = re.search('yahoo', link['href'], flags=re.IGNORECASE)
                if None == m:
                    url = link['href']
                    webpage = requests.get(url, headers=headers)
                    soup = BeautifulSoup.BeautifulSoup(webpage.content)
                    vals = {"company":coName, "url":link['href']}
                    print(s, vals)
                    mySymbols[s] = vals
        except:
            pass
Looking at one of these pages, I see the sector in a span with 'class'='Fw(600)' and 'data-reactid'=21, and the industry in one with 'data-reactid'=25, so you could use:
sector = soup.find('span', {'class': 'Fw(600)', 'data-reactid': '21'})
print(sector.next)
industry = soup.find('span', {'class': 'Fw(600)', 'data-reactid': '25'})
print(industry.next)
sector.next gets just the content inside the span, instead of returning the whole tag.
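To see that difference without hitting the network, here is a minimal, self-contained sketch; the HTML fragment is made up purely for illustration:

import bs4 as BeautifulSoup

# Made-up markup mimicking the profile page's sector span.
html = '<p><span>Sector</span><span class="Fw(600)">Consumer Cyclical</span></p>'
soup = BeautifulSoup.BeautifulSoup(html, 'html.parser')

sector = soup.find('span', {'class': 'Fw(600)'})
print(sector)       # whole tag: <span class="Fw(600)">Consumer Cyclical</span>
print(sector.next)  # just the text node inside: Consumer Cyclical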
A better way, which looks for the 'Sector' and 'Industry' spans and returns the span that follows each, is fully coded below:
import bs4 as BeautifulSoup
import requests

def get_tags(url):
    # headers is the module-level dict defined below; it exists by the
    # time get_tags is first called.
    webpage = requests.get(url, headers=headers)
    soup = BeautifulSoup.BeautifulSoup(webpage.content)
    title = soup.find("title")
    results = {}
    tmp = title.get_text()
    results['title'] = tmp
    # Scan every span; when one's text is exactly 'Sector' or 'Industry',
    # the value we want is in the very next span.
    spans = soup.findAll('span')
    for i in range(len(spans)):
        if spans[i] and spans[i].text == 'Sector':
            sector = spans[i+1].text
            results['Sector'] = sector
        if spans[i] and spans[i].text == 'Industry':
            industry = spans[i+1].text
            results['Industry'] = industry
    return results

headers = {'User-agent': 'Mozilla/5.0'}
symbols = ['SBUX', 'MET', 'CAT', 'JNJ', 'ORCL']
for s in symbols:
    url = ("https://finance.yahoo.com/quote/{}/profile?p={}".format(s,s))
    results = get_tags(url)
    print(results)
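Since your original code imports pandas and builds a mySymbols dict, here is one way you could collect the per-symbol results; a minimal sketch, assuming get_tags, headers, and symbols are defined as above and that the Sector/Industry spans were actually found:

import pandas as pd

# One result dict per symbol, then a DataFrame indexed by ticker.
mySymbols = {}
for s in symbols:
    url = "https://finance.yahoo.com/quote/{}/profile?p={}".format(s, s)
    mySymbols[s] = get_tags(url)

df = pd.DataFrame.from_dict(mySymbols, orient='index')
print(df[['Sector', 'Industry']])  # raises KeyError if a column is entirely missing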