from bs4 import BeautifulSoup
import requests #importing beautifulsoup and requests
url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html') #storing beautiful soup in soup variable
print(soup) # to see what it displays
soup.find('table') # finding every tag labelled table
soup.find('table', class_ = 'wikitable sortable') # trying to specify tables
table = soup.find_all('table')[0] # this is the one I want
print(table) # to see what it displays
table.find_all('th') # find all the th (column headings) tags in the table.
world_titles = table.find_all('th') # so i can just type world_titles instead of table.find_all('th') all the time
world_table_titles = [title.text.strip() for title in world_titles] # removing /n and making data clean
print(world_table_titles) # seeing what it displays
import pandas as pd # importing pandas
df = pd.DataFrame(columns = world_table_titles) # making a dataframe
df
column_data = table.find_all('tr') # finding rows within my table
for row in column_data[2:]: # [2:] because the first two just displayed []
row_data = row.find_all('td')
individual_row_data = [data.text.strip() for data in row_data] #clean version of row_data
length = len(df)
df.loc[length] = individual_row_data
print(individual_row_data)
我试图从维基百科网站上抓取表格,然后收到此错误。我做错了什么?我该如何解决它?
我尝试在 youtube 和网上寻找解决方案,但没有找到帮助。
检查您对
world_titles
的选择,ResultSet
比您想象的要长得多。以下更接近您的期望:
world_titles = table.tr.find_all('th')
但是您会遇到另一个问题,因此有些列没有任何文本,这将导致列表长度不同。所以检查一下:
...
world_titles = table.tr.find_all('th')
world_table_titles = [title.text.strip() for title in world_titles]
column_data = table.find_all('tr')
data = []
for row in column_data[2:]: # [2:] because the first two just displayed []
row_data = row.find_all('td')
data.append([data.text.strip() for data in row_data])
pd.DataFrame(data,columns=world_table_titles[1:])
但是 - 您可以使用
pandas.read_html()
获取数据框并对其进行调整:
import pandas as pd
#read in the first table
df = pd.read_html('https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue')[0]
# clean the column headers
df.columns = [' '.join(e for e in set(c) if not 'Unnamed' in e) for c in df.columns]
# check the result
df