对于这个维基百科页面: https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population ,我该如何用 Python 提取中国和印度的人口数量和百分比,尽管这两个国家在表格中共用了同一个单元格(并列排名)?目前我的代码可以正确提取除印度以外其他国家的人口。
我的代码:
import requests
from bs4 import BeautifulSoup
def load_population_dict(url):
    """Build a ``{country: (population, percentage)}`` mapping from a web page.

    Downloads ``url``, takes the first ``<table class="wikitable">`` and reads
    every row after the first one. For each row the text of the ``<td>`` cells
    at positions 1, 2 and 3 is used as country, population and percentage.

    NOTE: the positions are fixed, so a row that has one ``<td>`` fewer than
    the others is read one column off. This is the India problem described in
    the question.
    """
    page = requests.get(url)
    parsed = BeautifulSoup(page.content, 'html.parser')
    first_table = parsed.find('table', {'class': 'wikitable'})

    population_dict = {}
    # Slice off the first <tr>: it is the header row.
    for tr in first_table.find_all('tr')[1:]:
        cells = tr.find_all('td')
        name = cells[1].text.strip()
        headcount = cells[2].text.strip()
        share = cells[3].text.strip()
        # Population and percentage are kept together as a tuple.
        population_dict[name] = (headcount, share)
    return population_dict
一种可能的解决方案是检查当前行有多少个单元格并相应地调整提取:
import requests
from bs4 import BeautifulSoup
def load_population_dict(url, timeout=10):
    """Scrape ``{country: (population, percentage)}`` from a page's first wikitable.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout ``requests`` may block indefinitely.

    Returns:
        A dict mapping the country cell text to a ``(population, percentage)``
        tuple of stripped cell strings.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error response instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    population_dict = {}
    table = soup.find("table", {"class": "wikitable"})
    rows = table.find_all("tr")[1:]  # Skip the header row
    for row in rows:
        data = row.find_all("td")

        # Default layout: the country is the second <td> of the row.
        country_idx, population_idx, pct_idx = 1, 2, 3
        if len(data) == 6:
            # A 6-cell row lacks its own leading cell (presumably it is
            # shared with the previous row via rowspan, as for China/India),
            # so every column sits one position to the left.
            country_idx, population_idx, pct_idx = 0, 1, 2

        if len(data) <= pct_idx:
            # Rows without enough <td> cells (e.g. header-only rows) carry
            # no data; skip them instead of raising IndexError.
            continue

        country = data[country_idx].text.strip()
        population = data[population_idx].text.strip()
        population_dict[country] = (
            population,
            data[pct_idx].text.strip(),
        )
    return population_dict
# Example run: download the live Wikipedia table (needs network access)
# and print the resulting country -> (population, percentage) mapping.
url = "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population"
out = load_population_dict(url)
print(out)
打印:
{
"World": ("8,092,488,000", "100%"),
"China": ("1,409,670,000", "17.4%"),
"India": ("1,392,329,000", "17.2%"),
"United States": ("335,893,238", "4.2%"),
"Indonesia": ("279,118,866", "3.4%"),
"Pakistan": ("241,499,431", "3.0%"),
"Nigeria": ("216,783,381", "2.7%"),
"Brazil": ("203,080,756", "2.5%"),
"Bangladesh": ("169,828,911", "2.1%"),
"Russia": ("146,424,729", "1.8%"),
"Mexico": ("129,406,736", "1.6%"),
"Japan": ("124,090,000", "1.5%"),
"Philippines": ("112,892,781", "1.4%"),
"Ethiopia": ("107,334,000", "1.3%"),
"Egypt": ("104,462,545", "1.3%"),
"Vietnam": ("100,300,000", "1.2%"),
...