from bs4 import BeautifulSoup
import requests
from csv import writer
# Scrape the 'memberlist-table' HTML table from the CFPC members page and
# write it to test1.csv: the <th> cells of the first <tr> become the header
# row, and the <td> cells of every following <tr> become one CSV row each.
#
# NOTE(review): only rows present in the HTML returned by requests can be
# captured. If the page fills the table client-side, the CSV will contain
# the header only.

# A timeout keeps a stalled connection from hanging the script forever, and
# raise_for_status() surfaces HTTP errors instead of parsing an error page.
response = requests.get('https://www.cfpc.ca/en/members-list', timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'lxml')
table = soup.find('table', id='memberlist-table')
if table is None:
    # Fail with a clear message rather than an AttributeError on None.
    raise RuntimeError("table with id 'memberlist-table' not found in page")

header = []
rows = []
for i, row in enumerate(table.find_all('tr')):
    if i == 0:
        header = [el.text.strip() for el in row.find_all('th')]
    else:
        rows.append([el.text.strip() for el in row.find_all('td')])

# Open the file once and write each table row as its own CSV row. The
# previous version indexed rows[j] .. rows[j+3] before those rows had been
# collected (IndexError) and reopened the file for every row.
with open('test1.csv', 'w', newline='', encoding='utf-8') as f:
    thewriter = writer(f)
    thewriter.writerow(header)
    thewriter.writerows(rows)
我运行了这段代码,在写入表头之前一切都按预期工作,但随后程序就结束了,就好像表头之后的各行中没有任何数据一样。
问题在于数据是通过 JavaScript 动态加载和渲染的,而 requests 不支持执行 JavaScript。
可以尝试找到提供这些数据的 API,或者使用 selenium 等能够模拟浏览器行为的工具。
import requests, time
# Page through the member-list AJAX endpoint, 100 records per request,
# accumulating every returned record in `data` until a page comes back empty.
data = []

# Starting page is kept exactly as in the original snippet.
# NOTE(review): confirm whether the crawl should begin at page 1 instead.
page_number = 442

while True:
    # A timeout keeps a stalled connection from hanging the loop, and
    # raise_for_status() stops on HTTP errors instead of decoding an error
    # body as JSON.
    response = requests.get(
        f'https://www.cfpc.ca/ajax/memberlist/members.ashx?PageNumber={page_number}&PageSize=100',
        timeout=30,
    )
    response.raise_for_status()

    # Tolerate a missing or null 'responseStatus' / 'memberList' by treating
    # it as an empty page; the chained .get() calls used before would raise
    # AttributeError / TypeError in that case.
    result = (response.json().get('responseStatus') or {}).get('memberList') or []
    if not result:
        break

    data.extend(result)
    page_number += 1
    # Pause between requests so the server is not hit back-to-back.
    time.sleep(3)

# Bare expression: echoes the collected records in a REPL/notebook session.
data