我目前正在按照教程进行操作(我以前从未做过这类事情,也从未用过 Python),尝试从一个活动网页中提取一些数据,包括所列活动的名称、日期和地点。程序似乎把数据提取或输出了两次,但我在代码中看不到任何会导致重复的语句。任何帮助都将不胜感激!
from time import sleep
from time import time
from random import randint
from warnings import warn
from bs4 import BeautifulSoup
from requests import get
import pandas

# Scrape the name, date and location of every event listed on the first
# few result pages of an Eventbrite city listing, then collect them into
# a pandas DataFrame.

# Page numbers to visit (pages 1 and 2).
pages = [str(i) for i in range(1, 3)]
# Base URL; the page number is appended per request inside the loop.
# (The original built the URL once from str(pages) — the string form of
# the whole list — so every request hit the same malformed URL.)
base_url = 'https://www.eventbrite.com/d/malaysia--kuala-lumpur--85675181/all-events/?page='

name = []
date = []
location = []

start_time = time()
request_count = 0

for page in pages:
    response = get(base_url + page)
    sleep(randint(1, 3))  # be polite: randomized throttle between requests
    request_count += 1
    elapsed_time = time() - start_time
    print('Request: {}; Frequency: {} requests/s'.format(
        request_count, request_count / elapsed_time))
    if response.status_code != 200:
        # warn() was previously called without being imported (NameError);
        # it is now imported from the warnings module above.
        warn('Request: {}; Status Code: {}'.format(
            request_count, response.status_code))

    html_soup = BeautifulSoup(response.text, 'html.parser')
    # Main card container for each listed event.
    event_containers = html_soup.find_all(
        'div', class_='eds-media-card-content__content__principal')
    for container in event_containers:
        # Event name.
        name.append(container.h3.div.div.text)
        # Event day & date.
        date.append(container.div.div.text)
        # Event location: extract the text, not the Tag object, and guard
        # against cards that have no location div.
        loc_div = container.find('div', class_='card-text--truncated__one')
        location.append(loc_div.text if loc_div is not None else None)

event_list = pandas.DataFrame({
    'event': name,
    'date': date,
    'location': location,
})
print(event_list)
不是的,您的代码里并没有会导致重复的逻辑,而是 HTML 源码中这些内容确实出现了两次(原因不明)。您只需在最后删除重复的行即可。
不过还有另一个问题:您实际上并没有遍历每个页面。您需要在 for 循环内部构造 url 才能做到这一点:
from time import sleep
from time import time
from random import randint
from bs4 import BeautifulSoup
from requests import get
import pandas

# Scrape the name, date and location of every event listed on the first
# few result pages of an Eventbrite city listing, deduplicate the rows
# (the page HTML repeats each card twice), and print the result.

# Page numbers to visit (pages 1 and 2).
pages = [str(i) for i in range(1, 3)]

name = []
date = []
location = []

start_time = time()
request_count = 0

for page in pages:
    # Build the URL inside the loop so each iteration fetches its own page.
    url = ('https://www.eventbrite.com/d/malaysia--kuala-lumpur--85675181/'
           'all-events/?page=' + page)
    response = get(url)
    sleep(randint(1, 3))  # be polite: randomized throttle between requests
    request_count += 1
    elapsed_time = time() - start_time
    print('Request: {}; Frequency: {} requests/s'.format(
        request_count, request_count / elapsed_time))
    if response.status_code != 200:
        print('Request: {}; Status Code: {}'.format(
            request_count, response.status_code))

    html_soup = BeautifulSoup(response.text, 'html.parser')
    # Main card container for each listed event.
    event_containers = html_soup.find_all(
        'div', class_='eds-media-card-content__content__principal')
    for container in event_containers:
        # Event name.
        name.append(container.h3.div.div.text)
        # Event day & date.
        date.append(container.div.div.text)
        # Event location: extract the text, not the Tag object. Appending
        # the raw Tag made drop_duplicates() below raise TypeError, since
        # bs4 Tag objects are unhashable. Guard against missing divs.
        loc_div = container.find('div', class_='card-text--truncated__one')
        location.append(loc_div.text if loc_div is not None else None)

event_list = pandas.DataFrame({
    'event': name,
    'date': date,
    'location': location})
# The page HTML lists every card twice, so drop the duplicate rows.
event_list = event_list.drop_duplicates()
print(event_list)