How to scrape data up to the last page with BeautifulSoup4 in Python?

Question · votes: 0 · answers: 1
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = 'https://www.eventbrite.com/d/malaysia--kuala-lumpur--85675181/all-events/?page=1'

# open the connection and download the page
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()

# html parser
page_soup = soup(page_html, "html.parser")

# collect each event card
card = page_soup.findAll("div",{"class":"eds-media-card-content__content"})

filename = "Data_Events.csv"
f = open(filename, "w")

headers = "events_name, events_dates, events_location, events_fees\n"

f.write(headers)

for activity in card:

    event_activity = activity.findAll("div",{"class":"eds-event-card__formatted-name--is-clamped"})
    events_name = event_activity[0].text

    event_date = activity.findAll("div",{"class":"eds-text-bs--fixed eds-text-color--grey-600 eds-l-mar-top-1"})
    events_dates = event_date[0].text
    events_location = event_date[1].text
    events_fees = event_date[2].text

    print("events_name: " + events_name)
    print("events_dates: " + events_dates)
    print("events_location: " + events_location)
    print("events_fees: " + events_fees)

    f.write(events_name + "," + events_dates + "," + events_location + "," + events_fees + "\n")

f.close()

Hi, I'm still a beginner with Python, and I'd like to know how to write a function so this script can also fetch data from the next pages of the website.

I have already tried something like:

for pages in page (1, 49)
my_url = 'https://www.eventbrite.com/d/malaysia--kuala-lumpur--85675181/all-events/?page=1'

Any suggestions would be greatly appreciated.

python web-scraping
1 Answer
0 votes
import itertools
import requests
from bs4 import BeautifulSoup

def parse_page(url, page):
    params = dict(page=page)
    resp = requests.get(url, params=params)  # requests appends ?page=<n> to the URL
    soup = BeautifulSoup(resp.text, 'html.parser')
    ... # parse data from page


url = 'https://www.eventbrite.com/d/malaysia--kuala-lumpur--85675181/all-events'

for page in itertools.count(start=1): # don't need to know total pages
    try:
        parse_page(url, page)
    except Exception:
        # `parse_page` expects the event-listing markup; once there are no
        # more pages to scrape, parsing fails and we stop the loop
        break
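
Building on that, here is a minimal sketch of what the body of parse_page could look like when it reuses the selectors and CSV output from the question. The CSS class names are copied verbatim from the question and may no longer match Eventbrite's current markup; raising an exception when a page contains no event cards is one assumed way to detect that the last page has been passed, and the extra writer parameter is an addition for this sketch.

import csv
import itertools

import requests
from bs4 import BeautifulSoup

url = 'https://www.eventbrite.com/d/malaysia--kuala-lumpur--85675181/all-events'


def parse_page(url, page, writer):
    resp = requests.get(url, params={'page': page})
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')

    # class names taken from the question; they may have changed since
    cards = soup.find_all('div', {'class': 'eds-media-card-content__content'})
    if not cards:
        # no event cards on this page; assume we went past the last page
        raise ValueError('no more event cards')

    for card in cards:
        name = card.find('div', {'class': 'eds-event-card__formatted-name--is-clamped'}).text
        details = card.find_all('div', {'class': 'eds-text-bs--fixed eds-text-color--grey-600 eds-l-mar-top-1'})
        # same positional assumption as the question: date, location, fees
        writer.writerow([name, details[0].text, details[1].text, details[2].text])


with open('Data_Events.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['events_name', 'events_dates', 'events_location', 'events_fees'])
    for page in itertools.count(start=1):
        try:
            parse_page(url, page, writer)
        except Exception:
            break

Using csv.writer instead of hand-built strings also keeps rows intact when an event name or date itself contains a comma.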