Web scraping with Python on a website without pagination


I'm scraping data from a website with Selenium and BS4 and saving it to a JSON file. Because the site has no pagination structure (no per-page URLs), I switched to a Selenium web driver. My old code, before adding Selenium, collected the data fine, but now I end up with an empty JSON file. How can I fix this without breaking the existing structure?

My old code (collected the data successfully):

from bs4 import BeautifulSoup
import cloudscraper
import json

url = "https://www.brickeconomy.com/sets/year/2024"

# Create a scraper instance
scraper = cloudscraper.create_scraper()

# Send a GET request to the URL
response = scraper.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # List to hold all set data
    sets_data = []

    # Find all table rows containing set information
    table_rows = soup.find('table', id='ContentPlaceHolder1_ctlSets_GridViewSets').find_all('tr', align='left')

    # Iterate over each row to extract set details
    for row in table_rows:
        set_info = {}

        # Find the <h4> element containing the set name and ID
        set_name_elem = row.find('h4')
        if set_name_elem:
            set_string = set_name_elem.text.strip()
            set_info['id'], set_info['name'] = set_string.split(' ', 1)

        # Find <div> elements containing Year, Pieces/Minifigs, and other information
        div_elements = row.find_all('div', class_='mb-2')

        for div in div_elements:
            label = div.find('small', class_='text-muted mr-5')
            if label:
                label_text = label.text.strip()

                if label_text == 'Year':
                    set_info['year'] = div.text.replace('Year', '').strip()

        # Find all <td> elements with class="ctlsets-right text-right"
        td_elements = row.find_all('td', class_='ctlsets-right text-right')

        # Process each <td> element
        for td in td_elements:
            div_elements = td.find_all('div')
            for div in div_elements:
                # If the div content contains "Retail", get the price from the next sibling
                if "Retail" in div.text:
                    retail_price = div.text.strip()
                    price_without_retail = ' '.join(retail_price.split()[1:])
                    set_info['price'] = price_without_retail

                    first_sibling = div.find_next_sibling()
                    if first_sibling:
                        content = first_sibling.text.strip()
                        set_info['retail'] = content

                        second_sibling = first_sibling.find_next_sibling()
                        if second_sibling:
                            content2 = second_sibling.text.strip()
                            set_info['detail'] = content2
                        else:
                            set_info['detail'] = "None"
                    else:
                        print("Not Found Retail.")

        # Add the set information to the list
        sets_data.append(set_info)

    # Convert the extracted set data to JSON format and write to a file
    with open('sets_data.json', 'w') as json_file:
        json.dump(sets_data, json_file, ensure_ascii=False, indent=4)

    print("Sets data extracted successfully and saved to sets_data.json.")

else:
    print("HTTP Error Code:", response.status_code)

My current code (with the web driver):

import json
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# Initialize WebDriver (Safari, Chrome, Firefox, etc.)
driver = webdriver.Chrome()  # or change to webdriver.Firefox() or webdriver.Safari()

url = "https://www.brickeconomy.com/sets/year/2024"
max_iterations = 2  # Specify how many pages to fetch
delay_seconds = 2  # Delay time between each page transition (seconds)

all_sets_data = []  # List to hold all set data

try:
    for i in range(max_iterations):
        driver.get(url)

        # Wait for the table to load when the page is loaded
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, 'ContentPlaceHolder1_ctlSets_GridViewSets')))

        # Process the HTML content of the page using BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        sets_data = []

        # Find all rows in the table
        table = soup.find('table', id='ContentPlaceHolder1_ctlSets_GridViewSets')
        if table:
            table_rows = table.find_all('tr', align='left')

            # Extract set information from each row
            for row in table_rows:
                set_info = {}

                # Find the <h4> element containing the set name
                set_name_elem = row.find('h4')
                if set_name_elem:
                    set_string = set_name_elem.text.strip()
                    set_info['id'], set_info['name'] = set_string.split(' ', 1)

                # Find <div> elements containing Year and other information
                div_elements = row.find_all('div', class_='mb-2')

                for div in div_elements:
                    label = div.find('small', class_='text-muted mr-5')
                    if label:
                        label_text = label.text.strip()

                        if label_text == 'Year':
                            set_info['year'] = div.text.replace('Year', '').strip()

                sets_data.append(set_info)

            # Add the extracted set data to the list of all sets
            all_sets_data.extend(sets_data)

            print(f"Sets data for iteration {i + 1} extracted successfully.")

            # Click the "Next" button to go to the next page
            next_button = driver.find_element(By.XPATH, "//a[contains(text(), 'Next')]")
            if next_button:
                next_button.click()

                # Wait for a specified time before the next iteration (rate limiting)
                time.sleep(delay_seconds)
            else:
                print("Next button not found. Exiting loop.")
                break
        else:
            print("Table not found. Exiting loop.")
            break

except Exception as e:
    print(f"An error occurred: {str(e)}")

finally:
    # Close the WebDriver
    driver.quit()

    # Write all set data to a single JSON file
    if all_sets_data:
        with open('all_sets_data.json', 'w') as json_file:
            json.dump(all_sets_data, json_file, ensure_ascii=False, indent=4)
        print("All sets data extracted successfully and saved to all_sets_data.json.")
    else:
        print("No sets data extracted or saved.")

Current output:

[
    {},
    {},
    {},
    {},
    {},
...
]
1 Answer

Remove align='left' — the rows matched by that filter are empty.

table_rows = table.find_all('tr')

It works correctly now.
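For context, here is a minimal sketch of the extraction step with that change applied inside the Selenium loop. It assumes the imports, driver, and table id from the current code above; the <h4> guard is an addition (not part of the answer) to skip the header and pager rows that find_all('tr') now also returns:

soup = BeautifulSoup(driver.page_source, 'html.parser')
table = soup.find('table', id='ContentPlaceHolder1_ctlSets_GridViewSets')

sets_data = []
if table:
    for row in table.find_all('tr'):  # no align='left' filter
        set_name_elem = row.find('h4')
        if not set_name_elem:
            # Assumption: rows without an <h4> are header/pager rows, so skip them
            continue

        set_info = {}
        set_info['id'], set_info['name'] = set_name_elem.text.strip().split(' ', 1)

        for div in row.find_all('div', class_='mb-2'):
            label = div.find('small', class_='text-muted mr-5')
            if label and label.text.strip() == 'Year':
                set_info['year'] = div.text.replace('Year', '').strip()

        sets_data.append(set_info)

With the guard in place, only rows that actually contain a set name end up in sets_data, so the output no longer fills with empty objects.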
