Handling a "load more" button when scraping a website with Scrapy and Selenium

Problem description

I am currently trying to scrape articles from the Nepali Times website. The challenge is that the site uses a "load more" button that has to be clicked to load additional articles. My crawl successfully retrieves the initial page with the first six articles, but it never manages to click the "load more" button to load the rest, so I cannot scrape anything beyond those first six articles.
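For context, this is the click-and-wait pattern I am aiming for once the listing page is open in the WebDriver. click_load_more is only an illustrative helper, and the XPath is my guess based on the button's btn--load class, not confirmed against the markup:

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def click_load_more(driver, timeout=10):
    # Click "load more" repeatedly until no clickable button is left.
    while True:
        try:
            button = WebDriverWait(driver, timeout).until(
                EC.element_to_be_clickable(
                    (By.XPATH, "//button[contains(@class, 'btn--load')]")
                )
            )
            button.click()
            # Wait for the clicked button to go stale, i.e. the page has re-rendered.
            WebDriverWait(driver, timeout).until(EC.staleness_of(button))
        except TimeoutException:
            break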

In addition, while crawling it keeps fetching URLs but, instead of the desired content, it gets back an "oops" page, which suggests something is wrong with the Selenium setup and the button-click logic.
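One thing I have been wondering is whether the unrestricted LinkExtractor in my rules contributes to this, since it follows every link on the site rather than just article pages. A sketch of a tighter rule I have considered; the allow pattern is an assumption about the site's URL layout, not something I have verified:

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

rules = (
    # Follow only links that look like article pages under /news/;
    # the pattern is a guess and may need adjusting to the real URL scheme.
    Rule(
        LinkExtractor(allow=r'/news/'),
        callback='parse_item',
        follow=True,
    ),
)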

Could someone explain how I should approach this? I would be very grateful!

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy.loader import ItemLoader


class NepaliSpider(CrawlSpider):
    name = "nepalitimes"
    allowed_domains = ["nepalitimes.com"]
    # Start URL for the spider
    start_urls = ['https://www.nepalitimes.com/news']

    custom_settings = {
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'nepali_times.csv'
    }

    # Rule to follow links to individual article pages
    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    # Handling the load button using Selenium --- still a work in progress <3
    def __init__(self, *args, **kwargs):
        super(NepaliSpider, self).__init__(*args, **kwargs)

        # Set up options for Chrome WebDriver
        options = webdriver.ChromeOptions()
        options.add_argument("start-maximized")
        options.add_experimental_option("detach", True)

        # Initialize the Chrome WebDriver
        self.driver = webdriver.Chrome(options=options)

    def parse_start_url(self, response, **kwargs):
        # Extract initial articles from the start URL
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse)

        # Use Selenium to click "Load More" buttons and wait until all articles are loaded
        while True:
            try:
                load_more_button = self.driver.find_element_by_xpath(
                    ".//button[contains(@class, 'btn btn--load center') and contains(., 'load more')]"
                )
                load_more_button.click()
                print("Load more button clicked")
                WebDriverWait(self.driver, 10).until(
                    EC.staleness_of(load_more_button)
                )
            except:
                break

        # Create a new response from the updated page source
        updated_body = self.driver.page_source
        new_response = response.replace(body=updated_body)
        print("New_response", new_response)

        # Extract newly loaded articles
        yield from self.parse(new_response)

    # Scraping all articles from Nepali_times - 100% WORKING
    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response, **kwargs):
        for result in response.xpath(".//div[contains(@class,'main--left')]/a"):
            relative_url = result.xpath("@href").extract_first()
            absolute_url = response.urljoin(relative_url)
            yield scrapy.Request(url=absolute_url, callback=self.parse_item)

    def parse_item(self, response):
        # This function should extract the article information from the provided response
        # and yield the scraped data as a dictionary

        # Extract article information using XPath selectors
        title = response.xpath('.//article[contains(@class,"article__full")]/h1/text()').get()
        subtitle = response.xpath('.//span[contains(@class,"article__subhead")]/text()').get()
        date = response.xpath(".//div/time[contains(@class,'article__time')]/text()").get()
        author = response.xpath('.//div/span[contains(@class,"article__author")]/span/text()').get()
        category = response.xpath(".//a[contains(@class,'active')]/text()").get()
        url = response.xpath(".//meta[contains(@property, 'og:url')]/@content").get()

        # Parse the HTML content
        content_elements = response.xpath('.//div[contains(@class,"article__text")]/p')
        text_content = [element.xpath("string(.)").get().strip() for element in content_elements]
        cleaned_content = ' '.join(text_content)

        yield {
            'title': title,
            'subtitle': subtitle,
            'author': author,
            'date': date,
            'content': cleaned_content,
            'category': category,
            'URL': url
        }
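For what it is worth, I have also noticed that parse_start_url never calls self.driver.get(), so the WebDriver may not even have the listing page open when I look for the button, and find_element_by_xpath no longer exists in Selenium 4 (the bare except would silently swallow that error and break out of the loop). Below is a sketch of the variant I have been experimenting with; it keeps the same button class in the XPath, which is still an assumption about the markup:

from selenium.common.exceptions import TimeoutException

    def parse_start_url(self, response, **kwargs):
        # Open the same URL in Selenium so the "load more" button actually
        # exists in the driver's page, not just in Scrapy's response.
        self.driver.get(response.url)

        while True:
            try:
                load_more_button = WebDriverWait(self.driver, 10).until(
                    EC.element_to_be_clickable(
                        (By.XPATH, "//button[contains(@class, 'btn--load')]")
                    )
                )
                load_more_button.click()
                # Wait for the clicked button to go stale before looking again.
                WebDriverWait(self.driver, 10).until(
                    EC.staleness_of(load_more_button)
                )
            except TimeoutException:
                # No clickable "load more" button left: all articles are loaded.
                break

        # Re-parse the fully loaded page with the normal Scrapy callbacks.
        new_response = response.replace(body=self.driver.page_source)
        yield from self.parse(new_response)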
Tags: python, selenium-webdriver, web-scraping, scrapy