我目前正在尝试从尼泊尔时报网站上抓取文章。我面临的挑战是该网站使用了“加载更多”按钮,我需要单击该按钮才能加载其他文章。但是,我的抓取过程成功检索了包含前六篇文章的初始页面,但无法单击“加载更多”按钮来加载其余文章。结果,除了最初的六篇文章之外,我无法抓取任何内容。
此外,在抓取过程中,它继续获取 URL,但没有获得所需的内容,而是返回“oops”页面,这表明 Selenium 和按钮单击功能存在问题。
如果有人可以解释我该如何处理这个问题?我将非常感激!
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy.loader import ItemLoader
class NepaliSpider(CrawlSpider):
name = "nepalitimes"
allowed_domains = ["nepalitimes.com"]
# Start URL for the spider
start_urls = ['https://www.nepalitimes.com/news']
custom_settings = {
'FEED_FORMAT': 'csv',
'FEED_URI': 'nepali_times.csv'
}
# Rule to follow links to individual article pages
rules = (
Rule(LinkExtractor(), callback='parse_item', follow=True),
)
# Handling the load button using Selenium --- En cours de pulvérisation <3
def __init__(self, *args, **kwargs):
super(NepaliSpider, self).__init__(*args, **kwargs)
# Set up options for Chrome WebDriver
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_experimental_option("detach", True)
# Initialize the Chrome WebDriver
self.driver = webdriver.Chrome(options=options)
def parse_start_url(self, response, **kwargs):
# Extract initial articles from the start URL
for url in self.start_urls:
yield scrapy.Request(url, callback=self.parse)
# Use Selenium to click "Load More" buttons and wait until all articles are loaded
while True:
try:
load_more_button = self.driver.find_element_by_xpath(
".//button[contains(@class, 'btn btn--load center') and contains(., 'load more')]"
)
load_more_button.click()
print("Load more button clicked")
WebDriverWait(self.driver, 10).until(
EC.staleness_of(load_more_button)
)
except:
break
# Create a new response from the updated page source
updated_body = self.driver.page_source
new_response = response.replace(body=updated_body)
print("New_response", new_response)
# Extract newly loaded articles
yield from self.parse(new_response)
# Scraping all articles from Nepali_times - 100% WORKING
def start_requests(self):
for url in self.start_urls:
yield scrapy.Request(url, callback=self.parse)
def parse(self, response, **kwargs):
for result in response.xpath(".//div[contains(@class,'main--left')]/a"):
relative_url = result.xpath("@href").extract_first()
absolute_url = response.urljoin(relative_url)
yield scrapy.Request(url=absolute_url, callback=self.parse_item)
def parse_item(self, response):
# This function should extract the article information from the provided response
# and yield the scraped data as a dictionary
# Extract article information using XPath selectors
title = response.xpath('.//article[contains(@class,"article__full")]/h1/text()').get()
subtitle = response.xpath('.//span[contains(@class,"article__subhead")]/text()').get()
date = response.xpath(".//div/time[contains(@class,'article__time')]/text()").get()
author = response.xpath('.//div/span[contains(@class,"article__author")]/span/text()').get()
category = response.xpath(".//a[contains(@class,'active')]/text()").get()
url = response.xpath(".//meta[contains(@property, 'og:url')]/@content").get()
# Parse the HTML content
content_elements = response.xpath('.//div[contains(@class,"article__text")]/p')
text_content = [element.xpath("string(.)").get().strip() for element in content_elements]
cleaned_content = ' '.join(text_content)
yield {
'title': title,
'subtitle': subtitle,
'author': author,
'date': date,
'content': cleaned_content,
'category': category,
'URL': url
}