Scraping pagination fails when the page links are followed in a for loop


My Scrapy spider only fetches content from the first page. I collect all the pagination links from the landing page and then iterate over them in a for loop. An example page URL is "https://www.moneycontrol.com/news/business/markets/page-2/", and the pages are numbered sequentially from 2 to 30. I have verified that the links are correctly formatted.

import scrapy

from bookworm.items import NewsItem
from datetime import datetime

class SpidermoneySpider(scrapy.Spider):
    name = "spidermoney"
    allowed_domains = ["www.moneycontrol.com"]
    start_urls = ["https://www.moneycontrol.com/news/business/markets/"]
    
    def parse(self, response):
        # Parse 1st page
        yield from self.parse_page_url(response=response)

        # Follow the pagination links (pages 2..30)
        root_url = "https://www.moneycontrol.com"
        urls2follow = response.xpath('//div[@class="pagenation"]/a[contains(@href, "page")]/@href').getall()
        
        urls2follow = [root_url + url2follow for url2follow in urls2follow]  # 'https://www.moneycontrol.com/news/business/markets/page-2/'
        urls_seen = set()
        for url2follow in urls2follow:
            if url2follow not in urls_seen:
                urls_seen.add(url2follow)
                yield response.follow(url2follow, callback=self.parse_page_url)          

    def parse_page_url(self, response):
        article_URLs = response.css('li.clearfix a::attr(href)').getall()
        for article_URL in article_URLs:
            yield response.follow(article_URL, callback=self.parse_article_url)

    def parse_article_url(self, response):

        news_item = NewsItem()

        # Scrape content
        art_desc = response.css('h2.article_desc::text').get()
        art_para = response.xpath('//div[@id="contentdata"]/p//text()').getall()
        art_content = art_desc + ''.join(art_para)

        # Scrape timestamp
        schedule = response.css('div.article_schedule ::text').getall()
        ts_string = ''.join(schedule).strip().upper()
        ts_object = datetime.strptime(ts_string, "%B %d, %Y / %I:%M %p %Z")
        ts_format = ts_object.strftime('%Y-%m-%d %H:%M:%S')

        # Build the item - news article
        news_item['headline'] = response.css('h1.article_title::text').get() 
        news_item['url'] = response.url 
        news_item['content'] = art_content 
        news_item['author'] = response.xpath('//div[@class="article_author"]/*/text()').get(default="Moneycontrol Contributor")       
        news_item['published'] = ts_format

        yield news_item

The code has been simplified for readability. The class name "pagenation" is not a typo - that is how the class is named on the site. Thanks for your help.

python scrapy
1 Answer

It turned out to be a settings issue. In settings.py, set

 ROBOTSTXT_OBEY = False
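The same override can also be scoped to a single spider via Scrapy's custom_settings class attribute, which leaves the project-wide default untouched. A minimal sketch, assuming the same spider class as above:

import scrapy

class SpidermoneySpider(scrapy.Spider):
    name = "spidermoney"
    # Per-spider override: only this spider ignores robots.txt;
    # other spiders in the project keep the value from settings.py.
    custom_settings = {
        "ROBOTSTXT_OBEY": False,
    }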

Here is the code running correctly with the new setting:

import scrapy

from bookworm.items import NewsItem
from datetime import datetime

class SpidermoneySpider(scrapy.Spider):
    name = "spidermoney"
    allowed_domains = ["www.moneycontrol.com"]
    # Build the listing URLs up front instead of following the on-page pagination links
    landing_page = ["https://www.moneycontrol.com/news/business/markets/"]
    pagination_urls = [f"https://www.moneycontrol.com/news/business/markets/page-{i}/" for i in range(2, 31)]
    start_urls = landing_page + pagination_urls

    def parse(self, response):
        yield from self.parse_page_url(response=response)

    def parse_page_url(self, response):
        article_URLs = response.css('li.clearfix a::attr(href)').getall()
        for article_URL in article_URLs:
            yield response.follow(article_URL, callback=self.parse_article_url)

    def parse_article_url(self, response):

        news_item = NewsItem()

        # Scrape content
        art_desc = response.css('h2.article_desc::text').get()
        art_para = response.xpath('//div[@id="contentdata"]/p//text()').getall()
        art_content = art_desc + ''.join(art_para)

        # Scrape timestamp
        schedule = response.css('div.article_schedule ::text').getall()
        ts_string = ''.join(schedule).strip().upper()
        ts_object = datetime.strptime(ts_string, "%B %d, %Y / %I:%M %p %Z")
        ts_format = ts_object.strftime('%Y-%m-%d %H:%M:%S')

        # Build the item - news article
        news_item['headline'] = response.css('h1.article_title::text').get() 
        news_item['url'] = response.url 
        news_item['content'] = art_content 
        news_item['author'] = response.xpath('//div[@class="article_author"]/*/text()').get(default="Moneycontrol Contributor")       
        news_item['published'] = ts_format

        yield news_item
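For context on why this fixes it: with ROBOTSTXT_OBEY = True (the default in projects generated by scrapy startproject), Scrapy's RobotsTxtMiddleware silently drops any request that the site's robots.txt disallows, and the fact that only the landing page was scraped suggests the page-N URLs fall under such a rule. The dropped requests typically show up in the crawl log as "DEBUG: Forbidden by robots.txt: <GET ...>", which is the quickest way to spot the problem. To verify the fix, a run such as the following (articles.json is just an illustrative output file) should now yield items from all 30 listing pages:

scrapy crawl spidermoney -o articles.json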