我的 scrapy 蜘蛛仅获取第一页的内容。我从主页获取所有分页链接,然后在 for 循环中迭代它们。示例页面网址为“https://www.moneycontrol.com/news/business/markets/page-2/”,页面按顺序从 2 到 30 编号。我已验证这些链接的格式是正确的。
import scrapy
from bookworm.items import NewsItem
from datetime import datetime
import random
class SpidermoneySpider(scrapy.Spider):
    """Crawl moneycontrol.com market news.

    Starts on the markets landing page, follows every pagination link
    found there (pages 2..30), and yields one NewsItem per article.
    """

    name = "spidermoney"
    allowed_domains = ["www.moneycontrol.com"]
    start_urls = ["https://www.moneycontrol.com/news/business/markets/"]

    def parse(self, response):
        """Parse the landing page, then follow each pagination link once."""
        # Articles listed on the first page itself.
        yield from self.parse_page_url(response=response)
        # NOTE: 'pagenation' is the site's actual CSS class name, not a typo.
        # Hrefs look like '/news/business/markets/page-2/'; response.follow
        # resolves relative URLs against the response, so no manual
        # root-URL concatenation is needed.
        pagination_hrefs = response.xpath(
            '//div[@class="pagenation"]/a[contains(@href, "page")]/@href'
        ).getall()
        # dict.fromkeys de-duplicates while preserving document order.
        for href in dict.fromkeys(pagination_hrefs):
            yield response.follow(href, callback=self.parse_page_url)

    def parse_page_url(self, response):
        """Follow every article link on one listing page."""
        for article_url in response.css('li.clearfix a::attr(href)').getall():
            yield response.follow(article_url, callback=self.parse_article_url)

    def parse_article_url(self, response):
        """Scrape a single article page into a NewsItem."""
        news_item = NewsItem()
        # The description header is missing on some articles; .get() then
        # returns None and concatenation would raise TypeError -> default ''.
        art_desc = response.css('h2.article_desc::text').get() or ''
        art_para = response.xpath('//div[@id="contentdata"]/p//text()').getall()
        # Timestamp text, e.g. "January 15, 2024 / 10:30 AM IST".
        schedule = response.css('div.article_schedule ::text').getall()
        ts_string = ''.join(schedule).strip().upper()
        # Build the item - news article
        news_item['headline'] = response.css('h1.article_title::text').get()
        news_item['url'] = response.url
        news_item['content'] = art_desc + ''.join(art_para)
        news_item['author'] = response.xpath(
            '//div[@class="article_author"]/*/text()'
        ).get(default="Moneycontrol Contributor")
        news_item['published'] = self._format_timestamp(ts_string)
        yield news_item

    @staticmethod
    def _format_timestamp(ts_string):
        """Convert '<MONTH> <D>, <YYYY> / <H>:<MM> <AM|PM> [TZ]' (upper-cased)
        to 'YYYY-mm-dd HH:MM:SS'.

        strptime's %Z only matches a few zone names (UTC/GMT and the local
        time.tzname values), so an abbreviation such as 'IST' raises
        ValueError on most machines — strip it before parsing.
        """
        head, _, tail = ts_string.rpartition(' ')
        if head and tail.isalpha() and tail not in ('AM', 'PM'):
            ts_string = head
        ts_object = datetime.strptime(ts_string, "%B %d, %Y / %I:%M %p")
        return ts_object.strftime('%Y-%m-%d %H:%M:%S')
代码经过简化以提高可读性。类名“pagenation”不是拼写错误 - 这是该类在网站上的命名方式。感谢您的帮助。
原来是设置问题。在 settings.py 中设置
ROBOTSTXT_OBEY = False
即可解决(Scrapy 默认遵守 robots.txt,分页请求因此被过滤掉了)。
这是使用新设置正确运行的代码:
import scrapy
from bookworm.items import NewsItem
from datetime import datetime
import random
class SpidermoneySpider(scrapy.Spider):
    """Crawl moneycontrol.com market news.

    All listing pages (landing page plus pages 2..30) are enumerated up
    front in start_urls, so no pagination discovery is needed.
    NOTE: requires ROBOTSTXT_OBEY = False in settings.py — with the
    default setting the pagination requests are filtered out.
    """

    name = "spidermoney"
    allowed_domains = ["www.moneycontrol.com"]
    landing_page = ["https://www.moneycontrol.com/news/business/markets/"]
    pagination_urls = [
        f"https://www.moneycontrol.com/news/business/markets/page-{i}/"
        for i in range(2, 31)
    ]
    start_urls = landing_page + pagination_urls

    def parse(self, response):
        """Default callback for every listing page in start_urls."""
        yield from self.parse_page_url(response=response)

    def parse_page_url(self, response):
        """Follow every article link on one listing page."""
        for article_url in response.css('li.clearfix a::attr(href)').getall():
            yield response.follow(article_url, callback=self.parse_article_url)

    def parse_article_url(self, response):
        """Scrape a single article page into a NewsItem."""
        news_item = NewsItem()
        # The description header is missing on some articles; .get() then
        # returns None and concatenation would raise TypeError -> default ''.
        art_desc = response.css('h2.article_desc::text').get() or ''
        art_para = response.xpath('//div[@id="contentdata"]/p//text()').getall()
        # Timestamp text, e.g. "January 15, 2024 / 10:30 AM IST".
        schedule = response.css('div.article_schedule ::text').getall()
        ts_string = ''.join(schedule).strip().upper()
        # Build the item - news article
        news_item['headline'] = response.css('h1.article_title::text').get()
        news_item['url'] = response.url
        news_item['content'] = art_desc + ''.join(art_para)
        news_item['author'] = response.xpath(
            '//div[@class="article_author"]/*/text()'
        ).get(default="Moneycontrol Contributor")
        news_item['published'] = self._format_timestamp(ts_string)
        yield news_item

    @staticmethod
    def _format_timestamp(ts_string):
        """Convert '<MONTH> <D>, <YYYY> / <H>:<MM> <AM|PM> [TZ]' (upper-cased)
        to 'YYYY-mm-dd HH:MM:SS'.

        strptime's %Z only matches a few zone names (UTC/GMT and the local
        time.tzname values), so an abbreviation such as 'IST' raises
        ValueError on most machines — strip it before parsing.
        """
        head, _, tail = ts_string.rpartition(' ')
        if head and tail.isalpha() and tail not in ('AM', 'PM'):
            ts_string = head
        ts_object = datetime.strptime(ts_string, "%B %d, %Y / %I:%M %p")
        return ts_object.strftime('%Y-%m-%d %H:%M:%S')