Scrapy: limit the number of requests per domain

import scrapy
from urllib.parse import urljoin


class MyCrawlSpider(scrapy.Spider):
    LIMIT_OF_DOMAIN = 100

    def start_requests(self):
        with open('links.txt', 'r') as f:
            urls = f.read().split('\n')
        #urls=['https://www.google.com/search?q=hello+world','https://yahoo.com']
        for url in urls:
            # remove empty links
            if not url:
                continue
            sub_item = {'main_url': url, 'index_of_domain': 0}
            yield scrapy.Request(url, callback=self.parse, meta=sub_item,)

    def parse(self, response):
        main_url = response.meta['main_url']
        index_of_domain = response.meta['index_of_domain']
        for url in response.xpath('//a[@href] | //article[@href]'):
            href = url.xpath('@href').extract_first()
            url = urljoin(main_url, href.strip())
            if index_of_domain >= self.LIMIT_OF_DOMAIN:
                break
            index_of_domain += 1
            sub_item = {'main_url': main_url, 'index_of_domain': index_of_domain}
            yield scrapy.Request(url, callback=self.parse, meta=sub_item, )

At the moment my LIMIT_OF_DOMAIN cap does not work the way I want: somehow the spider keeps crawling far past my limit of 100. I have searched the Scrapy documentation but could not find a per-domain limit. How can I fix this?

Ideally, I want each link in start_requests to generate at most 100 requests.

python scrapy generator
1 Answer
class CurrentValue(object):
    """Mutable counter shared by every request spawned from one start URL."""
    def __init__(self):
        self.value = 0

    def increment(self):
        self.value += 1


class YourSpider(scrapy.Spider):
    def start_requests(self):
        # ... your code: create one CurrentValue instance per start URL
        obj = CurrentValue()
        sub_item = {'current_url': url, 'main_url': url, 'index_of_domain': obj}

    def parse(self, response):
        # All requests descended from the same start URL receive the same object.
        obj = response.meta['index_of_domain']
        for url in urls:
            obj.increment()

You just need another class to store the count for each starting link. Because the counter is a mutable object passed through meta, every request descended from the same start URL increments the same shared count. A plain integer, as in your code, is copied into each branch of the crawl, so each branch keeps its own tally and the overall number of requests far exceeds 100.
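Below is a minimal sketch of how this shared-counter idea could be folded into the spider from the question. It assumes links.txt exists and that "100 requests per start URL" means 100 scheduled follow-up requests; the name DomainCounter and the exact link extraction are illustrative, not taken from either post.

import scrapy
from urllib.parse import urljoin


class DomainCounter:
    """Mutable counter shared by every request spawned from one start URL."""
    def __init__(self):
        self.value = 0

    def increment(self):
        self.value += 1


class MyCrawlSpider(scrapy.Spider):
    name = 'limited'
    LIMIT_OF_DOMAIN = 100

    def start_requests(self):
        with open('links.txt') as f:
            urls = [line.strip() for line in f if line.strip()]
        for url in urls:
            # One counter per start URL, shared by all descendant requests.
            sub_item = {'main_url': url, 'counter': DomainCounter()}
            yield scrapy.Request(url, callback=self.parse, meta=sub_item)

    def parse(self, response):
        main_url = response.meta['main_url']
        counter = response.meta['counter']
        for href in response.xpath('//a/@href').getall():
            if counter.value >= self.LIMIT_OF_DOMAIN:
                return  # this start URL has used up its budget
            counter.increment()
            url = urljoin(main_url, href.strip())
            yield scrapy.Request(url, callback=self.parse,
                                 meta={'main_url': main_url, 'counter': counter})

Two caveats: the counter tracks requests that were scheduled, not pages actually downloaded, so duplicates filtered by Scrapy still consume budget; and keeping a live Python object in meta works with the default in-memory scheduler but will not survive disk-based request serialization (e.g. when resuming a crawl with JOBDIR).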
