import scrapy
from urllib.parse import urljoin

class MyCrawlSpider(scrapy.Spider):
    name = 'my_crawl_spider'
    LIMIT_OF_DOMAIN = 100

    def start_requests(self):
        with open('links.txt', 'r') as f:
            urls = f.read().split('\n')
        # urls = ['https://www.google.com/search?q=hello+world', 'https://yahoo.com']
        for url in urls:
            # skip empty lines
            if not url:
                continue
            sub_item = {'main_url': url, 'index_of_domain': 0}
            yield scrapy.Request(url, callback=self.parse, meta=sub_item)

    def parse(self, response):
        main_url = response.meta['main_url']
        index_of_domain = response.meta['index_of_domain']
        for link in response.xpath('//a[@href] | //article[@href]'):
            href = link.xpath('@href').extract_first()
            url = urljoin(main_url, href.strip())
            if index_of_domain >= self.LIMIT_OF_DOMAIN:
                break
            index_of_domain += 1
            sub_item = {'main_url': main_url, 'index_of_domain': index_of_domain}
            yield scrapy.Request(url, callback=self.parse, meta=sub_item)
Right now my LIMIT_OF_DOMAIN limit doesn't work the way I want: somehow the spider keeps crawling far past my limit of 100. I've searched the Scrapy documentation but can't find a per-domain limit. How can I solve this? Ideally, I want each link in start_requests to get 100 requests.
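The limit fails because index_of_domain is copied into each request's meta, so every branch of the crawl counts independently: each response resumes from the count its parent had, not from a shared total. One way around this (a sketch, not the only option) is to pass a single shared, mutable counter object per start URL through meta; as long as requests stay in Scrapy's in-memory queue, meta is passed by reference, so every callback descended from the same start URL sees the same object: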
class CurrentValue(object):
    # A mutable counter shared by all requests spawned from one start URL.
    def __init__(self):
        self.value = 0

    def increment(self):
        self.value += 1

class YourSpider(scrapy.Spider):
    def start_requests(self):
        obj = CurrentValue()
        # your code
        sub_item = {'current_url': url, 'main_url': url, 'index_of_domain': obj}

    def parse(self, response):
        obj = response.meta['index_of_domain']
        for url in urls:
            obj.increment()
You just need an extra class to store the count for each starting link.
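Putting this together with the spider from the question, a minimal runnable sketch could look like the following (DomainCounter and the spider name are illustrative, and it assumes the same links.txt input as above):

import scrapy
from urllib.parse import urljoin

class DomainCounter:
    # Mutable counter shared by every request descended from one start URL.
    def __init__(self):
        self.value = 0

class LimitedCrawlSpider(scrapy.Spider):
    name = 'limited_crawl'
    LIMIT_OF_DOMAIN = 100

    def start_requests(self):
        with open('links.txt') as f:
            urls = [line.strip() for line in f if line.strip()]
        for url in urls:
            # One counter object per start URL; every child request carries
            # a reference to this same instance through meta.
            meta = {'main_url': url, 'counter': DomainCounter()}
            yield scrapy.Request(url, callback=self.parse, meta=meta)

    def parse(self, response):
        main_url = response.meta['main_url']
        counter = response.meta['counter']
        for href in response.xpath('//a/@href').getall():
            if counter.value >= self.LIMIT_OF_DOMAIN:
                return  # this start URL has used up its request budget
            counter.value += 1
            yield scrapy.Request(urljoin(main_url, href.strip()),
                                 callback=self.parse,
                                 meta={'main_url': main_url, 'counter': counter})

Two caveats: the counter counts scheduled requests rather than successful responses (the dupefilter may drop some, so you can end up with fewer than 100 pages per start URL), and the shared reference only survives while requests live in the in-memory queue; with JOBDIR enabled, requests are serialized to disk and each deserialized copy would get its own counter.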