我想从start_urls列表中跟踪所有URL,并遵循每个start_url的分页并将其内容剪贴在其中。我只能抓取并遵循网址列表中仅一个网址的分页。我试图像这样下一页网址>>
next_page_url = response.request.url + 'page/' + str(Couponsite2SpiderSpider.page_number) if next_page_url is not None: Couponsite2SpiderSpider.page_number += 1 yield response.follow(next_page_url, callback=self.parse) else: Couponsite2SpiderSpider.page_number = 2
但是没有得到想要的结果。我的蜘蛛代码如下
class Couponsite2SpiderSpider(scrapy.Spider):
name = 'couponSite2_spider'
allowed_domains = ['www.uaepayingless.com']
page_number = 2
def start_requests(self):
start_urls = reversed([
'https://www.uaepayingless.com/coupon-category/entertainment/',
'https://www.uaepayingless.com/coupon-category/fashion-accessories/',
'https://www.uaepayingless.com/coupon-category/food-beverage/'
])
return [Request(url = start_url) for start_url in start_urls]
def parse(self, response):
store = store = response.css('#store-listings-wrapper')
coupon_category = store.xpath('h2/text()').extract()
coupon_lists = store.css('#cat-coupon-lists')
for coupon in coupon_lists.xpath('div'):
coupon_title = coupon.xpath('div[2]/h3/a/text()').extract()
coupon_descriptions = coupon.css('div > div.latest-coupon > div')
for description in coupon_descriptions:
final_description = final_description = [''.join(description.xpath('.//div[@class="coupon-des-full"]//text()').extract()).strip().replace('\n', ' ').replace('Less', '').replace('Move to Trash', '').strip()]
if len(final_description[0]) == 0:
final_description = description.css('div.coupon-des-ellip::text').extract()
coupon_exp_date = coupon.xpath('normalize-space(.//div[@class="exp-text"]/text())').extract()
coupon_code_deal = coupon.xpath('normalize-space(.//div[@class="coupon-detail coupon-button-type"]/a/@href)').extract()
coupon_store_out = coupon.xpath('normalize-space(.//div[@class="coupon-detail coupon-button-type"]/a/@data-aff-url)').extract()
store_img_src = coupon.xpath('normalize-space(.//div[@class="store-thumb thumb-img"]/a/img/@src)').extract()
coupon_store_name = coupon.xpath('normalize-space(.//div[@class="store-name"]/a/text())').extract()
yield {
'coupon_title': coupon_title,
'coupon_description': final_description,
'coupon_exp_date': coupon_exp_date,
'coupon_code_deal': coupon_code_deal,
'coupon_store_out': coupon_store_out,
'store_img_src': store_img_src,
'coupon_store_name': coupon_store_name,
'coupon_category': coupon_category,
'website_link': response.request.url
}
next_page_url = "https://www.uaepayingless.com/coupon-category/entertainment/" + 'page/' + str(Couponsite2SpiderSpider.page_number)
if next_page_url is not None:
Couponsite2SpiderSpider.page_number += 1
yield response.follow(next_page_url, callback=self.parse)
我想从start_urls列表中跟踪所有URL,并遵循每个start_url的分页并将其内容剪贴在其中。我只能从...
实际上,我认为您的错误在这里