首先,非常感谢您的帮助!
我不知道为什么我每页只获得两个结果。请你帮助我好吗?这是代码:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from mercado.items import MercadoItem
class MercadoSpider(CrawlSpider):
name = 'mercado'
item_count = 0
allowed_domain = ['https://www.amazon.es']
start_urls = ['https://www.amazon.es/s/ref=sr_pg_2rh=n%3A1951051031%2Cn%3A2424922031%2Ck%3Afe bi&page=1&keywords=febi&ie=UTF8&qid=1 535314254']
rules = {
Rule(LinkExtractor(allow =(), restrict_xpaths = ('//*[h2]')),
callback = 'parse_item', follow = False)
}
def start_requests(self):
yield scrapy.Request("https://www.amazon.es/s/ref=sr_pg_2?rh=n%3A1951051031%2Cn%3A2424922031%2Ck%3Afebi&page=1&keywords=febi&ie=UTF8&qid=1535314254",self.parse_item)
for i in range(2,400):
yield scrapy.Request("https://www.amazon.es/s/ref=sr_pg_2?rh=n%3A1951051031%2Cn%3A2424922031%2Ck%3Afebi&page="+str(i)+"&keywords=febi&ie=UTF8&qid=1535314254",self.parse_item)
def parse_item(self, response):
for mercado in response.xpath('//*[h2]'):
ml_item = MercadoItem()
ml_item['articulo'] = response.xpath("@title").extract()[0]
ml_item['precio'] = response.xpath("@href").extract()[0]
yield ml_item
你需要搜索相对于你的mercado
元素:
def parse_item(self, response):
for mercado in response.xpath('//*[h2]'):
ml_item = MercadoItem()
ml_item['articulo'] = mercado.xpath("@title").extract()[0]
ml_item['precio'] = mercado.xpath("@href").extract()[0]
yield ml_item
我已经能够导入两个以上的结果。看看代码:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider
import urllib
from mercado.items import MercadoItem
class MercadoSpider(CrawlSpider):
name = 'mercado'
item_count = 0
allowed_domain = ['https://www.amazon.es']
start_urls = ['https://www.amazon.es/s/ref=sr_pg_2rh=n%3A1951051031%2Cn%3A2424922031%2Ck%3Afebi&page=1&keywords=febi&ie=UTF8&qid=1 535314254']
def start_requests(self):
yield scrapy.Request("https://www.amazon.es/s/ref=sr_pg_2?rh=n%3A1951051031%2Cn%3A2424922031%2Ck%3Afebi&page=1&keywords=febi&ie=UTF8&qid=1535314254",self.parse_item)
for i in range(2,400):
yield scrapy.Request("https://www.amazon.es/s/ref=sr_pg_2?rh=n%3A1951051031%2Cn%3A2424922031%2Ck%3Afebi&page="+str(i)+"&keywords=febi&ie=UTF8&qid=1535314254",self.parse_item)
def parse_item(self, response):
namelist = response.xpath('//a[@class="a-link-normal s-access-detail-page s-color-twister-title-link a-text-normal"]/@title').extract()
#precio = response.xpath('//a/span[@class="a-size-base a-color-price s-price a-text-bold"]').extract()
listlength = len(namelist)
for i in range(0,listlength):
ml_item = MercadoItem()
ml_item['articulo'] = response.xpath('//a[@class="a-link-normal s-access-detail-page s-color-twister-title-link a-text-normal"]/@title').extract()
#ml_item['precio'] = response.xpath('//a/span[@class="a-size-base a-color-price s-price a-text-bold"]').extract()
yield ml_item
现在在每个csv行,我获得每个页面的所有结果。我现在的问题来自于添加每篇文章的价格。我被困在这里。我该如何添加它?