下面是我尝试从该网站提取 3 个值(UPC、价格和可用性)的代码:https://books.toscrape.com/。我正在使用 Scrapy CrawlSpider,但提取到的 3 个值全部是 None(空值)。我想实现的目标是:进入每本书的详情页并提取上述 3 个值。代码如下:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class BooksSpider(CrawlSpider):
    """Crawl books.toscrape.com, follow each book's detail page, and yield
    its UPC, price (excl. tax) and availability.

    Fixes over the original listing:

    * ``rules`` was written as ``(Rule(...))`` — without a trailing comma
      that is just a ``Rule`` object, not a tuple, so ``CrawlSpider``
      cannot iterate it.
    * The XPaths went through ``tbody``. Browsers insert ``<tbody>`` when
      rendering, but it is not present in the raw HTML Scrapy downloads,
      so every ``.get()`` returned ``None``.
    """

    name = "bookscraper"
    allowed_domains = ["books.toscrape.com"]
    start_urls = ["https://books.toscrape.com/"]

    # The trailing comma makes this a one-element tuple, as CrawlSpider expects.
    rules = (
        Rule(LinkExtractor(restrict_xpaths="//h3/a"), callback="parse_item", follow=True),
    )

    def parse_item(self, response):
        """Yield a dict with UPC, Price and Availability for one book page.

        The product table rows are, in order: UPC, Product Type,
        Price (excl. tax), Price (incl. tax), Tax, Availability,
        Number of reviews — hence positions 1, 3 and 6 (XPath is 1-based).
        """
        product_info = response.xpath('//table[@class="table table-striped"]')
        # No ``/tbody`` step: it only exists in the browser-rendered DOM,
        # not in the HTML the server actually sends.
        upc = product_info.xpath('(.//tr/td)[1]/text()').get()
        price = product_info.xpath('(.//tr/td)[3]/text()').get()
        availability = product_info.xpath('(.//tr/td)[6]/text()').get()
        yield {'UPC': upc, 'Price': price, 'Availability': availability}
下面的写法可以解决你的问题:页面原始 HTML 的表格里并没有 &lt;tbody&gt;(那是浏览器渲染时自动补上的),所以带 tbody 的 XPath 匹配不到任何节点而返回 None;直接选取各个 &lt;td&gt; 的文本即可。另外注意 rules 元组末尾需要有逗号,否则它不是元组。
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class BooksSpider(CrawlSpider):
    """Crawl books.toscrape.com and return UPC, price and availability
    from each book's detail page.

    Idiom cleanup over the original answer: ``response.xpath(...)`` is
    used directly — constructing ``scrapy.Selector(response=response)``
    by hand is redundant — and the modern ``getall()`` replaces the
    legacy ``extract()`` alias.
    """

    name = "books"
    allowed_domains = ["books.toscrape.com"]
    start_urls = ["https://books.toscrape.com/"]

    rules = (
        Rule(LinkExtractor(restrict_xpaths="//h3/a"), callback="parse_book", follow=True),
    )

    def parse_book(self, response):
        """Return a dict with UPC, Price and Availability for one book page."""
        # Select every <td> text in the product table; the raw HTML has no
        # <tbody>, so the cells are addressed directly.
        cells = response.xpath('//table[@class="table table-striped"]//td/text()').getall()
        values = [text.strip() for text in cells]
        # Row order: UPC, Product Type, Price (excl. tax), Price (incl. tax),
        # Tax, Availability, Number of reviews — hence indices 0, 2 and 5.
        return {'UPC': values[0], 'Price': values[2], 'Availability': values[5]}