我正在尝试抓取某个商品所有尺寸和颜色组合的数据。
场景如下:
所选颜色: -螳螂绿 -线轴黄
所选尺寸: -6磅 -8磅 -10磅 -15磅 -20磅 -30磅
我需要抓取每个组合的标题、价格和特价,目前使用的代码如下。
import scrapy
import re
from scrapy_splash import SplashRequest
class FishingRodsSpider(scrapy.Spider):
    """Render one Anaconda product page through Splash and inspect its variants.

    ``parse`` yields the title, regular price and club price found on the
    rendered page and logs the size-variant URLs advertised by the page's
    size pickers.
    """

    name = "ana_rods_detailed"
    allowed_domains = ["anacondastores.com"]
    start_urls = ["https://www.anacondastores.com/fishing/fishing-line/braid-line/shimano-kairiki-8-braid-line-150-metre-spool/BP90140299"]

    def start_requests(self):
        """Issue every start URL as a SplashRequest with a ``wait`` of 2."""
        for url in self.start_urls:
            yield SplashRequest(url, self.parse, args={'wait': 2})

    @staticmethod
    def _first_text(response, query):
        """Return the stripped first match of *query*, or None if nothing matches."""
        value = response.css(query).get()
        return value.strip() if value is not None else None

    def parse(self, response):
        """Yield the product fields of the page and log its size-variant URLs.

        Yields:
            dict: ``title``, ``price`` and ``club_price``; a field is ``None``
            when its selector matches nothing (previously this raised
            ``AttributeError`` on ``None.strip()``).
        """
        # Emit the scraped fields; they used to be extracted and then dropped.
        yield {
            "title": self._first_text(response, '.pdp-title::text'),
            "price": self._first_text(
                response, '.product-info .price-was .amount::text'
            ),
            "club_price": self._first_text(
                response, '.product-info .price-now .amount::text'
            ),
        }

        # The size pickers do not depend on the style being iterated, so
        # resolve them once. None marks a picker without a data-url.
        size_urls = []
        for size_picker in response.css('.js-variant-size'):
            size_data_url = size_picker.attrib.get('data-url')
            if size_data_url is None:
                size_urls.append(None)
            else:
                # urljoin handles absolute and relative data-url values alike.
                size_urls.append(response.urljoin(size_data_url))

        # NOTE(review): the nesting below was reconstructed from a source whose
        # indentation was lost; the final else is paired with the style check
        # because its message talks about the style picker. Please confirm.
        # NOTE(review): every style logs the same size URLs, because they all
        # come from this one response. Per-style sizes would presumably need
        # each style's data-url to be requested and parsed separately.
        style_pickers = response.css('.js-variant-style-picker')
        for index, style_picker in enumerate(style_pickers):
            style_data_url = style_picker.attrib.get('data-url')
            if style_data_url is not None:
                for size_url in size_urls:
                    if size_url is not None:
                        self.log("Size Variant: " + size_url)
            elif index == 0:
                # First picker without a data-url: only report size pickers
                # that lack one (URL logging was disabled here originally).
                for size_url in size_urls:
                    if size_url is None:
                        self.log("Size data-url attribute is missing for a size picker.")
            else:
                self.log("Style data-url attribute is missing for a style picker.")
预期结果: -螳螂绿 6 磅 -螳螂绿 8 磅 -螳螂绿 10 磅 -螳螂绿 20 磅 -螳螂绿 30 磅 -线轴黄 6 磅 -线轴黄 8 磅 -线轴黄 10 磅 -线轴黄 20 磅 -线轴黄 30 磅
该网站的数据在加载的页面中即可找到,因此无需使用 scrapy-splash。您需要检查浏览器的网络活动,找出点击颜色或尺寸选项时所请求的 URL。请参阅下面的示例代码:
import scrapy
class AnacondaSpider(scrapy.Spider):
    """Crawl the colour and size variants of one Anaconda product page."""

    name = "anaconda"
    allowed_domains = ["www.anacondastores.com"]
    start_urls = [
        "https://www.anacondastores.com/fishing/fishing-line/braid-line/shimano-kairiki-8-braid-line-150-metre-spool/BP90140299-mantis-green"
    ]

    def parse(self, response):
        """Yield follow-up requests for every colour link and every size code."""
        # Colour variants: followed without an explicit callback.
        colour_selector = ".js-variant-style-picker::attr(data-url)"
        for colour_href in response.css(colour_selector).getall():
            yield response.follow(colour_href)

        # Size variants: the size code, plus a fixed query string, is resolved
        # against the current page URL and handled by parse_size.
        size_selector = ".size-variant a::attr(data-variant-size-code)"
        for size_code in response.css(size_selector).getall():
            size_url = response.urljoin(f"{size_code}?version=7")
            yield scrapy.Request(size_url, callback=self.parse_size)

    def parse_size(self, response):
        """Yield one dict built from the data attributes of the product wrapper."""
        wrapper = "#productContentWrapper > div"
        # Item key -> data attribute read from the wrapper element.
        attribute_by_field = {
            "title": "data-product-name",
            "price": "data-product-metric1",
            "sale_price": "data-product-price",
            "size": "data-product-dimension13",
        }
        yield {
            field: response.css(f"{wrapper}::attr({attribute})").get()
            for field, attribute in attribute_by_field.items()
        }