我有以下代码。它打开了无头浏览器,我还看到页面正在滚动,但解析方法中的响应对象没有任何 HTML。当我不使用自动滚动时,这个蜘蛛工作得很好。
该代码仅用于从本网站提取产品名称和产品价格。
import scrapy
import re
from scrapy_playwright.page import PageMethod
from bs4 import BeautifulSoup
def should_abort_request(req):
    """Playwright request filter: drop image downloads and POST requests.

    Returns True when the request should be aborted (saves bandwidth and
    avoids firing tracking/analytics POSTs during rendering).
    """
    return req.resource_type == "image" or req.method.lower() == "post"
# JavaScript injected via Playwright's page.evaluate(): scrolls to the bottom
# of the page 8 times, waiting 5 s between scrolls so lazy-loaded products can
# render before parsing.
# BUG FIX: the stop condition referenced an undefined `numScrolls` variable
# (the constant is named `scrolls`), so the interval raised a ReferenceError
# and clearInterval never ran.
scrolling_script = """
const scrolls = 8
let scrollCount = 0

// scroll down, then wait 5 s before the next scroll
const scrollInterval = setInterval(() => {
    window.scrollTo(0, document.body.scrollHeight)
    scrollCount++
    if (scrollCount === scrolls) {
        clearInterval(scrollInterval)
    }
}, 5000)
"""
class AuchanSpider(scrapy.Spider):
    """Extract product names and prices from an Auchan shop listing page.

    The listing lazy-loads products, so each request is rendered by
    Playwright with an auto-scroll script and parsing starts only after the
    product containers are present in the DOM.
    """

    name = 'auchan'
    custom_settings = {
        'PLAYWRIGHT_ABORT_REQUEST': should_abort_request
    }
    start_urls = ['https://zakupy.auchan.pl/shop/list/8029?shType=id']

    def start_requests(self):
        """Issue one Playwright-rendered request per start URL."""
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                meta={
                    "playwright": True,
                    "playwright_include_page": True,
                    "playwright_page_methods": [
                        PageMethod("evaluate", scrolling_script),
                        # BUG FIX: wait_for_selector takes a CSS selector, so
                        # multiple classes on one element must be joined with
                        # '.' — a space is the descendant combinator, which
                        # never matches here, the wait times out, and parse()
                        # receives no rendered HTML.
                        PageMethod(
                            "wait_for_selector",
                            "._1E5b._2I59._1wkJ._3YFw.igxN._7Zx6.Eb4X._390_",
                        ),
                        # Wait until at least 60 products have lazy-loaded.
                        PageMethod(
                            "wait_for_selector",
                            "._1E5b._2I59._1wkJ._3YFw.igxN._7Zx6.Eb4X._390_"
                            ":nth-child(60)",
                        ),
                    ],
                },
                errback=self.close_page,
                cb_kwargs=dict(main_url=url, page_number=0),
            )

    async def parse(self, response, main_url, page_number):
        """Yield {'productName', 'price'} for each product container."""
        # playwright_include_page=True hands us the live page object; close
        # it here, otherwise every *successful* response leaks a page (the
        # errback only runs on failure).
        page = response.meta["playwright_page"]
        await page.close()

        soup = BeautifulSoup(response.text, 'html.parser')
        product_containers = soup.find_all(
            'div', class_='_1E5b _2I59 _1wkJ _3YFw igxN _7Zx6 Eb4X _390_')
        for product_container in product_containers:
            price = product_container.find(class_='_1-UB _1Evs').get_text()
            # Strip every whitespace character from the price string.
            price = re.sub(r"[\n\t\s]*", "", price)
            yield {
                'productName': product_container.find(class_='_1DGZ').get_text(),
                'price': price,
            }

    async def close_page(self, failure):
        """Errback: close the Playwright page so it is not leaked on error."""
        page = failure.request.meta["playwright_page"]
        await page.close()
我想通了。问题出在 wait_for_selector 上:同一元素的多个 class 之间不应以"空格"分隔,而应以"."连接(空格在 CSS 选择器里表示后代关系)。正确的写法如下:
PageMethod("wait_for_selector", "._1E5b._2I59._1wkJ._3YFw.igxN._7Zx6.Eb4X._390_"),