我尝试单击“加载更多”按钮,直到它消失并加载所有产品。然后我想单击每个产品,从产品详情页面中抓取我需要的数据。
我尝试了多种向下滚动的方法,并借助 ChatGPT 和 Gemini 重新排列和修改了几次代码。但是,我得到的仍然是一个空的 JSON 文件。
import scrapy
import datetime
import re
from scrapy.crawler import CrawlerProcess
from scrapy_playwright.page import PageMethod
from scrapy.selector import Selector
class LidlSpider(scrapy.Spider):
    """Spider for sortiment.lidl.ch snack products (original, non-working version).

    NOTE(review): this is the question's code kept as-is; the defects that
    lead to the empty JSON output are flagged inline instead of fixed.
    """

    name = 'lidl_snacks'
    allowed_domains = ['sortiment.lidl.ch']
    custom_settings = {
        # The site's robots.txt is deliberately ignored for this scrape.
        'ROBOTSTXT_OBEY': False
    }
    start_urls = [
        'https://sortiment.lidl.ch/de/sussigkeiten-snacks#/', # 246 products
    ]

    def start_requests(self):
        """Issue the start URL through scrapy-playwright.

        ``playwright_include_page: True`` hands the live Playwright page to
        ``parse`` via ``response.meta`` so pagination can be driven there.
        """
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                dont_filter=True,
                callback=self.parse,
                meta={
                    'url': url,
                    'playwright': True,
                    'playwright_include_page': True,
                    'playwright_page_methods':[
                        PageMethod('wait_for_selector', 'div.product-item-info'),
                        # NOTE(review): presumably a cookie-consent
                        # ("Zustimmen") banner overlays the page on first
                        # load and must be accepted before anything else is
                        # clickable — verify against the live site.
                        PageMethod("wait_for_selector", "button.primary.amscroll-load-button-new"),
                    ]
                }
            )

    async def scroll_to_bottom(self,page):
        """Scroll the Playwright page to the bottom of the document."""
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")

    async def parse(self, response):
        """Click through "load more" pagination, then follow each product link."""
        page = response.meta["playwright_page"]
        pagination_buttons = page.locator("button.primary.amscroll-load-button-new") # Adjust the selector as needed
        # NOTE(review): a Locator object is always truthy, so this check never
        # fails; and .all() only snapshots the button(s) currently in the DOM,
        # so the single "load more" button is clicked once — not repeatedly
        # until it disappears.
        if pagination_buttons:
            buttons = await pagination_buttons.all()
            for button in buttons:
                await button.click() # Trigger pagination action
                # NOTE(review): Playwright's Python Page API has no
                # wait_for_navigation() method — this line raises at runtime;
                # use wait_for_load_state("domcontentloaded") instead.
                await page.wait_for_navigation()
                await self.scroll_to_bottom(page) # Optional scroll down on the new page
        # Extract product information after pagination click
        content = await page.content()
        # NOTE(review): the Playwright page received via
        # playwright_include_page is never closed, leaking a browser page.
        sel = Selector(text=content)
        produkte = sel.css('div.product-item-info')
        for produkt in produkte:
            produkt_url = produkt.css('a.product-item-link::attr(href)').get()
            yield response.follow(produkt_url, callback=self.parse_produkt, meta={'url': response.meta['url']})

    def parse_produkt(self, response):
        """Extract one product's fields from its detail page into a flat dict."""
        mini_dict = {
            'retailer': self.name,
            'datetime': datetime.date.today(),
            'categorie': None,
            'id': None, #response.css('div.col-left>p::text').get().split()[1],
            # NOTE(review): str() turns a missing value (None) into the
            # literal string "None" in the output.
            'brand': str(response.css('p.brand-name::text').get()),
            'detail': str(response.css('span.base::text').get()),
            'actual_price': response.css('strong.pricefield__price::attr(content)').get(),
            'quantity': None,
            'regular_price': None,
            'price_per_unit': None,
        }
        yield mini_dict
# Manual debug entry point: lets the spider run via `python <file>.py`
# instead of the `scrapy crawl` CLI.
if __name__ == "__main__":
    crawler_process = CrawlerProcess()
    crawler_process.crawl(LidlSpider)
    crawler_process.start()
我发现有几个问题:
首先,页面上有一个 Zustimmen
(同意)cookie 弹窗,必须先单击它,然后才能单击其他任何内容。因此,将以下内容添加到您的代码中:popup = 'div#onetrust-banner-sdk'
if await page.is_visible(popup, timeout = 5000):
await page.locator('button#onetrust-accept-btn-handler').click()
await page.wait_for_selector(popup, state='hidden')
其次,page.wait_for_navigation()
会报错,因为 Playwright 的 Page 对象中没有这个方法,所以你可以用 await page.wait_for_load_state("domcontentloaded")
替换它。
第三,页面上只有一个 Weitere Produkte laden
(加载更多产品)按钮,您需要多次单击它,
直到它消失;而您代码中的 pagination_buttons
只返回这一个按钮,因此它只会被单击一次:
pagination_buttons = page.locator("button.primary.amscroll-load-button-new")
buttons = await pagination_buttons.all()
for button in buttons:
await button.click() # Trigger pagination action
await page.wait_for_load_state("domcontentloaded") # Wait for new page to load
await self.scroll_to_bottom(page) # Optional scroll down on the new page
您可以将上面的内容替换为以下代码来解决这个问题:

while True:
try:
show_more_button = page.locator("button.primary.amscroll-load-button-new")
if show_more_button:
await show_more_button.click()
await page.wait_for_load_state("domcontentloaded", timeout=5000)
await self.scroll_to_bottom(page)
else:
break
except Exception:
break
这是完整的代码:
import datetime
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy_playwright.page import PageMethod
from scrapy.selector import Selector
class LidlSpider(scrapy.Spider):
    """Scrape product data from sortiment.lidl.ch via scrapy-playwright.

    Flow: accept the cookie-consent banner, click the "load more" button
    until it disappears, then follow every product link and emit one item
    dict per product detail page.
    """

    name = 'lidl_snacks'
    allowed_domains = ['sortiment.lidl.ch']
    custom_settings = {
        # The site's robots.txt is deliberately ignored for this scrape.
        'ROBOTSTXT_OBEY': False
    }
    start_urls = [
        'https://sortiment.lidl.ch/de/kaffee-tee',  # 72 products
    ]

    def start_requests(self):
        """Issue the start URL through scrapy-playwright.

        ``playwright_include_page: True`` transfers ownership of the live
        Playwright page to ``parse``, which therefore must close it; the
        errback guarantees the page is also closed when the request fails.
        """
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                dont_filter=True,
                callback=self.parse,
                errback=self.errback_close_page,  # FIX: close the page on request failure
                meta={
                    'url': url,
                    'playwright': True,
                    'playwright_include_page': True,
                    'playwright_page_methods': [
                        PageMethod('wait_for_load_state', "domcontentloaded"),
                    ],
                },
            )

    async def scroll_to_bottom(self, page):
        """Scroll the Playwright page to the bottom of the document."""
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")

    async def errback_close_page(self, failure):
        """Close the Playwright page when the request errors out (no leak)."""
        page = failure.request.meta.get("playwright_page")
        if page is not None:
            await page.close()

    async def parse(self, response):
        """Dismiss the consent banner, exhaust pagination, follow product links."""
        page = response.meta["playwright_page"]
        try:
            # The "Zustimmen" (consent) banner overlays the page and must be
            # accepted before anything else is clickable.
            popup = 'div#onetrust-banner-sdk'
            if await page.is_visible(popup, timeout=5000):
                await page.locator('button#onetrust-accept-btn-handler').click()
                await page.wait_for_selector(popup, state='hidden')

            # Click "Weitere Produkte laden" until it is gone.
            show_more_button = page.locator("button.primary.amscroll-load-button-new")
            while True:
                # FIX: a Locator object is always truthy, so `if locator:`
                # never ends the loop; count() reports whether the button is
                # actually still in the DOM.
                if await show_more_button.count() == 0:
                    break
                try:
                    await show_more_button.click()
                    await page.wait_for_load_state("domcontentloaded", timeout=5000)
                    await self.scroll_to_bottom(page)
                except Exception:
                    # Button disappeared between count() and click(), or the
                    # wait timed out — pagination is exhausted.
                    break

            # Snapshot the fully loaded DOM before releasing the page.
            content = await page.content()
        finally:
            # FIX: with playwright_include_page=True this callback owns the
            # page; close it so browser pages are not leaked.
            await page.close()

        sel = Selector(text=content)
        for produkt in sel.css('div.product-item-info'):
            produkt_url = produkt.css('a.product-item-link::attr(href)').get()
            yield response.follow(
                produkt_url,
                callback=self.parse_produkt,
                meta={'url': response.meta['url']},
            )

    def parse_produkt(self, response):
        """Parse one product detail page into a flat item dict.

        FIX: missing fields now yield None instead of the literal string
        "None" that str(...get()) produced.
        """
        yield {
            'retailer': self.name,
            'datetime': datetime.date.today(),  # Scrapy's JSON encoder serializes dates
            'categorie': None,
            'id': None,  # e.g. response.css('div.col-left>p::text').get().split()[1]
            'brand': response.css('p.brand-name::text').get(),
            'detail': response.css('span.base::text').get(),
            'actual_price': response.css('strong.pricefield__price::attr(content)').get(),
            'quantity': None,
            'regular_price': None,
            'price_per_unit': None,
        }
# Debug-only entry point so the spider can be launched directly with
# `python <file>.py` rather than the `scrapy crawl` command.
if __name__ == "__main__":
    runner = CrawlerProcess()
    runner.crawl(LidlSpider)
    runner.start()
注意事项:
我把 /sussigkeiten-snacks#/
替换为 /kaffee-tee
,因为该页面需要抓取的产品较少。运行 scrapy crawl lidl_snacks -O snacks.json
即可查看它返回的内容。