I tried to scroll through an online shop with scrapy-playwright to scrape all products, but it doesn't work


I try to click the "Load more" button until it disappears and all products are loaded. Then I want to follow every single product and scrape the data I need from its detail page.

I have tried several ways of scrolling down and rearranged the code and syntax a few times with ChatGPT and Gemini. However, I still get back an empty JSON file.


import scrapy
import datetime
import re

from scrapy.crawler import CrawlerProcess
from scrapy_playwright.page import PageMethod
from scrapy.selector import Selector



class LidlSpider(scrapy.Spider):
    name = 'lidl_snacks'
    allowed_domains = ['sortiment.lidl.ch']
    custom_settings = {
        'ROBOTSTXT_OBEY': False
    }
    start_urls = [
        'https://sortiment.lidl.ch/de/sussigkeiten-snacks#/', #246 Produkte
    ] 
    
    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                dont_filter=True,
                callback=self.parse,
                meta={
                    'url': url,
                    'playwright': True,
                    'playwright_include_page': True,
                    'playwright_page_methods':[
                        PageMethod('wait_for_selector', 'div.product-item-info'),
                        PageMethod("wait_for_selector", "button.primary.amscroll-load-button-new"),
                        
                    ]
                }
            )
    async def scroll_to_bottom(self,page):
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")

    async def parse(self, response):
        page = response.meta["playwright_page"]
        pagination_buttons = page.locator("button.primary.amscroll-load-button-new")  # Adjust the selector as needed

        
        if pagination_buttons:
            buttons = await pagination_buttons.all()
            for button in buttons:
                await button.click()  # Trigger pagination action
                await page.wait_for_navigation()
                await self.scroll_to_bottom(page)  # Optional scroll down on the new page
                
        # Extract product information after pagination click
        content = await page.content()
        sel = Selector(text=content)
        produkte = sel.css('div.product-item-info')
        for produkt in produkte:
            produkt_url = produkt.css('a.product-item-link::attr(href)').get()
            yield response.follow(produkt_url, callback=self.parse_produkt, meta={'url': response.meta['url']})

      

    def parse_produkt(self, response):
        
        mini_dict = {
                'retailer':       self.name,
                'datetime':       datetime.date.today(),
                'categorie':      None,
                'id':             None, #response.css('div.col-left>p::text').get().split()[1],
                'brand':          str(response.css('p.brand-name::text').get()),
                'detail':         str(response.css('span.base::text').get()),
                'actual_price':   response.css('strong.pricefield__price::attr(content)').get(),
                'quantity':       None,
                'regular_price':  None,
                'price_per_unit': None,

            }
           

        yield mini_dict

       
    

        
if __name__ == "__main__":  # __main__ was only created for debug purposes
    process = CrawlerProcess()
    process.crawl(LidlSpider)
    process.start()

python web-scraping scrapy playwright playwright-python
1 Answer

I found a few issues:

  • There is a popup on the page, and you first need to click the Zustimmen (accept) button before anything else can be clicked. So add the following to your code:
popup = 'div#onetrust-banner-sdk'
if await page.is_visible(popup, timeout = 5000):
    await page.locator('button#onetrust-accept-btn-handler').click()
    await page.wait_for_selector(popup, state='hidden')
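If you prefer to keep parse() free of banner handling, the same consent click can also be expressed declaratively through playwright_page_methods in start_requests. A minimal sketch, assuming the same OneTrust selectors are present on every start URL (note that wait_for_selector will time out if the banner never appears, so the is_visible() check above is the more forgiving option):

from scrapy_playwright.page import PageMethod

# appended to meta['playwright_page_methods'] in start_requests, so the banner
# is dismissed before parse() ever sees the page
consent_methods = [
    PageMethod('wait_for_selector', 'div#onetrust-banner-sdk'),                  # wait for the banner to render
    PageMethod('click', 'button#onetrust-accept-btn-handler'),                   # click "Zustimmen"
    PageMethod('wait_for_selector', 'div#onetrust-banner-sdk', state='hidden'),  # wait for it to close
]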
  • page.wait_for_navigation() will raise an error because Playwright's Page has no such method, so you can replace it with await page.wait_for_load_state("domcontentloaded").
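For completeness: when a click really does trigger a full navigation, the Python API expresses that wait as a context manager wrapped around the action. A minimal sketch (not needed on this site, where the button only extends the current page):

from playwright.async_api import Locator, Page

async def click_and_wait(page: Page, button: Locator) -> None:
    # Playwright for Python has no page.wait_for_navigation(); the equivalent
    # is expect_navigation(), which must wrap the action causing the navigation
    async with page.expect_navigation():
        await button.click()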
  • There is a single Weitere Produkte laden (load more products) button that you need to click repeatedly until it disappears, but the pagination_buttons locator in your code resolves to that one button and clicks it only once:

pagination_buttons = page.locator("button.primary.amscroll-load-button-new")
buttons = await pagination_buttons.all()
for button in buttons:
    await button.click()  # Trigger pagination action
    await page.wait_for_load_state("domcontentloaded")  # Wait for new page to load
    await self.scroll_to_bottom(page)  # Optional scroll down on the new page

You can fix this by replacing the above with:
while True:
    try:
        show_more_button = page.locator("button.primary.amscroll-load-button-new")
        if show_more_button:  # a Locator object is always truthy, so this branch always runs
            await show_more_button.click()
            await page.wait_for_load_state("domcontentloaded", timeout=5000)
            await self.scroll_to_bottom(page)
        else:
            break
    except Exception:
        # once the button is gone, click() times out and the loop ends here
        break
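If you would rather exit the loop cleanly instead of relying on the final click() timing out, a variant that checks the button's visibility first (a sketch, assuming the button is removed or hidden once the last batch has loaded):

show_more_button = page.locator("button.primary.amscroll-load-button-new")
while await show_more_button.is_visible():  # False once the button is gone or hidden
    await show_more_button.click()
    await page.wait_for_load_state("domcontentloaded")
    await self.scroll_to_bottom(page)

Note that is_visible() returns immediately without waiting, so if the button is briefly detached while a new batch renders this loop can stop early; the timeout-based version is more forgiving in that case.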

Here is the full code:

import datetime
import scrapy

from scrapy.crawler import CrawlerProcess
from scrapy_playwright.page import PageMethod
from scrapy.selector import Selector

class LidlSpider(scrapy.Spider):
    name = 'lidl_snacks'
    allowed_domains = ['sortiment.lidl.ch']
    custom_settings = {
        'ROBOTSTXT_OBEY': False
    }
    start_urls = [
        'https://sortiment.lidl.ch/de/kaffee-tee', #72 products
    ] 
    
    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                dont_filter=True,
                callback=self.parse,
                meta={
                    'url': url,
                    'playwright': True,
                    'playwright_include_page': True,
                    'playwright_page_methods':[
                        PageMethod('wait_for_load_state',"domcontentloaded"),   
                    ]
                }
            )
    async def scroll_to_bottom(self,page):
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")

    async def parse(self, response):
        page = response.meta["playwright_page"]
        
        #await page.screenshot(path="popup.png")

        popup = 'div#onetrust-banner-sdk'
        if await page.is_visible(popup, timeout = 5000):
            await page.locator('button#onetrust-accept-btn-handler').click()
            await page.wait_for_selector(popup, state='hidden')

        #await page.screenshot(path="popup_clicked_check.png", full_page=True)
        
        #count = 0
        while True: 
            try:
                show_more_button = page.locator("button.primary.amscroll-load-button-new")
                if show_more_button:
                    await show_more_button.click()
                    await page.wait_for_load_state("domcontentloaded", timeout=5000)  # Wait for new page to load
                    await self.scroll_to_bottom(page)  # Optional scroll down on the new page
                    # await page.screenshot(path=f"page_scrolled_{count}.png", full_page=True)
                    # count+=1
                else:
                    break
            except Exception:
                break
   
        #Extract product information after pagination click
        content = await page.content()
        sel = Selector(text=content)
        produkte = sel.css('div.product-item-info')
        for produkt in produkte:
            produkt_url = produkt.css('a.product-item-link::attr(href)').get()
            yield response.follow(produkt_url, callback=self.parse_produkt, meta={'url': response.meta['url']})

      

    def parse_produkt(self, response):
        
        mini_dict = {
                'retailer':       self.name,
                'datetime':       datetime.date.today(),
                'categorie':      None,
                'id':             None, #response.css('div.col-left>p::text').get().split()[1],
                'brand':          str(response.css('p.brand-name::text').get()),
                'detail':         str(response.css('span.base::text').get()),
                'actual_price':   response.css('strong.pricefield__price::attr(content)').get(),
                'quantity':       None,
                'regular_price':  None,
                'price_per_unit': None,

            }
           

        yield mini_dict

if __name__ == "__main__":  # __main__ was only created for debug purposes
    process = CrawlerProcess()
    process.crawl(LidlSpider)
    process.start()
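If you keep the __main__ block for debugging, you can also have it write the scraped items to a file by passing Scrapy's standard FEEDS setting to CrawlerProcess; the filename here is just an example:

if __name__ == "__main__":
    # same debug entry point as above, plus Scrapy's feed export so the
    # yielded items end up in a file instead of only in the log
    process = CrawlerProcess(settings={
        'FEEDS': {'snacks.json': {'format': 'json', 'overwrite': True}},
    })
    process.crawl(LidlSpider)
    process.start()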

Notes:

  • I replaced /sussigkeiten-snacks#/ with /kaffee-tee because that page has fewer products to scrape.
  • Run as-is, it returns the correct number of items; alternatively, run scrapy crawl lidl_snacks -O snacks.json to see what it returns.
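One more thing worth checking if the spider still yields nothing: scrapy-playwright only takes over when its download handler and the asyncio reactor are enabled. Assuming they are not already in your project's settings.py, they can also go into the spider's custom_settings:

# inside LidlSpider
custom_settings = {
    'ROBOTSTXT_OBEY': False,
    # required by scrapy-playwright; without these, requests go through Scrapy's
    # default handler and response.meta never contains 'playwright_page'
    'DOWNLOAD_HANDLERS': {
        'http': 'scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler',
        'https': 'scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler',
    },
    'TWISTED_REACTOR': 'twisted.internet.asyncioreactor.AsyncioSelectorReactor',
}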