如何使用Python进行多页网页抓取从Lazada中提取产品数据(产品名称,价格)?

问题描述 投票:0回答:1

我试图从 Lazada 商店中提取所有产品的产品数据(名称和价格)。但是,总共有 102 页,但我只能提取数据的第一页。谁能认出我的代码的问题?

网址:https://www.lazada.com.my/guardian/?from=wangpu&langFlag=en&page=1&pageTypeId=2&q=All-Products

下面是我的代码

import time
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

class ScrapeLazada():
    
    def scrape(self):
        url = 'https://www.lazada.com.my/guardian/?from=wangpu&langFlag=en&page=1&pageTypeId=2&q=All-Products'
        driver = webdriver.Chrome()
        driver.get(url)
        
        products=[]
        for i in range(102):
            WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CSS_SELECTOR, "#root")))
            time.sleep(2)

            soup = BeautifulSoup(driver.page_source, "html.parser")
            for item in soup.findAll('div', class_='Bm3ON'):
                product_name = item.find('div', class_='RfADt').text
                price = item.find('span', class_='ooOxS').text.replace('RM', '')
                products.append(
                    (product_name, price) 
                )

            time.sleep(2)
            driver.find_element(By.CSS_SELECTOR, ".ant-pagination-next > button").click()
            time.sleep(3)

            df = pd.DataFrame(products, columns=['Product Name', 'Price'])
            print(df)

            df.to_excel('Lazada_Guardian_Scrape.xlsx', index=False)
            print('Data saved in local disk')
    
    
        driver.close()
        
sl = ScrapeLazada()
sl.scrape()

以下是我的结果。显然它只列出了第一页的产品,但从下一页开始列出有问题。

Product Name   Price
0   UPHAMOL 250 Children Suspension Delicious Oran...    7.80
1   Darlie Double Action Fresh + Clean Toothpaste ...   20.92
2                         Dermal Therapy Lip Balm 10g   12.78
3                Nurish No Teen Anti Acne Toner 100Ml   12.82
4                   Live-Well OCCUsharp 30s Pack-of-3  100.90
5                       Oxy Anti- Blackhead Wash 100g   11.95
6                Guardian Clear Assorted Plasters 20s    1.55
7          Selsun Blue 2 in 1 Treatment Shampoo 120ml   24.66
8                    Hansaplast Disney Frozen II 20's    8.90
9              Guardian Wet Wipes 10's Fragrance Free    3.46
10          Fruiser Shower Cream Pump Rosemilk 1000ml    8.10
11  Enchanteur Wonder Woman Handbag Edt Fighter Of...    9.90
12               Guardian Plastic Plasters 100s + 20s    9.10
13                          Sensodyne Fresh Mint 100g   12.50
14        Pantene Hair Fall Control Conditioner 165ML   10.89
15                Koolfever Cooling Gel For Babies 4s    8.60
16            Hada Labo Premium Whitening Essence 30g   85.90
17           Kinohimitsu J'pan Health Pad 10's + 10's   67.03
18           Ceradan Moisturising Hand Sanitiser 50ml   26.74
19                   Sunsweet Pitted Prune 340g (USA)   23.20
20                      **21st Century Probiotics 30s   17.00
21  Kundal Honey and Macadamia Hair Treatment Pear...   27.22
22      Sunsilk Super Conditioner Damage Rescue 180ml   11.28
23                    LACTOGG probiotic capsules 30's  125.10
24                              Rosken Bio Serum 50ml   28.82
25           Simple Kind To Skin Soothing Toner 200ml   21.34
26                  L’Oreal White Perfect Toner 200ml   30.25
27                            Total Image S Tummy 60s   63.00
28        Durex Invisible Extra Lubricant Condom 10's   51.13
29          3 Legs Tolnaftate Cream Pack Of 2 (2X10g)   14.66
30          Hansaplast Universal Water Resistant 20's    4.20
31     Perfume Generics Perfume Oil Paris Hilton 10Ml    8.90
32                Aiken Shampoo - Intense Repair 350G   12.68
33                            GoodMorning VGrains 1kg   62.52
34                        Woodwards Gripe Water 148ml   14.00
35          Difflam Hextra Sore Throat Lozenges 2.4mg    8.00
36                               Okamoto 003 Cool 3's   14.50
37                 Dettol Hand Sanitizer Refresh 50ml    6.25
38  Avene Pre-Serum Hydrating Essence-In-Lotion 200Ml   87.30
39  Guardian Essential Lavender Refreshing Body Wa...   10.10
Traceback (most recent call last):
  File "Lazada_Guardian.py", line 43, in <module>
    sl.scrape()
  File "Lazada_Guardian.py", line 30, in scrape
    driver.find_element(By.CSS_SELECTOR, ".ant-pagination-next > button").click()
  File "/Users/chingkarlok/opt/anaconda3/lib/python3.8/site-packages/selenium/webdriver/remote/webelement.py", line 94, in click
    self._execute(Command.CLICK_ELEMENT)
  File "/Users/chingkarlok/opt/anaconda3/lib/python3.8/site-packages/selenium/webdriver/remote/webelement.py", line 403, in _execute
    return self._parent.execute(command, params)
  File "/Users/chingkarlok/opt/anaconda3/lib/python3.8/site-packages/selenium/webdriver/remote/webdriver.py", line 440, in execute
    self.error_handler.check_response(response)
  File "/Users/chingkarlok/opt/anaconda3/lib/python3.8/site-packages/selenium/webdriver/remote/errorhandler.py", line 245, in check_response
    raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.ElementClickInterceptedException: Message: element click intercepted: Element <button class="ant-pagination-item-link" type="button" tabindex="-1">...</button> is not clickable at point (1186, 693). Other element would receive the click: <html lang="en" class=" ">...</html>
  (Session info: chrome=112.0.5615.137)
Stacktrace:
0   chromedriver                        0x000000010295d670 chromedriver + 4298352
1   chromedriver                        0x0000000102955bbc chromedriver + 4266940
2   chromedriver                        0x0000000102588758 chromedriver + 280408
3   chromedriver                        0x00000001025cb444 chromedriver + 554052
4   chromedriver                        0x00000001025c8e84 chromedriver + 544388
5   chromedriver                        0x00000001025c663c chromedriver + 534076
6   chromedriver                        0x00000001025c5530 chromedriver + 529712
7   chromedriver                        0x00000001025b8428 chromedriver + 476200
8   chromedriver                        0x00000001025b7b90 chromedriver + 474000
9   chromedriver                        0x00000001025fc080 chromedriver + 753792
10  chromedriver                        0x00000001025b62d0 chromedriver + 467664
11  chromedriver                        0x00000001025b7354 chromedriver + 471892
12  chromedriver                        0x000000010291d6c4 chromedriver + 4036292
13  chromedriver                        0x0000000102921c64 chromedriver + 4054116
14  chromedriver                        0x00000001029282d8 chromedriver + 4080344
15  chromedriver                        0x0000000102922970 chromedriver + 4057456
16  chromedriver                        0x00000001028f98dc chromedriver + 3889372
17  chromedriver                        0x000000010294125c chromedriver + 4182620
18  chromedriver                        0x00000001029413b4 chromedriver + 4182964
19  chromedriver                        0x00000001029500f4 chromedriver + 4243700
20  libsystem_pthread.dylib             0x00000001a0e2e06c _pthread_start + 148
21  libsystem_pthread.dylib             0x00000001a0e28e2c thread_start + 8
python selenium-webdriver web-scraping beautifulsoup analysis
1个回答
0
投票

可以通过两种方式解决这个问题。

  1. 使用动作链点击下一步按钮。
# Imports required
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

actions = ActionChains(driver)
wait = WebDriverWait(driver,30)
products_list = []
for i in range(102):
    nextbutton = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,".ant-pagination-next > button")))
    actions.move_to_element(nextbutton).click().perform()
    time.sleep(2)
  1. 在 URL 中使用
    page
    值并迭代。
for i in range(1,102):
    driver.get(f"https://www.lazada.com.my/guardian/?from=wangpu&langFlag=en&page={i}&pageTypeId=2&q=All-Products")
    ...
© www.soinside.com 2019 - 2024. All rights reserved.