When scraping a website, can I use Selenium (Python) to search for specific keywords in the search bar?


I need to scrape some pages with Selenium. Before scraping, I need to search the site for specific keywords and then scrape everything related to them. The problem is that I can't always use boolean operators to search for all keywords at once, so I need to search for a keyword like "big data", scrape the content on the search results page, click each article's URL to scrape the full article content, do this for every article, and then go back and search for a new keyword.

I already have code that works for some of the sites I need to scrape, but it is missing the part where I search for each keyword:

from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd

def scrape_page(url):
    try:
        # Open the website in the browser
        driver.get(url)
        driver.maximize_window()

        # Handle the cookie banner, if present
        try:
            cookie_banner = driver.find_element(By.XPATH, "")
            cookie_banner.click()
        except:
            pass

        # Set an implicit wait of 10 seconds to make sure the page is fully loaded
        driver.implicitly_wait(10)

        # Find all the "Continua a leggere" elements
        elements = driver.find_elements(By.XPATH, "")
        # List to store the extracted data
        data = []

        # Click each element
        for index, element in enumerate(elements):
            try:
                # Get the article URL and title
                article_url = driver.find_element(By.XPATH, "'(])["+str(index+1)+"]").get_attribute("href")
                article_title = element.find_element(By.XPATH, "'(])["+str(index+1)+"]").text
                # Click the element
                driver.find_element(By.XPATH, "'])["+str(index+1)+"]").click()
                # Get the content of the "Continua a leggere" landing page
                article_content = driver.find_element(By.XPATH, "").text
                # Get the article date
                article_date = driver.find_element(By.XPATH, "").text
                # Append the data to the list
                data.append({'Titolo': article_title, 'Data': article_date, 'URL': article_url, 'Contenuto': article_content})
                # Go back to the previous page
                driver.back()
            except Exception as e:
                print("Error while clicking the element:", str(e))
    except Exception as e:
        print("Error while scraping the page:", str(e))
        return None

    return data

# Create an instance of the browser driver
driver = webdriver.Chrome()

# URL of the website you want to scrape
start_url = "https://www.salute.gov.it/portale/home.html"

# List to store all the data extracted from every page
all_data = []

# Loop over pages while there is a next page
while start_url:
    print("Scraping:", start_url)
    page_data = scrape_page(start_url)
    if page_data:
        all_data.extend(page_data)

    try:
        # Look for the link to the next page
        next_page_link = driver.find_element(By.XPATH, "")
        # Extract the URL of the next page
        start_url = next_page_link.get_attribute("href")
    except:
        # If there are no more pages, stop the loop
        start_url = None

# Close the browser
driver.quit()

# Build a pandas DataFrame with all the extracted data
df = pd.DataFrame(all_data)

# Display the DataFrame
print(df)

df.to_excel("")

Can someone help me update my code? Thanks in advance.

python selenium-webdriver web-scraping
1 Answer

To search:

  • First load the main page
  • Next find the search bar
  • Next use send_keys() to send the text to this search bar
  • Then use send_keys() to send Keys.ENTER

It should redirect to a results page, whose URL you can read with driver.current_url:

driver.get(start_url)

searchbar = driver.find_element(By.ID, "f_cerca")
searchbar.send_keys(word)
searchbar.send_keys(Keys.ENTER)

print(driver.current_url)
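
The results page may take a moment to load. Instead of sleeping for a fixed time (as the full code below does with time.sleep(5)), you could wait explicitly for the results list to appear; a minimal sketch, reusing the dl.simple-list.results selector from the full code below:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 10 seconds for the results list to be present in the DOM
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "dl.simple-list.results"))
)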

Later you should run your function scrape_page(), but without the .get(url) inside it.

And you should run scrape_page() in a loop that uses the search results' link to the next page, loading each results page with .get() (don't do this inside scrape_page(), and don't reload the main page between results pages).

All of this code should sit inside a for loop that runs everything once per keyword.
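
In outline, the whole flow looks like this (a sketch; search_for() and next_results_url() are hypothetical helpers standing in for the steps shown in the full code below):

# Sketch of the overall flow. `search_for()` and `next_results_url()` are
# hypothetical helpers standing in for steps shown in the full code below.
for word in keywords:                              # one pass per keyword
    driver.get(start_url)                          # load the main page
    search_for(driver, word)                       # keyword into the search bar + ENTER
    search_results_url = driver.current_url        # URL of the results page
    while search_results_url:                      # one pass per results page
        page_data = scrape_page(driver, word)      # scraping only, no .get(url) inside
        all_data.extend(page_data or [])
        driver.get(search_results_url)             # back to the results page
        search_results_url = next_results_url(driver)  # None when there is no next page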


Minimal working code with a few changes:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
#from selenium.webdriver.support.ui import WebDriverWait
#from selenium.webdriver.support import expected_conditions as EC
#from selenium.common.exceptions import NoSuchElementException, TimeoutException

import time

# ---

import selenium
print('Selenium:', selenium.__version__)

# ---

def scrape_page(driver, keyword):
    try:

        # Handle the cookie banner, if present
        try:
            print('Clicking cookie banner')
            cookie_banner = driver.find_element(By.XPATH, "//a[b[text()='Chiudi']]")
            cookie_banner.click()
        except Exception as e:
            print('Exception:', e)

        # Find all the "Continua a leggere" result elements
        elements_dt = driver.find_elements(By.CSS_SELECTOR, "dl.simple-list.results dt")
        #elements_dd = driver.find_elements(By.XPATH, "//dl[@class='simple-list.results']/dd/a")
        
        print('[DEBUG] len(elements_dt):', len(elements_dt))
        # List to store the extracted data
        data = []

        # Collect the URL and title of each result element
        #for index, (element_dt, element_dd) in enumerate(zip(elements_dt, elements_dd), 1):
        for index, element in enumerate(elements_dt, 1):  # `enumerate(..., 1)` starts `index` at `1`
            
            try:
                article_url = element.find_element(By.XPATH, './/a').get_attribute("href")
                article_title = element.text
                
                # ... DON'T CLICK THE LINKS BECAUSE IT WILL REMOVE THE CURRENT PAGE FROM MEMORY
                # ... AND YOU WILL LOSE ACCESS TO THE OTHER `elements` ON THE CURRENT PAGE
                # ...
                # ... Get `href` and later (after the loop) use `.get(href)` to access the subpages.
                
                data.append({
                    'keyword': keyword,
                    'Titolo': article_title, 
                    'URL': article_url, 
                    #'Data': article_date, 
                    #'Contenuto': article_content
                })
                
                print('[DEBUG] data:', data[-1])
                # Go back to the previous page (no longer needed)
                #driver.back()
            except Exception as e:
                print("Error while processing the element:", e)
                
        # work with subpages

        for item in data:
            print('[DEBUG] subpage:', item['URL'])
            driver.get(item['URL'])
            #article_date = ...
            #article_content = ...
            #item['Data'] = article_date
            #item['Contenuto'] = article_content
             
    except Exception as e:
        print("Errore durante lo scraping della pagina:", e)
        return None

    return data

# --- main ---

driver = webdriver.Chrome()
driver.maximize_window()
driver.implicitly_wait(10)

# ---

start_url = "https://www.salute.gov.it/portale/home.html"

all_data = []

keywords = ['ukraina', 'covid-19', 'elon musk']

for word in keywords:

    print("Main Page:", start_url)

    # open main page 
    driver.get(start_url)

    # find searchbar
    print('Search:', word)
    searchbar = driver.find_element(By.ID, "f_cerca")
    # put keyword in searchbar and press ENTER
    searchbar.send_keys(word)
    searchbar.send_keys(Keys.ENTER)
    
    time.sleep(5) # wait for results
    
    # get the current url (because the site could load a different URL to show the results)
    search_results_url = driver.current_url
    
    # start scraping results (with pagination):
    #while True:  # try to get all pages
    for _ in range(3):  # try to get only 3 pages
        print("Scraping:", search_results_url)
        
        page_data = scrape_page(driver, word)  # <--- only scraping, without `.get(url)`; `word` is passed only to add it to `data`
        
        if page_data:
            all_data.extend(page_data)

        driver.get(search_results_url) # go back to the results page after visiting subpages - to get the link to the next page
        
        try:
            next_page_link = driver.find_element(By.XPATH, "//a[contains(text(), 'Successive')]")
            search_results_url = next_page_link.get_attribute("href")
            driver.get(search_results_url)  # <--- open next page with results using URL
            #next_page_link.click()   # <--- or click link 
        except Exception as e:
            print('[DEBUG] Exception:', e)
            print('[DEBUG] break')
            #input('Press ENTER to continue')
            break  # exit loop
            
driver.quit()

import pandas as pd
df = pd.DataFrame(all_data)
print(df)

input("Press ENTER to close")