我需要使用 Selenium 抓取一些页面。在抓取之前,我需要在网站内搜索特定关键字,并抓取与这些关键字相关的所有内容,如下所示: 网站示例 问题是我不能总是使用布尔运算符一次搜索所有关键字,所以我需要逐个搜索像“大数据”这样的关键字,抓取搜索结果页中的内容,点击每篇文章的 URL 来抓取完整的正文内容,对所有文章执行此操作,然后返回并搜索新的关键字。
我已经有一段适用于部分目标网站的代码,但它缺少对每个关键字逐一搜索的部分:
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
def scrape_page(url):
    """Open ``url`` in the (module-global) driver and scrape every
    "Continua a leggere" article on the page.

    For each article it collects title, date, URL and full content, clicking
    through to the article page and navigating back afterwards.

    Returns a list of dicts with keys 'Titolo', 'Data', 'URL', 'Contenuto',
    or ``None`` when the page itself could not be scraped.

    NOTE(review): the XPath locator strings below are empty/garbled
    placeholders and must be filled in with the real selectors for the
    target site before this function can work.
    """
    try:
        # Open the website in the browser
        driver.get(url)
        driver.maximize_window()
        # Dismiss the cookie banner, if present (best-effort: ignore failures)
        try:
            cookie_banner = driver.find_element(By.XPATH, "")
            cookie_banner.click()
        except Exception:
            pass
        # Implicit wait (up to 10 s) so elements get a chance to load
        driver.implicitly_wait(10)
        # Find all "Continua a leggere" elements
        elements = driver.find_elements(By.XPATH, "")
        # List accumulating the extracted records
        data = []
        # Click each element in turn
        for index, element in enumerate(elements):
            try:
                # Re-locate URL and title by 1-based position, because
                # navigating away invalidates previously found elements
                article_url = driver.find_element(By.XPATH, "'(])["+str(index+1)+"]").get_attribute("href")
                article_title = element.find_element(By.XPATH, "'(])["+str(index+1)+"]").text
                # Click through to the article page
                driver.find_element(By.XPATH, "'])["+str(index+1)+"]").click()
                # Full article body from the "Continua a leggere" landing page
                article_content = driver.find_element(By.XPATH, "").text
                # Publication date of the article
                article_date = driver.find_element(By.XPATH, "").text
                # Append the record to the result list
                data.append({'Titolo': article_title, 'Data': article_date, 'URL': article_url, 'Contenuto': article_content})
                # Go back to the listing page
                driver.back()
            except Exception as e:
                print("Errore durante il clic sull'elemento:", str(e))
    except Exception as e:
        print("Errore durante lo scraping della pagina:", str(e))
        return None
    return data
# --- main: scrape the start page and follow pagination ---
# Create the browser driver instance
driver = webdriver.Chrome()
# Website URL to start scraping from
start_url = "https://www.salute.gov.it/portale/home.html"
# Accumulates the records extracted from every page
all_data = []
# Loop over pages while a "next page" link exists
while start_url:
    print("Scraping:", start_url)
    page_data = scrape_page(start_url)
    if page_data:
        all_data.extend(page_data)
    try:
        # Look for the link to the next page
        # (NOTE(review): empty XPath placeholder -- fill in the real locator)
        next_page_link = driver.find_element(By.XPATH, "")
        # Extract the next page's URL
        start_url = next_page_link.get_attribute("href")
    except Exception:
        # No more pages: stop the loop
        start_url = None
# Close the browser
driver.quit()
# Build a pandas DataFrame with everything collected
df = pd.DataFrame(all_data)
# Show the DataFrame
print(df)
# NOTE(review): empty path placeholder -- supply a real .xlsx filename
df.to_excel("")
有人可以帮我更新我的代码吗? 先谢谢你了
搜索:先定位搜索栏,用 send_keys() 把关键字文本发送到搜索栏,再用 send_keys() 发送 Keys.ENTER 回车键。页面应该会重定向到结果页,结果页的 URL 可以通过 driver.current_url 获取:
# Load the start page, type the keyword into the search bar, press ENTER,
# and print the URL of the results page the site redirects to.
driver.get(start_url)
searchbar = driver.find_element(By.ID, "f_cerca")
searchbar.send_keys(word)
searchbar.send_keys(Keys.ENTER)
print(driver.current_url)
之后你应该运行你的 scrape_page() 函数,但不要在函数内部调用 .get(url)。你应该把 scrape_page() 放在一个循环里运行,利用指向下一页的链接来翻页;下一页应该在循环代码中用 .get() 加载(不要在 scrape_page() 内部执行 .get(),也不要重新加载主页)。所有这些代码都应该放在一个 for 循环中,对每个不同的关键字依次执行同样的流程。
经过一些更改的最小工作代码:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
#from selenium.webdriver.support.ui import WebDriverWait
#from selenium.webdriver.support import expected_conditions as EC
#from selenium.common.exceptions import NoSuchElementException, TimeoutException
import time
# ---
# Print the Selenium version so runs are reproducible/debuggable
import selenium
print('Selenium:', selenium.__version__)
# ---
def scrape_page(driver, keyword):
    """Scrape one page of search results already loaded in ``driver``.

    First collects keyword/title/URL for every result entry on the current
    page, then (after the listing loop) visits each article URL so element
    references never go stale.

    Returns the list of record dicts, or ``None`` when scraping the page
    fails entirely. ``keyword`` is only recorded in each row.
    """
    try:
        # Dismiss the cookie banner, if present (best-effort)
        try:
            print('Clicking cookie banner')
            cookie_banner = driver.find_element(By.XPATH, "//a[b[text()='Chiudi']]")
            cookie_banner.click()
        except Exception as e:
            print('Exception:', e)
        # Find all result entries ("Continua a leggere" items)
        elements_dt = driver.find_elements(By.CSS_SELECTOR, "dl.simple-list.results dt")
        #elements_dd = driver.find_elements(By.XPATH, "//dl[@class='sample-list.results']/dd/a")
        print('[DEBUG] len(elements_dt):', len(elements_dt))
        # List accumulating the extracted records
        data = []
        # `enumerate(..., 1)` starts `index` at 1
        for index, element in enumerate(elements_dt, 1):
            try:
                article_url = element.find_element(By.XPATH, './/a').get_attribute("href")
                article_title = element.text
                # DON'T click the links here: navigating away removes the
                # current page from memory and every other `element`
                # reference on it becomes stale. Collect the `href`s now
                # and visit the subpages with `.get(href)` after the loop.
                data.append({
                    'keyword': keyword,
                    'Titolo': article_title,
                    'URL': article_url,
                    #'Data': article_date,
                    #'Contenuto': article_content
                })
                print('[DEBUG] data:', data[-1])
            except Exception as e:
                print("Errore durante il clic sull'elemento:", e)
        # Visit the collected article subpages (fill in scraping as needed)
        for item in data:
            print('[DEBUG] subpage:', item['URL'])
            driver.get(item['URL'])
            #article_date = ...
            #article_content = ...
            #item['Data'] = article_date
            #item['Contenuto'] = article_content
    except Exception as e:
        print("Errore durante lo scraping della pagina:", e)
        return None
    return data
# --- main: for each keyword, search the site, then scrape results with pagination ---
driver = webdriver.Chrome()
driver.maximize_window()
driver.implicitly_wait(10)
# ---
start_url = "https://www.salute.gov.it/portale/home.html"
all_data = []
keywords = ['ukraina', 'covid-19', 'elan musk']
for word in keywords:
    print("Main Page:", start_url)
    # Open the main page
    driver.get(start_url)
    # Find the search bar
    print('Search:', word)
    searchbar = driver.find_element(By.ID, "f_cerca")
    # Type the keyword into the search bar and press ENTER
    searchbar.send_keys(word)
    searchbar.send_keys(Keys.ENTER)
    time.sleep(5)  # wait for the results to load
    # Grab the current URL (the site may redirect to a different URL for results)
    search_results_url = driver.current_url
    # Scrape the results, following pagination:
    #while True:  # try to get all pages
    for _ in range(3):  # try to get only 3 pages
        print("Scraping:", search_results_url)
        # Only scraping here, no `.get(url)` inside the function;
        # `word` is passed just so it is recorded in each data row
        page_data = scrape_page(driver, word)
        if page_data:
            all_data.extend(page_data)
        # Go back to the results page after visiting subpages,
        # to find the link to the next page
        driver.get(search_results_url)
        try:
            next_page_link = driver.find_element(By.XPATH, "//a[contains(text(), 'Successive')]")
            search_results_url = next_page_link.get_attribute("href")
            driver.get(search_results_url)  # open the next results page by URL
            #next_page_link.click()  # ... or click the link instead
        except Exception as e:
            print('[DEBUG] Exception:', e)
            print('[DEBUG] break')
            #input('Press ENTER to continue')
            break  # no next page: stop paginating
driver.quit()
import pandas as pd
df = pd.DataFrame(all_data)
print(df)
input("Press ENTER to close")