我正在尝试从(https://ec.europa.eu/info/funding-tenders/opportunities/portal/screen/opportunities/calls-for-proposals?isExactMatch=true&status=31094501,提取融资机会的数据, 31094502&order=DESC&pageNumber=1&pageSize=50&sortBy=startDate)网站,但 Selenium 不断恢复到首页。我认为点击确实有效,但它只是一次又一次地向我显示第一页。
还有 10 个页面我应该从中提取数据。我还尝试格式化 url 以增加 URL 内的 pageNumber=1,但没有效果。当我通过链接手动增加页面时,网页本身不断返回到 pageNumber 1。我不知道该怎么办。
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException, NoSuchElementException
import pandas as pd
import csv
import time
driver = webdriver.Chrome()
def accept_cookies():
try:
cookies_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "a.wt-ecl-button.wt-ecl-button--primary.cck-actions-button")))
cookies_button.click()
print("Cookies accepted.")
except Exception as e:
print("Error accepting cookies:", e)
def accept_button(driver):
try:
button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//button[@type='button' and contains(@class,'eui-button') and contains(@class,'eui-button--basic') and contains(@class,'eui-button--icon-only') and contains(@class,'eui-button--rounded')]")))
button.click()
print("Button accepted.")
except Exception as e:
print("Error accepting button:", e)
# Accept cookies if present
while True:
url = "https://ec.europa.eu/info/funding-tenders/opportunities/portal/screen/opportunities/calls-for-proposals?order=DESC&pageNumber=1&pageSize=10&isExactMatch=true"
driver.get(url)
accept_cookies()
# Wait until the elements are visible
wait = WebDriverWait(driver, 20)
opportunity_elements = wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, 'a.eui-u-text-link.eui-u-font-l.eui-u-font-regular')))
deadline_elements = driver.find_elements(By.CSS_SELECTOR, '.eui-u-display-block.eui-u-mt-xs.ng-star-inserted')
url_elements = driver.find_elements(By.CSS_SELECTOR, 'a.eui-u-text-link')
#Extract data from the current page
for i in range(len(opportunity_elements)):
# Name
try:
opportunity_name = opportunity_elements[i].text
k += 1
except StaleElementReferenceException:
continue
# Deadline
try:
opportunity_deadline = deadline_elements[i].text if i < len(deadline_elements) else "Deadline not available"
except StaleElementReferenceException:
opportunity_deadline = "No Deadline"
# URL
try:
opportunity_url = url_elements[i].get_attribute('href')
except StaleElementReferenceException:
opportunity_url = "No URL Available"
data_list.append({
'Opportunity Name': opportunity_name,
'Deadline': opportunity_deadline,
'URL': opportunity_url
})
print(f"Opportunity: {opportunity_name}, Deadline: {opportunity_deadline}, URL: {opportunity_url}")
#Check if there is a next page
accept_button(driver)
driver.quit()
这是保留并不断提取第一页的代码。
此代码片段成功单击“下一页”按钮。
步骤:
代码
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
url = "https://ec.europa.eu/info/funding-tenders/opportunities/portal/screen/opportunities/calls-for-proposals?isExactMatch=true&status=31094501,31094502&order=DESC&pageNumber=1&pageSize=50&sortBy=startDate"
driver = webdriver.Chrome()
driver.maximize_window()
driver.get(url)
wait = WebDriverWait(driver, 10)
next_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "eui-icon-svg[aria-label='Go to next page']")))
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
next_button.click()