我试图提取作为数据分析师所需的技能来进行分析项目以了解最需要的技能
我的代码工作正常,直到导航到下一页以提取其余职位信息的部分
但是我已经尝试这样做了两天了
我不知道问题出在哪里
但我认为这是按钮选择器或其他东西
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import os
import csv
import time
# --- WebDriver setup -------------------------------------------------------
# Drive the Brave browser (Chromium-based) through chromedriver.
# NOTE(review): both paths are machine-specific; adjust for your own setup.
options = Options()
options.binary_location = "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe"
driver_path = "C:\\Users\\Alaa\\Desktop\\chromedriver.exe"
# Module-level global: getJobInfo() and the main loop below both use `driver`.
driver = webdriver.Chrome(service=Service(driver_path), options=options)
def getJobInfo(job_link):
    """Open a Wuzzuf job-detail page and scrape its fields.

    Navigates the module-level ``driver`` to *job_link* (side effect: the
    browser leaves whatever page it was on).

    Parameters
    ----------
    job_link : str
        URL of a job-detail page.

    Returns
    -------
    dict | None
        The scraped fields, or ``None`` when any required element is
        missing or times out.

    NOTE(review): the ``css-*`` class names are build-generated and break
    whenever Wuzzuf redeploys; structural/aria-based selectors would be
    more resilient.
    """
    try:
        driver.get(job_link)
        wait = WebDriverWait(driver, 10)  # up to 10 s per locator
        jobTitle = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'h1.css-f9uh36'))
        ).text.strip()
        # Company name: anchor in the usual layout, div fallback otherwise.
        try:
            company = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'a.css-tdvcnh'))
            ).text
        except TimeoutException:  # narrowed: wait.until raises this on timeout
            company = driver.find_elements(By.CSS_SELECTOR, 'div.css-9iujih')[0].text
        date = driver.find_element(By.CSS_SELECTOR, 'span.css-182mrdn').text.strip()
        job_type = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'a.css-g65o95'))
        ).text
        location_element = wait.until(
            EC.presence_of_element_located(
                (By.XPATH, '//strong[contains(@class, "css-9geu3q")]')
            )
        )
        # textContent via JS (works even if partially hidden); the city is
        # the text after the last "-" separator.
        location = driver.execute_script(
            "return arguments[0].textContent", location_element
        ).split("-")[-1].strip()
        detail_spans = driver.find_elements(By.CSS_SELECTOR, 'span.css-4xky9y')
        # Positional access into the detail spans — assumes experience is
        # first and salary fourth; TODO confirm against the page layout.
        experience = detail_spans[0].text if len(detail_spans) > 0 else None
        salary = detail_spans[3].text if len(detail_spans) > 3 else None
        skills = driver.find_element(By.CSS_SELECTOR, 'div.css-1t5f0fr').text
        return {
            "Job Title": jobTitle,
            "Company": company,
            "Date": date,
            "Job type": job_type,
            "Location": location,
            "Experience": experience,
            "Salary": salary,
            "Skills": skills,
        }
    except Exception as e:
        # Any missing element aborts this job; the caller treats None as "skip".
        print(f"An error occurred: {e}")
        return None
# Navigate to the search-results page.
driver.get("https://wuzzuf.net/search/jobs/?q=data+analyst&a=na")

allJobInfo = []  # accumulated dicts returned by getJobInfo()
wait = WebDriverWait(driver, 10)

while True:
    # Remember where the results page lives BEFORE visiting detail pages:
    # getJobInfo() calls driver.get(job_link), so after the for-loop below
    # the browser is on the last job-detail page — the original code then
    # looked for the "Next" button there and always failed.
    results_url = driver.current_url

    # Collect the detail-page URLs shown on the current results page.
    jobsList = driver.find_elements(By.CSS_SELECTOR, 'div.css-1gatmva.e1v1l3u10')
    job_links = [
        job.find_element(By.CSS_SELECTOR, 'a.css-o171kl').get_attribute('href')
        for job in jobsList
    ]
    print(f"Found {len(job_links)} job links on this page.")

    for job_link in job_links:
        print(f"Visiting job link: {job_link}")
        jobInfo = getJobInfo(job_link)
        if jobInfo is not None:
            allJobInfo.append(jobInfo)
            print(f"Successfully scraped job info from {job_link}")
        else:
            print(f"Failed to scrape job info from {job_link}")
        time.sleep(2)  # be polite to the server

    # Return to the results page so the pager is back in the DOM.
    driver.get(results_url)

    # Try to click the 'Next' button to go to the next page.
    # NOTE(review): 'button.css-1rodp7n.ezfki8j0' is a generated class; a
    # structural selector such as the pager's 'ul > li:last-child > button'
    # survives redeploys better.
    try:
        next_button = wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'button.css-1rodp7n.ezfki8j0')
            )
        )
        if 'disabled' in (next_button.get_attribute('class') or ''):
            print("Reached the last page. The 'Next' button is disabled.")
            break
        print("Next button is enabled.")
        print("Current URL before clicking Next:", results_url)
        # JS click avoids "element not interactable" on overlaid buttons.
        driver.execute_script("arguments[0].click();", next_button)
        time.sleep(5)  # let the next page render (adjust as needed)
        # Verify navigation: identical links mean the click did nothing.
        new_jobsList = driver.find_elements(
            By.CSS_SELECTOR, 'div.css-1gatmva.e1v1l3u10'
        )
        new_job_links = [
            job.find_element(By.CSS_SELECTOR, 'a.css-o171kl').get_attribute('href')
            for job in new_jobsList
        ]
        if new_job_links == job_links:
            # Original code only printed here and looped forever; stop instead.
            print("Failed to navigate to the next page.")
            break
        print("Successfully navigated to the next page.")
    except TimeoutException:
        print("Reached the last page or element not found.")
        break
    except Exception as e:
        print(f"Failed to navigate to the next page. Error: {e}")
        break
driver.quit()

# --- Export ----------------------------------------------------------------
# Write the scraped jobs to a CSV file on the user's Desktop.
filename = "jobs.csv"
path = os.path.join(os.path.expanduser("~"), "Desktop", filename)
if allJobInfo:
    # Column order comes from the first scraped job's dict keys.
    keys = allJobInfo[0].keys()
    with open(path, "w", newline="", encoding="utf-8") as outfile:
        dict_writer = csv.DictWriter(outfile, keys)
        dict_writer.writeheader()
        # The original per-value encode('utf-8').decode('utf-8') round-trip
        # was a no-op; the file encoding above already handles Unicode.
        dict_writer.writerows(allJobInfo)
    print(f"File '{filename}' saved to Desktop.")
else:
    print("No jobs found.")
这是我试图从 wuzzuf 网站提取数据的页面。我尝试了很多方法来选择“下一页”按钮,但都没有任何效果。
在浏览器中尝试一下:
document.querySelectorAll('button.css-1rodp7n.ezfki8j0');
虽然第一个选择器针对具有特定类的按钮,但它可能不起作用,因为类名称似乎是动态生成的。
这是使用 HTML 结构中按钮路径的另一种方法:
document.querySelector("#app > div > div.css-1omce3u > div > div > div.css-if9uys > ul > li:last-child > button");
请注意,我已将 `li:nth-child(7)` 更新为 `li:last-child`,因为“下一页”按钮应该始终是分页列表中的最后一项(具体位于第七位还是第九位取决于当前显示的页码数量)。
这个更具体的选择器根据按钮在 HTML 树中的位置来定位按钮,不依赖动态生成的类名。
如何找到这条路径:在浏览器开发者工具中右键单击目标元素,依次选择 Copy -> Copy JS path(不同浏览器的菜单名称可能略有差异);您也可以根据需要选择 Copy selector 或 Copy JS path。