网页抓取时无法导航到下一页

问题描述 投票:0回答:1

我试图提取作为数据分析师所需的技能来进行分析项目以了解最需要的技能

我的代码工作正常,直到导航到下一页以提取其余职位信息的部分

但是我已经尝试这样做了两天了

我不知道问题出在哪里

但我认为这是按钮选择器或其他东西

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import os
import csv
import time

options = Options()
options.binary_location = "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe"
driver_path = "C:\\Users\\Alaa\\Desktop\\chromedriver.exe"
driver = webdriver.Chrome(service=Service(driver_path), options=options)



def getJobInfo(job_link): 
    try:
        driver.get(job_link)
        wait = WebDriverWait(driver, 10)  # wait up to 10 seconds
        jobTitle = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'h1.css-f9uh36'))).text.strip()
        try:
            company = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'a.css-tdvcnh'))).text
        except Exception as e:
            companylist = driver.find_elements(By.CSS_SELECTOR, 'div.css-9iujih')
            company = companylist[0].text
        date = driver.find_element(By.CSS_SELECTOR, 'span.css-182mrdn').text.strip()
        job_type = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'a.css-g65o95'))).text
        location_element = wait.until(EC.presence_of_element_located((By.XPATH, '//strong[contains(@class, "css-9geu3q")]')))
        location = driver.execute_script("return arguments[0].textContent", location_element).split("-")[-1].strip()
        listt= driver.find_elements(By.CSS_SELECTOR, 'span.css-4xky9y')
        experience = listt[0].text if len(listt) > 0 else None
        salary = listt[3].text if len(listt) > 3 else None
        skills = driver.find_element(By.CSS_SELECTOR, 'div.css-1t5f0fr').text
        return {
            "Job Title": jobTitle,
            "Company": company,
            "Date": date,
            "Job type": job_type,
            "Location": location,
            "Experience": experience,
            "Salary": salary,
            "Skills": skills
        }
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

# Navigate to the main page
driver.get("https://wuzzuf.net/search/jobs/?q=data+analyst&a=na")

# Initialize an empty list to store all job information
allJobInfo = []


current_url = driver.current_url  # Initialize current_url

while True:
    wait = WebDriverWait(driver, 10)

    # Get the job detail page URLs
    jobsList = driver.find_elements(By.CSS_SELECTOR, 'div.css-1gatmva.e1v1l3u10')
    job_links = [job.find_element(By.CSS_SELECTOR, 'a.css-o171kl').get_attribute('href') for job in jobsList]

    print(f"Found {len(job_links)} job links on this page.")

    for job_link in job_links:
        print(f"Visiting job link: {job_link}")
        jobInfo = getJobInfo(job_link)
        if jobInfo is not None:
            allJobInfo.append(jobInfo)
            print(f"Successfully scraped job info from {job_link}")
        else:
            print(f"Failed to scrape job info from {job_link}")
        time.sleep(2)

    # Try to click the 'Next' button to go to the next page
    try:
        next_button = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'button.css-1rodp7n.ezfki8j0')))

        # Check if the 'Next' button is clickable (optional)
        if 'disabled' not in next_button.get_attribute('class'):
            print("Next button is enabled.")
            current_url = driver.current_url  # Store the current URL
            print("Current URL before clicking Next:", current_url)

            # Use JavaScript to click the 'Next' button
            driver.execute_script("arguments[0].click();", next_button)
            time.sleep(5)  # Wait for 5 seconds (adjust as needed)

            # Store the job links of the current page
            prev_job_links = job_links

            # Get the job links of the new page
            new_jobsList = driver.find_elements(By.CSS_SELECTOR, 'div.css-1gatmva.e1v1l3u10')
            new_job_links = [job.find_element(By.CSS_SELECTOR, 'a.css-o171kl').get_attribute('href') for job in new_jobsList]

            # Compare the job links
            if new_job_links != prev_job_links:
                print("Successfully navigated to the next page.")
            else:
                print("Failed to navigate to the next page.")

        else:
            print("Reached the last page. The 'Next' button is disabled.")
            break

    except TimeoutException:
        print("Reached the last page or element not found.")
        break
    except Exception as e:
        print(f"Failed to navigate to the next page. Error: {e}")
        break
    wait = WebDriverWait(driver, 10)

    # Get the job detail page URLs
    jobsList = driver.find_elements(By.CSS_SELECTOR, 'div.css-1gatmva.e1v1l3u10')
    job_links = [job.find_element(By.CSS_SELECTOR, 'a.css-o171kl').get_attribute('href') for job in jobsList]

    print(f"Found {len(job_links)} job links on this page.")
    counter = 0  # Initialize counter

    for job_link in job_links:
        if counter == 1:  # If counter is 1, break the loop
            break
        print(f"Visiting job link: {job_link}")
        jobInfo = getJobInfo(job_link)
        if jobInfo is not None:
            allJobInfo.append(jobInfo)
            print(f"Successfully scraped job info from {job_link}")
        else:
            print(f"Failed to scrape job info from {job_link}")
        time.sleep(2)
        counter += 1  # Increment counter at the end of each loop

    # Try to click the 'Next' button to go to the next page
    try:
        next_button = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'button.css-1rodp7n.ezfki8j0')))

        # Check if the 'Next' button is clickable (optional)
        if 'disabled' not in next_button.get_attribute('class'):
            print("Next button is enabled.")
            current_url = driver.current_url  # Store the current URL
            print("Current URL before clicking Next:", current_url)
            next_button.click()  # Click the 'Next' button
            time.sleep(5)  # Wait for 5 seconds (adjust as needed)

            # Store the job links of the current page
            prev_job_links = job_links

            # Get the job links of the new page
            new_jobsList = driver.find_elements(By.CSS_SELECTOR, 'div.css-1gatmva.e1v1l3u10')
            new_job_links = [job.find_element(By.CSS_SELECTOR, 'a.css-o171kl').get_attribute('href') for job in new_jobsList]

            # Compare the job links
            if new_job_links != prev_job_links:
                print("Successfully navigated to the next page.")
            else:
                print("Failed to navigate to the next page.")

        else:
            print("Reached the last page. The 'Next' button is disabled.")
            break

    except TimeoutException:
        print("Reached the last page or element not found.")
        break
    except Exception as e:
        print(f"Failed to navigate to the next page. Error: {e}")
        break
    wait = WebDriverWait(driver, 10)

    # Get the job detail page URLs
    jobsList = driver.find_elements(By.CSS_SELECTOR, 'div.css-1gatmva.e1v1l3u10')
    job_links = [job.find_element(By.CSS_SELECTOR, 'a.css-o171kl').get_attribute('href') for job in jobsList]

    print(f"Found {len(job_links)} job links on this page.")
    counter = 0  # Initialize counter

    for job_link in job_links:
        if counter == 1:  # If counter is 1, break the loop
            break
        print(f"Visiting job link: {job_link}")
        jobInfo = getJobInfo(job_link)
        if jobInfo is not None:
            allJobInfo.append(jobInfo)
            print(f"Successfully scraped job info from {job_link}")
        else:
            print(f"Failed to scrape job info from {job_link}")
        time.sleep(2)
        counter += 1  # Increment counter at the end of each loop

    # Try to click the 'Next' button to go to the next page
    try:
        next_button = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'button.css-1rodp7n.ezfki8j0')))


        # Check if the 'Next' button is clickable (optional)
        if 'disabled' not in next_button.get_attribute('class'):
            print("Next button is enabled.")
            current_url = driver.current_url  # Store the current URL
            print("Current URL before clicking Next:", current_url)
            next_button.click()  # Click the 'Next' button
            time.sleep(5)  # Wait for 5 seconds (adjust as needed)

        # Check for successful navigation (alternative approach)
            if wait.until(EC.url_to_be(current_url + "&page=2")):  # Check for specific URL pattern
                print("Successfully navigated to the next page.")
            else:
                print("Failed to navigate to the next page. URL pattern mismatch.")
        else:
            print("Reached the last page. The 'Next' button is disabled.")
            break

    except TimeoutException:
        print("Reached the last page or element not found.")
        break
    except Exception as e:
        print(f"Failed to navigate to the next page. Error: {e}")
        break


driver.quit()


keys = allJobInfo[0].keys() if allJobInfo else []  # Adjusting for empty jobInfo list
filename = "jobs.csv"
path = os.path.join(os.path.expanduser("~"), "Desktop", filename)

if allJobInfo:
    with open(path, "w", newline="", encoding="utf-8") as webscraping:
        dict_writer = csv.DictWriter(webscraping, keys)
        dict_writer.writeheader()
        for job in allJobInfo:
            match_encoded = {k: v.encode('utf-8').decode('utf-8') if isinstance(v, str) else v for k, v in job.items()}
            dict_writer.writerow(match_encoded)
        print(f"File '{filename}' saved to Desktop.")
else:
    print("No jobs found.")

这是我试图从 wuzzuf_website 提取数据的页面 我尝试了很多方法来选择下一个按钮,但没有任何效果

python selenium-webdriver web-scraping selenium-chromedriver data-analysis
1个回答
0
投票

在浏览器中尝试一下:

document.querySelectorAll('button.css-1rodp7n.ezfki8j0');

虽然第一个选择器针对具有特定类的按钮,但它可能不起作用,因为类名称似乎是动态生成的。

这是使用 HTML 结构中按钮路径的另一种方法:

document.querySelector("#app > div > div.css-1omce3u > div > div > div.css-if9uys > ul > li:last-child > button");

请注意,我已将

li:nth-child(7) 
更新为
li:last-child
,因为它应该是最后一个。浏览器发现它位于第七或第九位。

这个更具体的选择器根据按钮在 HTML 树中的位置来定位按钮。

如何找到这条路径:

  1. 打开浏览器的开发者工具(通常是 F12 键)。
  2. 检查目标按钮元素
  3. 右键单击目标按钮元素。
  4. 在 Chrome 上选择
    Copy
    ->
    Copy JS path
    (或类似的,具体取决于您的浏览器)。

对于第 4 点,您可以根据需要选择

Copy selector
Copy JS path

© www.soinside.com 2019 - 2024. All rights reserved.