Selenium 网络抓取脚本未返回预期结果

问题描述 投票:0回答:1

我有一个使用 Selenium 的 Python 脚本,用于从网站上抓取公司信息。该脚本昨天工作正常,但今天即使我没有对代码进行任何更改,它也没有返回任何结果。

当脚本执行搜索时,网页显示“无匹配项”或没有结果。但是,如果我在网站上手动执行相同的搜索,就会有可见的结果。

我不确定自己哪里做错了,也不清楚脚本为什么不再按预期工作。如有任何见解或建议,我将不胜感激。

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
from webdriver_manager.chrome import ChromeDriverManager
import re


# Company name that the script looks up.
nom_empresa = "LINEAS ESCOLARES Y TURISMO S.A.S"

# Address of the page the script opens.
url = "https://ruesfront.rues.org.co/"


def extrae_nit(nom_empresa, url):
    """Search the page at ``url`` for a company and return its identification.

    Opens a Firefox session, types ``nom_empresa`` into the element with id
    ``search``, clicks the search button and reads the text of the first
    element matching the featured result card. The card text is printed and
    then parsed line by line.

    Args:
        nom_empresa: Company name typed into the search box.
        url: Address of the page to open.

    Returns:
        The line of the result card's text that immediately follows the
        line "Identificación".

    Raises:
        selenium.common.exceptions.NoSuchElementException: If the search box,
            the search button or the result card is not found within the
            implicit wait.
        ValueError: If the card text has no "Identificación" line.
        IndexError: If "Identificación" is the last line of the card text.
    """
    options = webdriver.FirefoxOptions()
    driver = webdriver.Firefox(options=options)
    try:
        # An implicit wait only applies to lookups made after it is set, so
        # configure it before the first find_element call.
        driver.implicitly_wait(10)
        driver.get(url)

        driver.find_element(By.ID, "search").send_keys(nom_empresa)

        # The button is identified by several classes at once; a CSS selector
        # expresses that directly instead of a dotted By.CLASS_NAME value.
        driver.find_element(
            By.CSS_SELECTOR,
            ".d-none.d-sm-block.btn.btn-primary.input-group-append"
            ".btn-busqueda.busqueda__button--xs",
        ).click()

        # Allow more time for the result card than for the static controls.
        driver.implicitly_wait(30)
        text = driver.find_element(
            By.CSS_SELECTOR, ".row.card-result.p-4.bg-featured"
        ).text
    finally:
        # Always close the browser, even when a lookup above fails.
        driver.quit()

    print(text)

    # The value is the line right after its "Identificación" label.
    result = text.split('\n')
    id_index = result.index("Identificación")
    nit = result[id_index + 1]

    return nit


# Run the lookup for the configured company and show the value it returns.
print(extrae_nit(nom_empresa=nom_empresa, url=url))

results

enter image description here

python selenium-webdriver web-scraping firefox
1个回答
0
投票

这是获取该信息的一种经过测试的方法:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Command-line switches passed to the Chrome instance.
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument('disable-notifications')
chrome_options.add_argument("window-size=1280,1080")

# Labels shown on the result card, keyed by the name each value gets in
# comp_data. Insertion order fixes the order of the printed dict.
field_labels = {
    'sigla': 'Sigla',
    'identif_code': 'Identificación',
    'inscrip_code': 'Numero de Inscripción',
    'category': 'Categoria',
}

# The context manager quits the browser even if a wait below times out.
with webdriver.Chrome(options=chrome_options) as driver:
    wait = WebDriverWait(driver, 15)

    url = 'https://ruesfront.rues.org.co/'
    driver.get(url)
    comp_data = {}

    # Wait until the controls can actually be interacted with, not merely
    # present in the DOM, and locate the search box only once.
    search_box = wait.until(EC.element_to_be_clickable((By.XPATH, '//input[@id="search"]')))
    search_box.click()
    search_box.send_keys('LINEAS ESCOLARES Y TURISMO S.A.S')
    wait.until(EC.element_to_be_clickable((By.XPATH, '//i[@class="bi bi-search ps-2"]'))).click()

    # Each value is the <span> that follows the <p> holding its label.
    for key, label in field_labels.items():
        value_xpath = f'//p[text()="{label}"]//following-sibling::span'
        comp_data[key] = wait.until(EC.presence_of_element_located((By.XPATH, value_xpath))).text
    print(comp_data)

终端结果:

{'sigla': 'LIDERTUR S.A.S', 'identif_code': '800126471-1', 'inscrip_code': '38610', 'category': 'Sociedad ó persona juridica principal ó esal'}

Selenium 文档可以在这里找到。

© www.soinside.com 2019 - 2024. All rights reserved.