我有一个使用 Selenium 的 Python 脚本,用于从网站上抓取公司信息。该脚本昨天工作正常,但今天即使我没有对代码进行任何更改,它也没有返回任何结果。
当脚本执行搜索时,网页显示“无匹配项”或没有结果。但是,如果我在网站上手动执行相同的搜索,就会有可见的结果。
我不确定我做错了什么或者为什么脚本不再按预期工作。任何见解或建议将不胜感激。
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
from webdriver_manager.chrome import ChromeDriverManager
import re
# Landing page of the site that is searched; passed to extrae_nit() as `url`.
url = "https://ruesfront.rues.org.co/"
# Company name typed into the site's search box; passed to extrae_nit() as `nom_empresa`.
nom_empresa = "LINEAS ESCOLARES Y TURISMO S.A.S"
def _css_from_classes(class_attr):
    """Turn a space-separated class attribute into a compound CSS selector."""
    return "." + ".".join(class_attr.split())


def _read_result_card(driver, nom_empresa):
    """Type *nom_empresa* into the search box, submit, and return the first result card's text."""
    # Imported locally so the function does not depend on extra file-level imports.
    from selenium.common.exceptions import NoSuchElementException, TimeoutException

    # NOTE(review): these selectors mirror the page markup at the time of
    # writing; if the site is redesigned they have to be updated.
    button_css = _css_from_classes(
        "d-none d-sm-block btn btn-primary input-group-append btn-busqueda busqueda__button--xs"
    )
    card_css = _css_from_classes("row card-result p-4 bg-featured")

    try:
        # Wait for each element explicitly instead of looking it up immediately:
        # the very first lookup used to race against the page load.
        search_box = WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.ID, "search"))
        )
        search_box.send_keys(nom_empresa)
        WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, button_css))
        ).click()
        card = WebDriverWait(driver, 30).until(
            EC.visibility_of_element_located((By.CSS_SELECTOR, card_css))
        )
        return card.text
    except TimeoutException as exc:
        # Keep the exception type that a failed find_element() lookup raises,
        # so existing `except NoSuchElementException` handlers still work.
        raise NoSuchElementException(
            f"Timed out waiting for the search form or the results for {nom_empresa!r}"
        ) from exc


def _parse_nit(card_text):
    """Return the line that follows the "Identificación" label in *card_text*."""
    lines = card_text.split("\n")
    # Searching lines[:-1] also rejects a label that sits on the last line,
    # which would otherwise surface as an unexplained IndexError.
    if "Identificación" not in lines[:-1]:
        raise ValueError('No value found after the "Identificación" label')
    return lines[lines.index("Identificación") + 1]


def extrae_nit(nom_empresa, url):
    """Search *url* for a company and return its identification number.

    Opens Firefox, types *nom_empresa* into the search box, submits the
    search, prints the text of the first result card, and returns the line
    that follows the "Identificación" label in that text.

    Args:
        nom_empresa: Company name to search for.
        url: Address of the search page.

    Returns:
        The identification value as a string.

    Raises:
        NoSuchElementException: The search box, the search button, or a
            result card did not show up in time.
        ValueError: The result card has no value after "Identificación".
    """
    options = webdriver.FirefoxOptions()
    driver = webdriver.Firefox(options=options)
    try:
        driver.get(url)
        text = _read_result_card(driver, nom_empresa)
        print(text)
    finally:
        # Always close the browser, even when a lookup fails, so no Firefox
        # process is left behind.
        driver.quit()
    return _parse_nit(text)
# Run the lookup for the sample company defined above and print the returned value.
print(extrae_nit(nom_empresa,url))
这是获取该信息的一种经过测试的方法:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Chrome start-up flags, applied in the order listed.
chrome_options = Options()
for chrome_flag in ("--no-sandbox", 'disable-notifications', "window-size=1280,1080"):
    chrome_options.add_argument(chrome_flag)

# XPath of the search input; it is located twice (once to click, once to type).
search_input_xpath = '//input[@id="search"]'

# Output key -> XPath of the <span> that follows the matching label <p>.
field_xpaths = {
    'sigla': '//p[text()="Sigla"]//following-sibling::span',
    'identif_code': '//p[text()="Identificación"]//following-sibling::span',
    'inscrip_code': '//p[text()="Numero de Inscripción"]//following-sibling::span',
    'category': '//p[text()="Categoria"]//following-sibling::span',
}

with webdriver.Chrome(options=chrome_options) as driver:
    # Every lookup below waits up to 15 seconds for its element.
    wait = WebDriverWait(driver, 15)
    url = 'https://ruesfront.rues.org.co/'
    driver.get(url)
    comp_data = {}

    def locate(xpath):
        """Wait until an element matching *xpath* is present and return it."""
        return wait.until(EC.presence_of_element_located((By.XPATH, xpath)))

    # Focus the search box, type the company name, then click the search icon.
    locate(search_input_xpath).click()
    locate(search_input_xpath).send_keys('LINEAS ESCOLARES Y TURISMO S.A.S')
    locate('//i[@class="bi bi-search ps-2"]').click()

    # Read each labelled value from the result, in the order declared above.
    for field_name, field_xpath in field_xpaths.items():
        comp_data[field_name] = locate(field_xpath).text

    print(comp_data)
终端结果:
{'sigla': 'LIDERTUR S.A.S', 'identif_code': '800126471-1', 'inscrip_code': '38610', 'category': 'Sociedad ó persona juridica principal ó esal'}
Selenium 文档可以在这里找到:https://www.selenium.dev/documentation/