我正在尝试通过 Python 自动化网络搜索。该网站受 hCaptcha 保护,但我使用了 2captcha 求解服务。虽然我已经模拟了浏览器的行为,但网站仍然要求我再次完成 hCaptcha 验证。以下是我尝试过的代码:
import httpx
import trio
from twocaptcha import TwoCaptcha
# Browser-like headers attached to every request made through the client.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0',
    'Referer': 'https://iapps.courts.state.ny.us/nyscef/CaseSearch?TAB=courtDateRange',
    'Origin': 'https://iapps.courts.state.ny.us',
}

# Key handed to the TwoCaptcha client (presumably a redacted placeholder here).
API_KEY = 'hidden'
async def solve_captcha():
    """Solve the site's hCaptcha through 2captcha and return the token.

    The ``TwoCaptcha.hcaptcha`` call is synchronous, so it is executed in a
    worker thread via ``trio.to_thread.run_sync``; calling it directly from
    this coroutine would stall the trio event loop for the whole solve.

    Returns:
        The value stored under ``'code'`` in the solver's result.
    """
    solver = TwoCaptcha(API_KEY)
    result = await trio.to_thread.run_sync(
        # run_sync only forwards positional arguments, hence the lambda.
        lambda: solver.hcaptcha(
            sitekey='600d5d8e-5e97-4059-9fd8-373c17f73d11',
            url='https://iapps.courts.state.ny.us/',
        )
    )
    return result['code']
async def main():
    """Submit the court/date search with a solved hCaptcha token.

    Sends an initial request to the search page, obtains a captcha token
    from ``solve_captcha``, posts the search form with that token, and
    writes the response body to ``r.html``.
    """
    async with httpx.AsyncClient(
        base_url='https://iapps.courts.state.ny.us/nyscef/',
        headers=headers,
        follow_redirects=True,
    ) as client:
        # The response of this first request is not inspected.
        # NOTE(review): presumably it only primes session cookies; a real
        # browser would issue a GET here -- confirm POST is intended.
        await client.post('CaseSearch?TAB=courtDateRange')
        print('[*] - Solving CAPTCHA!')
        cap = await solve_captcha()
        print('[*] - CAPTCHA Solved')
        # Court: Chautauqua County Supreme Court
        data = {
            'selCountyCourt': '4667226',
            'txtFilingDate': '02/14/2024',
            # The same token is sent under both field names.
            'g-recaptcha-response': cap,
            'h-captcha-response': cap,
            'btnSubmit': 'Search',
        }
        r = await client.post('https://iapps.courts.state.ny.us/nyscef/CaseSearch?TAB=courtDateRange', data=data)
        # Explicit encoding: the platform default may not be able to
        # represent every character of the decoded response text.
        with open('r.html', 'w', encoding='utf-8') as f:
            f.write(r.text)
# Script entry point: drive the async workflow with trio's runner.
if __name__ == "__main__":
    trio.run(main)
正如我在上面的评论中所说,我调整了您的代码,使其在验证码出现时重复求解。连续解决了 10 个验证码之后,我认为该网站已经察觉到我在抓取数据,因此会无限地弹出验证码。
也许存在某些更贴近您原有代码、且不会被检测到的方法,但我没能找到,我尝试过的方法也都不起作用。我能找到的唯一解决方案是使用 Selenium,并且需要 undetected_chromedriver。为了不让您额外安装其他驱动程序,我尝试了许多其他方法,但只有这一种可行。该驱动程序是开源的,可在此处获取。它可以通过以下命令安装:
pip install undetected-chromedriver
这是抓取您想要的页面的代码:
import undetected_chromedriver as uc
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
import random
def main():
    """Run the case search in an undetected Chrome session and save the page.

    Opens the search page, selects the court in the ``selCountyCourt``
    dropdown, types the filing date into ``txtFilingDate``, submits with
    Enter, and writes the resulting page source to ``page.html``. Random
    pauses are inserted between the steps.

    The browser is always shut down, even when a step raises, so a failed
    run does not leave a Chrome process behind.
    """
    options = uc.ChromeOptions()
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36")
    driver = uc.Chrome(options=options)
    try:
        url = 'https://iapps.courts.state.ny.us/nyscef/CaseSearch?TAB=courtDateRange'
        driver.get(url)
        time.sleep(random.uniform(1, 5))

        # Wait (up to 20 s) until the court dropdown is visible.
        WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.ID, 'selCountyCourt')))
        time.sleep(random.uniform(1, 5))  # human-like wait time

        dropdown = Select(driver.find_element(By.ID, 'selCountyCourt'))
        dropdown.select_by_value('4667226')  # value for 'Chautauqua County Supreme Court'
        time.sleep(random.uniform(1, 5))

        date_input = driver.find_element(By.ID, 'txtFilingDate')
        date_input.send_keys('02/14/2024')  # value of the desired date
        time.sleep(random.uniform(1, 5))
        date_input.send_keys(Keys.ENTER)

        # Fixed pause before the page source is captured.
        time.sleep(3)

        # save the html
        with open('page.html', 'w', encoding='utf-8') as f:
            f.write(driver.page_source)
    finally:
        driver.quit()
# Script entry point: run the Selenium workflow when executed directly.
if __name__ == "__main__":
    main()
这会将所需页面的完整 html 保存到“page.html”。
注意:如果一开始收到 Chromedriver 版本不受支持的错误,请关闭浏览器,并在浏览器未打开的情况下重新运行该模块。
如果您想运行之前的代码,亲眼看到它无限地遇到验证码,下面是我用来确认这一点的代码:
import httpx
import trio
import random
from twocaptcha import TwoCaptcha
from bs4 import BeautifulSoup
# Browser-like headers attached to every request made through the client.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0',
    'Referer': 'https://iapps.courts.state.ny.us/nyscef/CaseSearch?TAB=courtDateRange',
    'Origin': 'https://iapps.courts.state.ny.us',
}

# Key handed to the TwoCaptcha client (presumably a redacted placeholder here).
API_KEY = 'hidden'
async def solve_captcha(sitekey, page_url, attempts=3):
    """Ask 2captcha for an hCaptcha token, retrying when a request fails.

    Args:
        sitekey: hCaptcha site key passed to the solver.
        page_url: URL passed to the solver as the captcha's page.
        attempts: Maximum number of solve requests to make.

    Returns:
        The ``'code'`` entry of the solver's result, or ``None`` when every
        attempt raised.
    """
    solver = TwoCaptcha(API_KEY)

    def _request_token():
        # Synchronous solver call; executed in a worker thread below.
        return solver.hcaptcha(sitekey=sitekey, url=page_url, timeout=180)

    for attempt in range(attempts):
        try:
            answer = await trio.to_thread.run_sync(_request_token)
            return answer['code']
        except Exception as e:
            print(f"attempt {attempt + 1}: error solving captcha: {e}")
            out_of_retries = attempt >= attempts - 1
            if not out_of_retries:
                await trio.sleep(10)  # wait 10 seconds before retrying
    return None
async def main():
    """Post the search form repeatedly, solving each hCaptcha that comes back.

    Every iteration posts the form (with the latest captcha token, or
    ``None`` on the first pass) and looks for a ``div.h-captcha`` element in
    the reply. When one is found its ``data-sitekey`` is solved through
    ``solve_captcha`` and the form is posted again; the loop ends on the
    first reply without a captcha, or returns early when solving fails.
    The final reply is written to ``r.html`` when its status is 200.

    The loop is intentionally unbounded: it keeps going for as long as the
    site keeps answering with a captcha.
    """
    async with httpx.AsyncClient(headers=headers, follow_redirects=True) as client:
        page_url = 'https://iapps.courts.state.ny.us/nyscef/CaseSearch?TAB=courtDateRange'
        await trio.sleep(random.uniform(1, 3))  # mimic real waiting period
        cap_solution = None
        attempts = 0
        while True:  # start captcha loop
            attempts += 1
            print(f"attempt # {attempts} to access the page")
            data = {
                'selCountyCourt': '4667226',
                'txtFilingDate': '02/14/2024',
                'h-captcha-response': cap_solution,
            }
            response = await client.post(page_url, data=data)

            # check if captcha is on page
            soup = BeautifulSoup(response.text, 'html.parser')
            captcha_div = soup.find('div', class_='h-captcha')
            if not captcha_div:
                # if no captcha is found, break the loop
                break

            print("captcha found, solving...")
            new_sitekey = captcha_div['data-sitekey']
            cap_solution = await solve_captcha(new_sitekey, page_url)
            if not cap_solution:
                print('faled to solve captcha')
                return
            # if solved, the loop will continue and use the new solution
            print('captcha solved, retrying...')
            await trio.sleep(random.uniform(2, 5))

        if response.status_code == 200:  # save page if loop is broken
            # Explicit encoding: the platform default may not be able to
            # represent every character of the decoded response text.
            with open('r.html', 'w', encoding='utf-8') as f:
                f.write(response.text)
            print("all captchas completed, response saved.")
        else:
            print("failed")
# Script entry point: drive the async captcha loop with trio's runner.
if __name__ == "__main__":
    trio.run(main)