使用 2captcha 解决 hCaptcha,实现网站搜索自动化(Python)

问题描述 投票:0回答:1

我正在尝试通过 Python 自动化网络搜索。

该网站受 `hCaptcha` 保护,不过我正在使用 `2captcha` 求解器。

虽然我已经复现了浏览器的行为,但仍然被要求再次解决 `hCaptcha`。

这是我尝试过的:

import httpx
import trio
from twocaptcha import TwoCaptcha


# Headers attached to every request: a desktop Firefox user agent plus the
# Referer/Origin values pointing at the case-search page.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0",
    "Referer": "https://iapps.courts.state.ny.us/nyscef/CaseSearch?TAB=courtDateRange",
    "Origin": "https://iapps.courts.state.ny.us",
}


# Key handed to the TwoCaptcha client; 'hidden' is a redacted placeholder.
API_KEY = "hidden"


async def solve_captcha():
    """Ask 2captcha for an hCaptcha solution and return the response token.

    ``solver.hcaptcha`` is a synchronous call, so it is executed in a worker
    thread; calling it directly inside this coroutine would block the trio
    event loop for as long as the solver takes to answer.

    Returns:
        The value stored under ``'code'`` in the solver's result.
    """
    solver = TwoCaptcha(API_KEY)

    def _request_solution():
        # Runs in a worker thread (see trio.to_thread.run_sync below).
        return solver.hcaptcha(
            sitekey='600d5d8e-5e97-4059-9fd8-373c17f73d11',
            url='https://iapps.courts.state.ny.us/',
        )

    solution = await trio.to_thread.run_sync(_request_solution)
    return solution['code']


async def main():
    """Submit one court-date-range case search and save the response HTML.

    Opens an httpx session against the NYSCEF site, obtains an hCaptcha token
    from ``solve_captcha``, posts the search form with that token, and writes
    the returned page to ``r.html``.
    """
    search_path = 'CaseSearch?TAB=courtDateRange'
    async with httpx.AsyncClient(
        base_url='https://iapps.courts.state.ny.us/nyscef/',
        headers=headers,
        follow_redirects=True,
    ) as client:
        # Initial request made before the real submission; its response is
        # not inspected. NOTE(review): presumably this is only meant to let
        # the session pick up cookies -- confirm.
        await client.post(search_path)
        print('[*] - Solving CAPTCHA!')
        cap = await solve_captcha()
        print('[*] - CAPTCHA Solved')
        # Court: Chautauqua County Supreme Court
        data = {
            'selCountyCourt': '4667226',
            'txtFilingDate': '02/14/2024',
            # The same token is sent under both field names.
            'g-recaptcha-response': cap,
            'h-captcha-response': cap,
            'btnSubmit': 'Search',
        }
        r = await client.post(search_path, data=data)
        # Write as UTF-8 explicitly: relying on the platform default encoding
        # can raise UnicodeEncodeError for characters it cannot represent.
        with open('r.html', 'w', encoding='utf-8') as f:
            f.write(r.text)

# Script entry point: run the async main() under trio when executed directly.
if __name__ == "__main__":
    trio.run(main)
python beautifulsoup python-requests httpx 2captcha
1个回答
0
投票

正如我在上面的评论中所说,我调整了您的代码,使其在验证码出现时反复求解。连续解决 10 个验证码之后,我认为该网站已经察觉到我在抓取数据,因此会无限地继续弹出验证码。

虽然可能存在一些更贴近您原有代码、又能避免被检测到的方法,但我没能找到,我尝试过的方法也都不起作用。我能找到的唯一解决方案使用 `Selenium`,并且需要 `undetected_chromedriver`。为了不要求您安装额外的驱动程序,我尝试了许多其他方法,但这是唯一可行的方法。该驱动程序是开源的,可在此处(here)获取。它可以通过以下方式安装:

pip install undetected-chromedriver

这是抓取您想要的页面的代码:

import undetected_chromedriver as uc
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
import random

def main():
    """Run the court-date-range case search in Chrome and save the page.

    Starts Chrome through undetected_chromedriver, selects the court, types
    the filing date, submits the form with ENTER, and writes the resulting
    page source to ``page.html``. Randomised pauses are inserted between the
    steps so the interaction looks less like automation.
    """
    options = uc.ChromeOptions()
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36")

    driver = uc.Chrome(options=options)
    # Always shut the browser down, even when a wait times out or an element
    # lookup fails; otherwise the Chrome process would be left running.
    try:
        url = 'https://iapps.courts.state.ny.us/nyscef/CaseSearch?TAB=courtDateRange'
        driver.get(url)
        time.sleep(random.uniform(1, 5))

        # Wait (up to 20 s) until the court dropdown is visible.
        WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.ID, 'selCountyCourt')))
        time.sleep(random.uniform(1, 5))  # human-like wait time

        dropdown = Select(driver.find_element(By.ID, 'selCountyCourt'))
        dropdown.select_by_value('4667226')  # value for 'Chautauqua County Supreme Court'
        time.sleep(random.uniform(1, 5))
        date_input = driver.find_element(By.ID, 'txtFilingDate')
        date_input.send_keys('02/14/2024')  # value of the desired date
        time.sleep(random.uniform(1, 5))
        date_input.send_keys(Keys.ENTER)  # submit the form
        time.sleep(3)  # fixed pause before reading the page after submitting
        # save the html
        with open('page.html', 'w', encoding='utf-8') as f:
            f.write(driver.page_source)
    finally:
        driver.quit()

# Script entry point: run main() only when the file is executed directly.
if __name__ == "__main__":
    main()

这会将所需页面的完整 html 保存到“page.html”。

注意:如果您最初收到有关 Chromedriver 版本不受支持的错误,请关闭浏览器并在不打开浏览器的情况下运行该模块。

如果您想运行之前的代码,亲眼看到它无限地弹出验证码,下面是我用来确认这一点的代码:

import httpx
import trio
import random
from twocaptcha import TwoCaptcha
from bs4 import BeautifulSoup

# Headers attached to every request: a desktop Firefox user agent plus the
# Referer/Origin values pointing at the case-search page.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0",
    "Referer": "https://iapps.courts.state.ny.us/nyscef/CaseSearch?TAB=courtDateRange",
    "Origin": "https://iapps.courts.state.ny.us",
}

# Key handed to the TwoCaptcha client; 'hidden' is a redacted placeholder.
API_KEY = "hidden"

async def solve_captcha(sitekey, page_url, attempts=3):
    """Try to obtain an hCaptcha token from 2captcha, retrying on errors.

    Args:
        sitekey: hCaptcha site key passed to the solver.
        page_url: URL passed to the solver as the page hosting the captcha.
        attempts: Maximum number of solve attempts.

    Returns:
        The ``'code'`` entry of the first successful solver result, or
        ``None`` when every attempt raised an exception.
    """
    client = TwoCaptcha(API_KEY)

    def request_token():
        # Synchronous solver call; executed in a worker thread below so the
        # trio event loop is not blocked while waiting.
        return client.hcaptcha(sitekey=sitekey, url=page_url, timeout=180)

    final_attempt = attempts - 1
    for attempt in range(attempts):
        try:
            answer = await trio.to_thread.run_sync(request_token)
            return answer['code']
        except Exception as e:
            print(f"attempt {attempt + 1}: error solving captcha: {e}")
            if attempt == final_attempt:
                # No retries left, so skip the pause and give up.
                break
            await trio.sleep(10)  # wait 10 seconds before retrying
    return None

async def main():
    """Post the case search repeatedly, solving each hCaptcha that comes back.

    Every iteration submits the search form (with the most recent captcha
    token, or ``None`` on the first pass) and inspects the response for a
    ``div.h-captcha`` element. While one is present, its site key is sent to
    ``solve_captcha`` and the form is submitted again. The loop has no upper
    bound on iterations; it ends only when a response contains no captcha or
    when solving fails. The final response is saved to ``r.html`` if its
    status code is 200.
    """
    async with httpx.AsyncClient(headers=headers, follow_redirects=True) as client:
        page_url = 'https://iapps.courts.state.ny.us/nyscef/CaseSearch?TAB=courtDateRange'
        await trio.sleep(random.uniform(1, 3))  # mimic real waiting period

        cap_solution = None
        attempts = 0

        while True:  # start captcha loop
            attempts += 1
            print(f"attempt # {attempts} to access the page")

            data = {
                'selCountyCourt': '4667226',
                'txtFilingDate': '02/14/2024',
                'h-captcha-response': cap_solution
            }
            response = await client.post(page_url, data=data)

            # check if captcha is on page
            soup = BeautifulSoup(response.text, 'html.parser')
            captcha_div = soup.find('div', class_='h-captcha')
            if not captcha_div:
                # if no captcha is found, break the loop
                break

            print("captcha found, solving...")
            new_sitekey = captcha_div['data-sitekey']
            cap_solution = await solve_captcha(new_sitekey, page_url)
            if not cap_solution:
                print('failed to solve captcha')
                return

            # if solved, the loop will continue and use the new solution
            print('captcha solved, retrying...')
            await trio.sleep(random.uniform(2, 5))

        if response.status_code == 200:  # save page if loop is broken
            # Write as UTF-8 explicitly: relying on the platform default
            # encoding can raise UnicodeEncodeError for unsupported characters.
            with open('r.html', 'w', encoding='utf-8') as f:
                f.write(response.text)
            print("all captchas completed, response saved.")
        else:
            print("failed")

# Script entry point: run the async main() under trio when executed directly.
if __name__ == "__main__":
    trio.run(main)
© www.soinside.com 2019 - 2024. All rights reserved.