这是我的代码:
<?php
// Bootstrap for the property-roll scraper: configures the runtime, defines the
// constants used by Parser (2captcha API key, log line delimiter) and runs one
// sample lookup.
set_time_limit(0);          // captcha solving can take minutes; disable the execution time limit
ob_implicit_flush();        // push progress messages out as soon as they are echoed
date_default_timezone_set('America/New_York');
define('CAPTCHA_APIKEY', 'XXXX');
// Must be double-quoted: '\n' in single quotes is a literal backslash followed
// by "n", so progress messages would never end with a real line break.
define('DELIM', "\n");
$links_arr = array();
$parser = new Parser();
//
#################################################################################
############################### START THE SCRIPT ################################
#################################################################################
// Arguments: page URL, short street address (diagnostics only), full address
// used to pick the matching entry in the site's address drop-down.
$carteville_page = $parser->fetchHtml(
    'https://www.ville.quebec.qc.ca/citoyens/taxes_evaluation/evaluation_fonciere/role/index.aspx?mslink=98001',
    '2951, Avenue D\'Entremont',
    '2951, Avenue D\'Entremont, app. 301, Québec (Sainte-Foy/Sillery/Cap-Rouge), Quartier Plateau, G1X 4N1'
);
#################################################################################
################################### RECAPTCHA ###################################
#################################################################################
/**
 * Scrapes a property page that sits behind an ASP.NET/Telerik AJAX form and a
 * reCAPTCHA, using the 2captcha service to obtain the captcha token.
 *
 * Relies on two constants defined by the including script:
 * CAPTCHA_APIKEY (2captcha API key) and DELIM (log line delimiter).
 * All methods report problems by echoing a message and returning false / an
 * empty string; none of them throws.
 */
class Parser
{
    /** How many captcha tasks are created before giving up on a page. */
    const CAPTCHA_MAX_ATTEMPTS = 3;
    /** How many times one task is polled before it is considered timed out. */
    const CAPTCHA_MAX_POLLS = 30;
    /** Seconds to wait before the first poll of a new task. */
    const CAPTCHA_FIRST_WAIT = 15;
    /** Seconds to wait between subsequent polls. */
    const CAPTCHA_POLL_WAIT = 5;

    /**
     * Fetches the page, selects the matching address (when the page offers an
     * address drop-down), solves the captcha and returns the final HTML.
     *
     * @param string      $fidUrl   URL of the property page.
     * @param string|null $address  Short address; only used in the "not found" message.
     * @param string|null $address2 Full address used to pick the drop-down option.
     * @return string|false Response body of the captcha post-back, or false on any failure.
     */
    public function fetchHtml($fidUrl, $address = null, $address2 = null)
    {
        echo "Url: $fidUrl\n";
        $host = "https://www.ville.quebec.qc.ca";

        $html = $this->getPageHtml($fidUrl);
        $xpath = $this->loadXPath($html);
        if (!$xpath) {
            echo("Empty response: $fidUrl");
            return false;
        }

        // The script-manager token (TSM) is embedded in the Telerik script resource.
        $telerikScriptItem = $xpath->query("//script[contains(@src, 'Telerik')]")->item(0);
        if (!$telerikScriptItem) {
            echo("Telerik script not found");
            return false;
        }
        $telerikUrl = $telerikScriptItem->getAttribute('src');
        if (!preg_match('/^http/', $telerikUrl)) {
            $telerikUrl = $host . (preg_match('/^\//', $telerikUrl) ? '' : '/') . $telerikUrl;
        }
        $telerikHtml = $this->getPageHtml($telerikUrl);
        if (!preg_match("/hf\.value\s*\+=\s*'(.+)'/U", $telerikHtml, $matches)) {
            echo("TSM parameter value not found");
            return false;
        }
        $tsm = trim($matches[1]);

        $headers = array(
            'X-Requested-With: XMLHttpRequest',
            'X-MicrosoftAjax: Delta=true',
            'Content-Type: application/x-www-form-urlencoded; charset=utf-8',
            "Origin: $host",
        );

        // Start from the form's current input values (view state etc.).
        $post = array();
        foreach ($xpath->query("//form[@id='aspnetForm'] // input") as $inputItem) {
            $inputName = $inputItem->getAttribute('name');
            if ($inputName === '') {
                continue; // nameless inputs are never submitted
            }
            $post[$inputName] = trim($inputItem->getAttribute('value'));
        }

        $addressItem = $xpath->query("//select[contains(@id, 'RechercheAdresse1_ddChoix')]")->item(0);
        if ($addressItem) {
            echo "address found\n";
            $address = trim(strtolower(preg_replace('/\s+/', ' ', (string) $address)));
            $addressValueId = $this->findAddressOptionValue($addressItem, (string) $address2);
            if ($addressValueId === false) {
                echo("Matching address not found: $address, $fidUrl");
                return false;
            }

            $postdata = $this->buildAddressPostdata($tsm, $post, $addressValueId);
            $html = $this->getPageHtml($fidUrl, $postdata, $headers);
            $xpath = $this->loadXPath($html);
            if (!$xpath) {
                echo("Empty response to address selection: $fidUrl");
                return false;
            }

            // Refresh the hidden fields from the response: both real <input>
            // elements and the "|hiddenField|name|value|" records in the body.
            foreach ($xpath->query("//input[@type='hidden']") as $input) {
                $inputName = $input->getAttribute('name');
                if ($inputName === '') {
                    continue;
                }
                $post[$inputName] = $input->getAttribute('value');
            }
            preg_match_all('/\|hiddenField\|(.+)\|(.*)\|/U', $html, $matches);
            foreach ($matches[1] as $fieldKey => $fieldName) {
                $post[trim($fieldName)] = trim($matches[2][$fieldKey]);
            }
        }

        $captchaKeyItem = $xpath->query("//div[@class='g-recaptcha'][@data-sitekey]")->item(0);
        if (!$captchaKeyItem) {
            // Keep the offending response next to the script for inspection.
            file_put_contents(__DIR__ . '/error.html', $html);
            echo("Captcha key item not found");
            return false;
        }
        $captchaSiteKey = $captchaKeyItem->getAttribute('data-sitekey');

        // 2captcha asks for the full URL of the page showing the captcha.
        $captchaResponse = $this->resolveCaptcha($captchaSiteKey, $fidUrl);
        if ($captchaResponse === false || $captchaResponse === '') {
            // Posting without a token can only yield the captcha page again.
            return false;
        }

        $post['ctl00_ctl00_contenu_texte_page_fichePropriete_rsm1_TSM'] = $tsm;
        $post['ctl00$ctl00$contenu$texte_page$fichePropriete$rsm1'] = 'ctl00$ctl00$contenu$texte_page$fichePropriete$ctl00$ctl00$contenu$texte_page$fichePropriete$pnlCaptchaPanel|ctl00$ctl00$contenu$texte_page$fichePropriete$btnCaptcha';
        $post['__EVENTTARGET'] = 'ctl00$ctl00$contenu$texte_page$fichePropriete$btnCaptcha';
        $post['__ASYNCPOST'] = 'true';
        $post['RadAJAXControlID'] = 'ctl00_ctl00_contenu_texte_page_fichePropriete_RadManAdresse';
        $post['g-recaptcha-response'] = $captchaResponse;
        unset($post['ctl00$ctl00$contenu$texte_page$fichePropriete$btnCaptcha']);
        unset($post['ctl00$ctl00$HautDePage1$Recherche$imglnkGo']);
        unset($post['ctl00$ctl00$HautDePageMobile1$Recherche$imglnkGo']);
        foreach (array_keys($post) as $fieldName) {
            if (strpos((string) $fieldName, 'RechercheAdresse') !== false) {
                unset($post[$fieldName]);
            }
        }

        // Field names are sent as-is (they contain "$"); only values are encoded.
        $pairs = array();
        foreach ($post as $fieldName => $fieldValue) {
            $pairs[] = $fieldName . '=' . urlencode($fieldValue);
        }
        return $this->getPageHtml($fidUrl, implode('&', $pairs), $headers);
    }

    /**
     * Obtains a reCAPTCHA token from 2captcha.
     *
     * Creates a task, waits, then polls for the result. An unsolvable or
     * timed-out task is retried with a fresh task, up to CAPTCHA_MAX_ATTEMPTS.
     *
     * @param string $siteKey    Value of the page's data-sitekey attribute.
     * @param string $websiteUrl URL of the page that shows the captcha.
     * @return string|false The token, or false when the service reports an
     *                      error or all attempts are used up.
     */
    public function resolveCaptcha($siteKey, $websiteUrl)
    {
        $apiBaseUrl = "https://2captcha.com";

        for ($attempt = 1; $attempt <= self::CAPTCHA_MAX_ATTEMPTS; $attempt++) {
            echo "Creating captcha task". DELIM;
            $task = array(
                'key' => CAPTCHA_APIKEY,
                'method' => 'userrecaptcha',
                'googlekey' => $siteKey,
                'pageurl' => $websiteUrl,
                'invisible' => 1,
            );
            $resp = $this->getPageHtml("$apiBaseUrl/in.php", http_build_query($task));
            $data = explode('|', $resp, 2);
            if (trim($data[0]) != 'OK' || !isset($data[1])) {
                echo("Wrong captcha status: $resp");
                return false;
            }
            $taskId = trim($data[1]);

            $resultUrl = "$apiBaseUrl/res.php?" . http_build_query(array(
                'key' => CAPTCHA_APIKEY,
                'action' => 'get',
                'id' => $taskId,
            ));
            $resp = '';
            for ($poll = 0; $poll < self::CAPTCHA_MAX_POLLS; $poll++) {
                sleep($poll === 0 ? self::CAPTCHA_FIRST_WAIT : self::CAPTCHA_POLL_WAIT);
                echo "Checking captcha solving result". DELIM;
                $resp = trim($this->getPageHtml($resultUrl));
                // An empty body means the poll request itself failed; keep
                // polling instead of throwing the pending task away.
                // "CAPCHA_NOT_READY" is the literal status string of the API.
                if ($resp !== '' && $resp !== 'CAPCHA_NOT_READY') {
                    break;
                }
            }
            if ($resp === '' || $resp === 'CAPCHA_NOT_READY') {
                echo "Captcha not solved in time, try again". DELIM;
                continue;
            }

            $data = explode('|', $resp, 2);
            $status = trim($data[0]);
            if ($status == 'ERROR_CAPTCHA_UNSOLVABLE') {
                echo "Unsolvable, try again". DELIM;
                continue;
            }
            if ($status != 'OK' || !isset($data[1])) {
                echo("Wrong captcha response: $resp");
                return false;
            }
            return trim($data[1]);
        }

        echo "Captcha not solved after ". self::CAPTCHA_MAX_ATTEMPTS. " attempts". DELIM;
        return false;
    }

    /**
     * Performs a GET (or a POST when $post is given) and returns the response.
     *
     * @param string       $url        Request URL.
     * @param string|false $post       URL-encoded request body; falsy for a GET.
     * @param array        $addHeaders Extra request header lines.
     * @param bool         $full       When true, return headers and body separately.
     * @return string|array Trimmed body, or array('header' => ..., 'body' => ...)
     *                      when $full is true. Both are empty strings when the
     *                      transfer failed.
     */
    public function getPageHtml($url, $post = false, array $addHeaders = array(), $full = false)
    {
        // No manual Accept-Encoding header: a custom one replaces the header
        // curl derives from CURLOPT_ENCODING, and advertising "br" invites
        // responses in an encoding that was not requested from curl below.
        $headers = array(
            'User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36',
            'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language: en-US,en;q=0.5',
            'Connection: keep-alive',
        );
        if (count($addHeaders)) {
            $headers = array_merge($headers, $addHeaders);
        }

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HEADER, 1);
        curl_setopt($ch, CURLOPT_TIMEOUT, 60);
        curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
        curl_setopt($ch, CURLOPT_ENCODING, 'gzip,deflate');
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        // NOTE(review): certificate checks are disabled, as in the original
        // script. This exposes the API key and page data to interception;
        // enable verification once a CA bundle is available.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
        if ($post) {
            curl_setopt($ch, CURLOPT_POST, 1);
            curl_setopt($ch, CURLOPT_POSTFIELDS, $post);
        }

        $raw = curl_exec($ch);
        if ($raw === false) {
            echo "Request failed: $url (". curl_error($ch). ")\n";
            curl_close($ch);
            return $full ? array('header' => '', 'body' => '') : '';
        }
        $headerSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
        curl_close($ch);

        $body = trim((string) substr($raw, $headerSize));
        if (!$full) {
            return $body;
        }
        return array('header' => trim((string) substr($raw, 0, $headerSize)), 'body' => $body);
    }

    /**
     * Parses an HTML string leniently and returns an XPath handle for it.
     *
     * @param string $html Markup to parse.
     * @return DOMXPath|null Null when $html is empty (DOMDocument::loadHTML
     *                       rejects an empty source on PHP 8).
     */
    private function loadXPath($html)
    {
        if (!is_string($html) || $html === '') {
            return null;
        }
        $previous = libxml_use_internal_errors(true); // collect markup warnings instead of emitting them
        $dom = new DOMDocument();
        $dom->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
        return new DOMXPath($dom);
    }

    /**
     * Picks the drop-down option that corresponds to the requested address.
     *
     * Options are tested in document order and the first hit wins. When both
     * the requested address and the option carry an apartment number
     * ("app. N"), only the apartment numbers are compared (dashes ignored,
     * either one may contain the other). Otherwise the option's text must
     * occur inside the requested address.
     *
     * @param DOMElement $select   The address <select> element.
     * @param string     $address2 Full requested address.
     * @return string|false The option's value attribute, or false when nothing matches.
     */
    private function findAddressOptionValue(DOMElement $select, $address2)
    {
        // Extract the apartment number once, from the unmodified address, so
        // the outcome does not depend on the order of the options.
        $wantedApp = '';
        if (preg_match("~app.(.*?),~", $address2, $m)) {
            $wantedApp = trim(str_replace("-", "", $m[1]));
        }
        $wantedAddress = $this->normalizeAddress($address2);

        foreach ($select->getElementsByTagName('option') as $option) {
            $value = $option->getAttribute('value');
            if ($value === '') {
                continue; // an option without a value cannot be submitted
            }
            $label = $option->nodeValue;

            if ($wantedApp !== '' && preg_match("~app.(.*?)[,(]~", $label, $m)) {
                $optionApp = trim(str_replace("-", "", $m[1]));
                if ($optionApp !== '') {
                    if (strpos($optionApp, $wantedApp) !== false || strpos($wantedApp, $optionApp) !== false) {
                        return $value;
                    }
                    continue;
                }
            }

            if ($this->addressContainsOption($wantedAddress, $label)) {
                return $value;
            }
        }
        return false;
    }

    /**
     * Tells whether an option's text occurs inside the requested address.
     *
     * The option text is stripped of any parenthesised part and of its last
     * character, then normalised. If it is not found as a whole, it is tried
     * again without its last comma-separated segment.
     *
     * @param string $normalizedAddress Requested address, already normalised.
     * @param string $optionLabel       Raw text of the option.
     * @return bool
     */
    private function addressContainsOption($normalizedAddress, $optionLabel)
    {
        $candidate = preg_replace("/\([^)]+\)/", "", $optionLabel); // remove text from the brackets
        $candidate = (string) substr($candidate, 0, -1);            // remove last character
        $candidate = $this->normalizeAddress($candidate);
        if ($candidate === '') {
            return false; // an empty needle would "match" every address
        }
        if (strpos($normalizedAddress, $candidate) !== false) {
            return true;
        }

        $lastComma = strrpos($candidate, ',');
        if ($lastComma === false || $lastComma === 0) {
            return false; // nothing left to compare after dropping the last segment
        }
        return strpos($normalizedAddress, substr($candidate, 0, $lastComma)) !== false;
    }

    /**
     * Normalises an address for comparison: rewrites "l'" as "l ' ", removes
     * dashes and lower-cases the result.
     *
     * NOTE(review): the second search/replace pair is a no-op as written;
     * presumably it was meant to collapse a double or non-breaking space —
     * confirm against the site's option texts.
     *
     * @param string $text
     * @return string
     */
    private function normalizeAddress($text)
    {
        return strtolower(str_replace(array("l'", " ", "-"), array("l ' ", " ", ""), (string) $text));
    }

    /**
     * Builds the request body of the AJAX post-back that submits the chosen
     * address option.
     *
     * @param string $tsm            Script-manager token taken from the Telerik script.
     * @param array  $post           Current form values; __VIEWSTATE and
     *                               __VIEWSTATEGENERATOR are read from it.
     * @param string $addressValueId Value of the selected address option.
     * @return string URL-encoded body ("name=value" pairs joined with "&").
     */
    private function buildAddressPostdata($tsm, array $post, $addressValueId)
    {
        $field = 'ctl00$ctl00$contenu$texte_page$fichePropriete$'; // prefix of control names
        $state = 'ctl00_ctl00_contenu_texte_page_fichePropriete_'; // prefix of control ids
        $viewState = isset($post['__VIEWSTATE']) ? $post['__VIEWSTATE'] : '';
        $viewStateGenerator = isset($post['__VIEWSTATEGENERATOR']) ? $post['__VIEWSTATEGENERATOR'] : '';

        $pairs = array(
            $field . 'rsm1=' . $field . $field . 'RechercheAdresse1$pnlAdressePanel|' . $field . 'RechercheAdresse1$btnChoix',
            $state . 'rsm1_TSM=' . $tsm,
            '__LASTFOCUS=',
            '__EVENTTARGET=' . $field . 'RechercheAdresse1$btnChoix',
            '__EVENTARGUMENT=',
            '__VIEWSTATE=' . urlencode($viewState),
            '__VIEWSTATEGENERATOR=' . urlencode($viewStateGenerator),
            'ctl00$ctl00$HautDePageMobile1$Recherche$txt_recherche=',
            'ctl00$ctl00$HautDePage1$Recherche$txt_recherche=',
            $state . 'RechercheAdresse1_RadTabStrip1_ClientState={"selectedIndexes":["0"],"logEntries":[],"scrollState":{}}',
            $field . 'RechercheAdresse1$txtNomRue=',
            $field . 'RechercheAdresse1$txtCodePostal=',
        );

        // Masked text boxes are sent with their empty mask plus a matching
        // client-state record.
        $maskedFields = array(
            'RadMaskedTextLot' => '0000000',
            'RmTextMatricule1' => '0000',
            'RmTextMatricule2' => '00',
            'RmTextMatricule3' => '0000',
            'RmTextMatricule4' => '0',
            'RmTextMatricule5' => '000',
            'RmTextMatricule6' => '0000',
        );
        foreach ($maskedFields as $name => $mask) {
            $pairs[] = $field . 'RechercheAdresse1$' . $name . '=' . $mask;
            $pairs[] = $state . 'RechercheAdresse1_' . $name . '_ClientState='
                . '{"enabled":true,"emptyMessage":"","validationText":"' . $mask
                . '","valueAsString":"' . $mask
                . '","valueWithPromptAndLiterals":"' . $mask
                . '","lastSetTextBoxValue":"' . $mask . '"}';
        }

        $pairs[] = $state . 'RechercheAdresse1_RadMultiPage1_ClientState=';
        $pairs[] = $field . 'RechercheAdresse1$ddChoix=' . urlencode($addressValueId);
        $pairs[] = '__ASYNCPOST=true';
        $pairs[] = 'RadAJAXControlID=' . $state . 'RadManAdresse';

        return implode('&', $pairs);
    }
}
Parser 类中有一段来自 2captcha 网站的代码,我用它来解决示例中那个网站上的验证码。逐个链接单独测试时,它运行良好。但是,当我在脚本的 foreach 循环中遍历 3000 个链接时,问题就出现了:大约有 25% 的链接无法解出验证码,因而也无法获取页面源代码。25% 对我来说是个很大的比例,有人对此有什么建议吗?
几个问题:1)您的脚本是如何运行的:单线程还是多线程?您可能已达到 2captcha.com 的请求数量上限,导致其中一些请求被拒绝。2)您得到的是什么错误?是 2captcha 服务返回的错误,还是您自己的代码出错?