我目前正在使用
requests
库和 BeautifulSoup
来编写 Python 脚本以进行网页抓取。目标是从网站“https://letmepost.com/check-da-pa”检索特定信息,特别是 DA(域权限)、PA(页面权限)、垃圾邮件分数、年龄、过期时间、以及给定域的 IP 地址。然而,尽管成功获得响应,但检索到的数据并不符合预期。
我已经分享了我的代码,其中包括处理 X-CSRF-TOKEN 并发送带有必要负载的 POST 请求。然而,我收到的输出只是域名“cnn.com”,而不是预期的结构化数据,其中包括 DA、PA、垃圾邮件分数、年龄、过期时间和 IP。
import requests
from bs4 import BeautifulSoup
import re
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
session = requests.Session()
r = session.get('https://letmepost.com/check-da-pa', headers=headers)
soup = BeautifulSoup(r.content, 'html.parser')
script_tag = soup.find("script", string=lambda text: text and "X-CSRF-TOKEN" in text)
if script_tag:
token_pattern = r"'X-CSRF-TOKEN': '(.*?)'"
csrf_token_match = re.search(token_pattern, script_tag.string)
if csrf_token_match:
csrf_token = csrf_token_match.group(1)
print(f"CSRF Token: {csrf_token}")
else:
print("CSRF Token pattern not found in script content.")
else:
print("Script tag with 'X-CSRF-TOKEN' not found.")
data = "prothomalo.com"
data_payload = {
"domains ": data,
"X-CSRF-TOKEN": csrf_token
}
# Existing data_payload
data_payload = {
"X-CSRF-TOKEN": csrf_token
}
# Adding new data to data_payload
data_payload["listen"] = "4OdeyKA6Sku1ff86NqkAibxXGf4kfOQUcbepB84U"
data_payload["secret"] = "PNxcg1LI9331xX7"
data_payload["urls"] = "prothomalo.com"
data_payload["fromBrowser"] = "1705462709623"
data_payload["timestamp"] = "1705462686"
data_payload["direct"] = True # Assuming you want to add a boolean False, not the string 'false'
# The data_payload now contains the additional data
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"X-CSRF-TOKEN": csrf_token
}
url = "https://letmepost.com/check-da-pa"
response = session.post(url, data=data_payload, headers=headers)
print(response.status_code)
import pandas as pd
result = BeautifulSoup(response.content, 'html.parser')
print(result)
正如其他人所说,该页面受 Cloudflare 和(随机)ReCaptcha 保护。
不知道如何在 Python 中做到这一点,但是在 PHP 中
<?php
declare(strict_types=1);
use function var_dump as d;
require_once __DIR__ . '/vendor/autoload.php';
$chromiumPath = '/usr/bin/chromium-browser';
function dd(...$args): void
{
$trace = debug_backtrace(0, 2);
$file = $trace[0]['file'];
$line = $trace[0]['line'];
echo "dd {$file}:{$line}: \n";
var_dump(...$args);
die();
}
$factory = new \HeadlessChromium\BrowserFactory($chromiumPath);
$browser = $factory->createBrowser(
[
// 'headless' => false,
]
);
$page = $browser->createPage();
$page->navigate("https://letmepost.com/check-da-pa")->waitForNavigation();
$page->evaluate('document.querySelector("#urls").value = ' . json_encode('cnn.com'));
$page->mouse()->find('#btnTransferDomain')->click();
// <tbody id="bulk-da-pa-checker-results"><tr><th scope="row">1</th><td class="td-wrap"><a href="http://cnn.com" target="_blank">cnn.com</a><a></a></td><td>95</td><td>82</td><td>1</td><td>30Y, 128D</td><td>2027-09-21</td><td>151.101.67.5</td></tr></tbody>
for(;;){
$results = $page->evaluate('document.querySelector("#bulk-da-pa-checker-results")?.innerHTML')->getReturnValue();
if($results && !str_contains($results, 'No Data yet') && !str_contains($results, 'Loading...')){
break;
}
sleep(1);
try{
$recaptcha = $page->mouse()->find("#recaptcha-anchor > div.recaptcha-checkbox-border");
} catch (\Exception $e){
$recaptcha = null;
}
if(!$recaptcha){
try{
$recaptcha = $page->mouse()->findElement(
// XPathSelector
new HeadlessChromium\Dom\Selector\XPathSelector(
'//iframe[@title="reCAPTCHA"]'
),
);
} catch (\Exception $e){
$recaptcha = null;
}
}
if($recaptcha){
$recaptcha->click();
sleep(1);
$page->mouse()->find('#btnTransferDomain')->click();
}
}
// <table class="table-font table table-hover table-fixed"> <thead> <tr> <th>#</th> <th class="sort" data-sort="u" data-mode="up">Domain <i class="fas fa-chevron-down inactive sort-obj"></i></th> <th class="sort" data-sort="d" data-mode="up">DA <i class="fas fa-chevron-down inactive sort-obj"></i></th> <th>PA</th> <th class="sort" data-sort="s" data-mode="up">Spam <i class="fas fa-chevron-down inactive sort-obj"></i></th> <th>Age</th> <th class="sort" data-sort="de" data-mode="up">Expiring At <i class="fas fa-chevron-down inactive sort-obj"></i></th> <th class="sort" data-sort="i" data-mode="up">IP <i class="fas fa-chevron-down inactive sort-obj"></i></th> </tr> </thead> <tbody id="bulk-da-pa-checker-results"><tr><th scope="row">1</th><td class="td-wrap"><a href="http://cnn.com" target="_blank">cnn.com</a><a></a></td><td>95</td><td>82</td><td>1</td><td>30Y, 128D</td><td>2027-09-21</td><td>151.101.195.5</td></tr></tbody> </table>
$dom = new DOMDocument();
@$dom->loadHTML($results);
$tds = $dom->getElementsByTagName('td');
$da = $tds[1]->textContent;
$pa = $tds[2]->textContent;
$spam = $tds[3]->textContent;
$age = $tds[4]->textContent;
$expires = $tds[5]->textContent;
$ip = $tds[6]->textContent;
$results = compact('da', 'pa', 'spam', 'age', 'expires', 'ip');
d($results);
可进行前 20 次左右的调用并打印
hans@DESKTOP-EE15SLU:~/projects/misc$ time php test2.php
array(6) {
["da"]=>
string(2) "95"
["pa"]=>
string(2) "82"
["spam"]=>
string(1) "1"
["age"]=>
string(9) "30Y, 128D"
["expires"]=>
string(10) "2027-09-21"
["ip"]=>
string(11) "151.101.3.5"
}
^C
real 0m10.008s
user 0m0.029s
sys 0m0.000s
(是的,为了击败前几个 reCaptcha 挑战,随机 click() 实际上可以解决问题!)
并且..有方法可以绕过它,我现在放弃了。