我想从 Google 搜索结果页(SERP)中提取结果数量,但响应的 HTML 源代码中不包含任何结果数量。我搜索了许多与我的问题类似的帖子,但它们都是很久以前发布的,没有帮助。
我需要使用 JS 渲染才能达到目的吗?谢谢。
import httpx
import asyncio
async def main(keyword: str):
    """Fetch the Google search page for ``keyword`` and return its HTML as text.

    Args:
        keyword: The search query. It is passed through ``params`` so that
            spaces, ``&``, ``#`` and non-ASCII characters are URL-encoded
            instead of corrupting the query string.

    Returns:
        The response body decoded to ``str``.
    """
    # Presumably Google returns different markup to clients that do not look
    # like a browser; a browser-like User-Agent is sent for that reason.
    # NOTE(review): confirm the returned page contains the expected elements.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/605.1.15 (KHTML, like Gecko) "
            "Version/17.4.1 Safari/605.1.15"
        ),
    }
    async with httpx.AsyncClient(headers=headers) as client:
        response = await client.get(
            "https://www.google.com/search", params={"q": keyword}
        )
        # ``response.text`` decodes with the charset the server declares,
        # rather than forcing latin1 and garbling UTF-8 pages.
        return response.text
但是你的 html 响应是什么样的?
您的请求是否使用了 User-Agent HTTP 标头?
你的项目必须使用 Python 吗?
不久前我用 JS(Node.js)写过一个类似的爬虫,
请看其中的 `_fetchUrl()` 函数:
import { writeFile } from "node:fs";
// Module-level accumulator: _saveResult pushes the entries of every fetched page into it.
const SPECIAL_OFFERS = [];
/**
 * Appends every entry of `payload["results"]` to the module-level
 * `SPECIAL_OFFERS` array, preserving order.
 *
 * @param payload Parsed response object; its `results` property must be iterable.
 */
function _saveResult(payload) {
  const { results } = payload;
  for (const offer of results) SPECIAL_OFFERS.push(offer);
}
/**
 * Builds the URL to request for a given page.
 *
 * The URL here is a fixed placeholder; `pageNumber` is accepted but is not
 * interpolated into the returned string.
 *
 * @param pageNumber Page to request (defaults to 1).
 * @returns The request URL.
 */
function prepareUrl(pageNumber = 1) {
  return `secretUrl`;
}
/**
 * Fetches one page of results, stores its entries via `_saveResult`, and
 * reports which page should be requested next.
 *
 * @param {number} pageNumber Page to request; forwarded to `prepareUrl`.
 * @returns {Promise<number>} The page number parsed from the response's
 *   `next` URL, or 0 when the response has no `next` URL (callers treat a
 *   falsy value as "finished").
 * @throws {Error} If the server answers with a non-2xx status, or if the
 *   `next` URL carries no `page=<digits>` query parameter.
 */
async function _getPage(pageNumber) {
  const url = prepareUrl(pageNumber);

  // Performs the HTTP request for `url` and resolves to the next page number.
  async function _fetchUrl() {
    const response = await fetch(url, {
      method: "GET",
      cache: "default",
      credentials: "include",
      headers: {
        Accept: "application/json, text/plain, */*",
        "Accept-Language": "en-GB,en;q=0.9",
        // Browser-like User-Agent so the request resembles a normal browser.
        "User-Agent":
          "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4.1 Safari/605.1.15",
      },
    });

    // Fail with context instead of trying to JSON-parse an error page.
    if (!response.ok) {
      throw new Error(`Request for ${url} failed with HTTP ${response.status}`);
    }

    const requestResult = await response.json();
    _saveResult(requestResult);

    const nextPageUrl = requestResult["next"];
    if (!nextPageUrl) {
      console.log("Done");
      return 0;
    }

    // Accept `page` anywhere in the query string, including as the first or
    // last parameter, not only when it sits between two `&`.
    const match = /[?&]page=(\d+)/.exec(nextPageUrl);
    if (!match) {
      throw new Error(`Cannot find a page number in next URL: ${nextPageUrl}`);
    }
    return Number(match[1]);
  }

  return _fetchUrl();
}
/**
 * Walks the paginated listing starting at page 1, following the page number
 * returned by `_getPage` until it is falsy, then writes everything collected
 * in `SPECIAL_OFFERS` to `result.txt` as tab-indented JSON.
 *
 * The file write is callback-based and is not awaited; a write error is
 * logged with `console.error`.
 */
async function getSpecialOffers() {
  for (let page = 1; page; ) {
    console.log("Getting special offers on page", page);
    page = await _getPage(page);
  }

  console.log("Total special offers:", SPECIAL_OFFERS.length);

  const serialized = JSON.stringify(SPECIAL_OFFERS, null, "\t");
  writeFile("result.txt", serialized, (error) => {
    if (error) console.error(error);
  });
}
// Entry point: run the crawl. Top-level `await` is only valid when this file is loaded as an ES module.
await getSpecialOffers();