[嗨,我正在使用puppeteer和cheerio在我的网站上运行抓取工具。 Scrapper运行良好,可以正常运行所有操作,但是当我尝试返回这些数组时,它仅返回一个对象而不是返回所有对象,这是我的代码:
async function mainScraper(url) {
try {
const page = await browser.newPage();
await page.goto(url);
let html = await page.evaluate(() => document.body.innerHTML);
const $ = await cheerio.load(html);
const link = $('.business-name').map((i, elem) =>
"http://mysiteURL.com" + $(elem).attr("href")).get()
return link
} catch (e) {
console.error(e)
}
}
async function scrapeLinks(url, page) {
try {
let results = [];
await page.goto(url)
const html = await page.evaluate(() => document.body.innerHTML);
const $ = cheerio.load(html);
const title = $(".title").text();
const websiteURL = $(".link").attr("href");
let email = $(".email").attr("href");
const metadata = {
email: email,
websiteURL: websiteURL,
title: title,
}
results.push(metadata);
console.log(results)
return results
}
catch (e) {
console.log(e)
}
}
async function clientScraper(city, searchTerm, index) {
browser = await puppeteer.launch({ headless: false });
const linkPage = await browser.newPage();
for (i = 1; i <= index; i = index + 1) {
const list = await mainScraper(`http://mysiteURL.com/${city}/${searchTerm}&page=${index}`)
for (let i = 0; i < list.length; i++) {
// here when I return it gives only single value and looks like scrapper only scrape once and then stops
return await scrapeLinks(list[i], linkPage);
}
}
}
这可能是因为您在两个循环中都使用了变量i
,并且也可能存在闭包问题。尝试更新两个for循环:
for (let j = 1; j <= index; j = index + 1) {
const list = await mainScraper(`http://mysiteURL.com/${city}/${searchTerm}&page=${index}`)
for (let i = 0; i < list.length; i++) {
(function(x) {
let result = await scrapeLinks(list[x], linkPage);
console.log(result) // Debug to see if all results are returned or not
})(i)
}
}