我正在尝试创建一个 Puppeteer 应用程序,它可以打开一个动态创建的网站 (landa.com),找到并点击所有缩略图,并从点击功能将 Puppeteer 带到的页面中抓取信息。
没有 URL 可以抓取到数组中以允许我执行
page.goto()
,这会使事情变得更简单。我不想手动创建这个数组,而是想让 Puppeteer 通过单击所有缩略图来打开所有缩略图。
到目前为止我已经成功了:
我正在挣扎的是:
waitFor
它都会超时。我正在寻找一些指导,以了解这是否是处理此类网站的最佳方法,或者我是否过度复杂化了它。
有没有办法获取点击后生成的 URL,因为它不在 HTML 中?
基本上我尝试通过这样做从我用
page.click()
打开的页面中获取信息:
输入
const puppeteer = require('puppeteer');
async function autoScroll(page){
await page.evaluate(async () => {
await new Promise((resolve) => {
var totalHeight = 0;
var distance = 100;
var timer = setInterval(() => {
var scrollHeight = document.body.scrollHeight;
window.scrollBy(0, distance);
totalHeight += distance;
if(totalHeight >= scrollHeight - window.innerHeight){
clearInterval(timer);
resolve();
}
}, 100);
});
});
}
async function clickElements(url) {
const browser = await puppeteer.launch({headless: 'false'});
const page = await browser.newPage();
await page.goto(url);
await autoScroll(page);
const elHandleArray = await page.$$('.PropertyItem_propertyItem__Fla4I');
console.log(elHandleArray);
for (const el of elHandleArray) {
await el.evaluate(e => e.click());
await page.waitForSelector('#APP_CONTAINER > div > main > div.DrawerLayout_container__vO7gb > div._address__container___iyko > div._address__content__8pvIa > ul > div.BasePropertySection_container__qchvD.PropertyHero_container__sFUC2 > div.PropertyHero_title__zaro0 > div.HeroTitle_topContainer__9RhDH > div');
var el2 = await page.$('#APP_CONTAINER > div > main > div.DrawerLayout_container__vO7gb > div._address__container___iyko > div._address__content__8pvIa > ul > div.BasePropertySection_container__qchvD.PropertyHero_container__sFUC2 > div.PropertyHero_title__zaro0 > div.HeroTitle_topContainer__9RhDH > div');
var txt = await el2.getProperty('textContent');
var name = await txt.jsonValue();
console.log(name);
}
await browser.close();
}
clickElements('https://landa.app/m/feed');
输出
[
CDPElementHandle {}, CDPElementHandle {}, CDPElementHandle {},
CDPElementHandle {}, CDPElementHandle {}, CDPElementHandle {},
CDPElementHandle {}, CDPElementHandle {}, CDPElementHandle {},
CDPElementHandle {}, CDPElementHandle {}, CDPElementHandle {},
CDPElementHandle {}, CDPElementHandle {}, CDPElementHandle {},
CDPElementHandle {}, CDPElementHandle {}, CDPElementHandle {},
CDPElementHandle {}, CDPElementHandle {}, CDPElementHandle {},
CDPElementHandle {}, CDPElementHandle {}, CDPElementHandle {},
CDPElementHandle {}, CDPElementHandle {}, CDPElementHandle {},
CDPElementHandle {}, CDPElementHandle {}, CDPElementHandle {},
CDPElementHandle {}, CDPElementHandle {}, CDPElementHandle {},
CDPElementHandle {}, CDPElementHandle {}, CDPElementHandle {},
CDPElementHandle {}, CDPElementHandle {}, CDPElementHandle {},
CDPElementHandle {}, CDPElementHandle {}, CDPElementHandle {},
CDPElementHandle {}, CDPElementHandle {}, CDPElementHandle {},
CDPElementHandle {}, CDPElementHandle {}, CDPElementHandle {},
CDPElementHandle {}, CDPElementHandle {}, CDPElementHandle {},
CDPElementHandle {}
]
1363 Hancock Street
(node:16300) UnhandledPromiseRejectionWarning: Error: Execution context was destroyed, most likely because of a navigation.
at rewriteError (C:\Users\Nathaniel.Bowen\Desktop\Personal\scrapers\landaScraper\node_modules\puppeteer-core\lib\cjs\puppeteer\common\ExecutionContext.js:312:15)
at processTicksAndRejections (internal/process/task_queues.js:93:5)
at async ExecutionContext._ExecutionContext_evaluate (C:\Users\Nathaniel.Bowen\Desktop\Personal\scrapers\landaScraper\node_modules\puppeteer-core\lib\cjs\puppeteer\common\ExecutionContext.js:253:56)
at async ExecutionContext.evaluate (C:\Users\Nathaniel.Bowen\Desktop\Personal\scrapers\landaScraper\node_modules\puppeteer-core\lib\cjs\puppeteer\common\ExecutionContext.js:143:16)
at async clickElements