通过 page.Click() 和 Puppeteer 抓取多个页面

问题描述 投票:0回答:0

我正在尝试创建一个 Puppeteer 应用程序,它可以打开一个动态创建的网站 (landa.com),找到并点击所有缩略图,并从点击功能将 Puppeteer 带到的页面中抓取信息。

没有 URL 可以抓取到数组中以允许我执行

page.goto()
,这会使事情变得更简单。我不想手动创建这个数组,而是想让 Puppeteer 通过单击所有缩略图来打开所有缩略图。

到目前为止我已经成功了:

  1. 创建元素句柄数组
  2. 循环点击当前加载的元素句柄
  3. 点击页面时的屏幕截图

我正在挣扎的是:

  1. 通过向下滚动加载整个元素句柄
  2. 从我点击的页面中抓取信息(我不知道该怎么做)
  3. 正确计时,以便正确加载整个页面。 任何时候我在循环中使用
    waitFor
    它都会超时。

我正在寻找一些指导,以了解这是否是处理此类网站的最佳方法,或者我是否过度复杂化了它。

有没有办法获取点击后生成的 URL,因为它不在 HTML 中?

基本上我尝试通过这样做从我用

page.click()
打开的页面中获取信息:

输入

const puppeteer = require('puppeteer');

async function autoScroll(page){
    await page.evaluate(async () => {
        await new Promise((resolve) => {
            var totalHeight = 0;
            var distance = 100;
            var timer = setInterval(() => {
                var scrollHeight = document.body.scrollHeight;
                window.scrollBy(0, distance);
                totalHeight += distance;

                if(totalHeight >= scrollHeight - window.innerHeight){
                    clearInterval(timer);
                    resolve();
                }
            }, 100);
        });
    });
}

async function clickElements(url) {
    const browser = await puppeteer.launch({headless: 'false'});
    const page = await browser.newPage();
    await page.goto(url);

    await autoScroll(page);
    
    const elHandleArray = await page.$$('.PropertyItem_propertyItem__Fla4I');
    console.log(elHandleArray);
    for (const el of elHandleArray) {
        await el.evaluate(e => e.click());

        await page.waitForSelector('#APP_CONTAINER > div > main > div.DrawerLayout_container__vO7gb > div._address__container___iyko > div._address__content__8pvIa > ul > div.BasePropertySection_container__qchvD.PropertyHero_container__sFUC2 > div.PropertyHero_title__zaro0 > div.HeroTitle_topContainer__9RhDH > div');
        var el2 = await page.$('#APP_CONTAINER > div > main > div.DrawerLayout_container__vO7gb > div._address__container___iyko > div._address__content__8pvIa > ul > div.BasePropertySection_container__qchvD.PropertyHero_container__sFUC2 > div.PropertyHero_title__zaro0 > div.HeroTitle_topContainer__9RhDH > div');
        var txt = await el2.getProperty('textContent');
        var name = await txt.jsonValue();
        console.log(name);

    }

    await browser.close();
}

clickElements('https://landa.app/m/feed');

输出

[
  CDPElementHandle {}, CDPElementHandle {}, CDPElementHandle {},
  CDPElementHandle {}, CDPElementHandle {}, CDPElementHandle {},
  CDPElementHandle {}, CDPElementHandle {}, CDPElementHandle {},
  CDPElementHandle {}, CDPElementHandle {}, CDPElementHandle {},
  CDPElementHandle {}, CDPElementHandle {}, CDPElementHandle {},
  CDPElementHandle {}, CDPElementHandle {}, CDPElementHandle {},
  CDPElementHandle {}, CDPElementHandle {}, CDPElementHandle {},
  CDPElementHandle {}, CDPElementHandle {}, CDPElementHandle {},
  CDPElementHandle {}, CDPElementHandle {}, CDPElementHandle {},
  CDPElementHandle {}, CDPElementHandle {}, CDPElementHandle {},
  CDPElementHandle {}, CDPElementHandle {}, CDPElementHandle {},
  CDPElementHandle {}, CDPElementHandle {}, CDPElementHandle {},
  CDPElementHandle {}, CDPElementHandle {}, CDPElementHandle {},
  CDPElementHandle {}, CDPElementHandle {}, CDPElementHandle {},
  CDPElementHandle {}, CDPElementHandle {}, CDPElementHandle {},
  CDPElementHandle {}, CDPElementHandle {}, CDPElementHandle {},
  CDPElementHandle {}, CDPElementHandle {}, CDPElementHandle {},
  CDPElementHandle {}
]
1363 Hancock Street
(node:16300) UnhandledPromiseRejectionWarning: Error: Execution context was destroyed, most likely because of a navigation.
    at rewriteError (C:\Users\Nathaniel.Bowen\Desktop\Personal\scrapers\landaScraper\node_modules\puppeteer-core\lib\cjs\puppeteer\common\ExecutionContext.js:312:15)
    at processTicksAndRejections (internal/process/task_queues.js:93:5)
    at async ExecutionContext._ExecutionContext_evaluate (C:\Users\Nathaniel.Bowen\Desktop\Personal\scrapers\landaScraper\node_modules\puppeteer-core\lib\cjs\puppeteer\common\ExecutionContext.js:253:56)
    at async ExecutionContext.evaluate (C:\Users\Nathaniel.Bowen\Desktop\Personal\scrapers\landaScraper\node_modules\puppeteer-core\lib\cjs\puppeteer\common\ExecutionContext.js:143:16)
    at async clickElements
arrays web-scraping jquery-selectors puppeteer
© www.soinside.com 2019 - 2024. All rights reserved.