Node.js cheerio 显示的 HTML 源代码不正确

问题描述 投票:0回答:1

我正在尝试抓取下面代码中给出的 URL,以便从 class 为 ng-binding 的 p 标签中获取坐标。

const request = require('request');
const cheerio = require('cheerio');
const logger = require('winston');

// Page whose <p class="ng-binding"> element is expected to hold the coordinates.
const url = 'https://pokedex100.com/discord/coord=2zHeG5fz71icW';

// Fetch the raw HTML of `url`, log it, then log the text of every
// <p class="ng-binding"> element found in it.
// NOTE(review): `request` only downloads the served markup; if the element is
// inserted later by client-side scripts the selector will match nothing and
// an empty string is logged.
request(url, (error, response, html) => {
  // Surface transport failures instead of silently doing nothing.
  if (error) {
    logger.error(`Request failed: ${error.message}`);
    return;
  }
  if (response.statusCode !== 200) {
    logger.error(`Unexpected status code: ${response.statusCode}`);
    return;
  }

  logger.info(html);
  const $ = cheerio.load(html);
  const cord = $('p.ng-binding').text();
  // Logged at error level, as in the original script, so it stands out from
  // the HTML dump above.
  logger.error(cord);
});

我得到了以下 HTML,这与我手动查看页面源代码时看到的不一样。有人可以指导我吗?

22:32:43 - info: <!DOCTYPE html><html>

<head>
  <meta charset="utf-8" />
  <meta content="width=device-width, initial-scale=1.0, maximum-scale=1, user-scalable=no" name="viewport">

  <meta property="og:type" content="website" />  <meta property="og:site_name" content="Discord" />  <meta property="og:title" content="Discord - Free voice and text chat for gamers" />  <meta property="og:description" content="Step up your game with a modern voice & text chat app. Crystal clear voice, multiple server and channel support, mobile apps, and more. Get your free server now!"  /><meta property="og:image" content="https://discordapp.com/assets/ee7c382d9257652a88c8f7b7f22a994d.png" />  <meta name="twitter:card" content="summary_large_image">  <meta name="twitter:site" content="@discordapp">  <meta name="twitter:creator" content="@discordapp">
  <link rel="chrome-webstore-item" href="https://chrome.google.com/webstore/detail/lcbhdgefieegnkbopmgklhlpjjdgmbog">
<link rel="stylesheet" href="/assets/24cfd050a820092e88717a7b474a1087.css" integrity="sha256-WYgktx8T6Pkz7IBvCEpViHtcNBJDI9tykxV3SpLkz+s= sha512-fXegHnQBxgOX43MAMz9XZZpa4gxVLtgwj2vfRn/OOsMUGT+VJYqti68o0ok7QunHggCNlv1oNsM1DDl9o+jZJw=="><link rel="icon" href="/assets/07dca80a102d4149e9736d4b162cff6f.ico" />  <title>Discord</title>
</head>

<body>
  <div id="app-mount"></div>
  <script>window.__require = window.require</script>
  <script>window.__OVERLAY__ = /overlay/.test(location.pathname)</script><script>!function(){if(null!=window.WebSocket){var n=function(n){try{var e=localStorage.getItem(n);return null==e?null:JSON.parse(e)}catch(n){return null}},e=n("token"),o=n("gatewayURL");if(e&&o){var r=null!=window.__require?"etf":"json",t=o+"/?encoding="+r+"&v=6";void 0!==window.Uint8Array&&(t+="&compress=zlib-stream"),console.log("[FAST CONNECT] "+t+", encoding: "+r+", version: 6");var a=new WebSocket(t);a.binaryType="arraybuffer";var s=Date.now(),i={open:!1,gateway:t,messages:[]};a.onopen=function(){console.log("[FAST CONNECT] connected in "+(Date.now()-s)+"ms"),i.open=!0},a.onclose=a.onerror=function(){window._ws=null},a.onmessage=function(n){i.messages.push(n)},window._ws={ws:a,state:i}}}}();</script><script src="/assets/ef85b442dc6e960fcdb2.js" integrity="sha256-YRcm2EGe1y248RLha42j5bJzgo7unYVPfEaRjIfI6OU= sha512-5rFJb426vcPzMhaOWbyhgHe5rP59EMHbQlYChDbJ5Ivjxkg3HBcF4qhU6d6OMxNaQ8ivLJGiReIVUNyQYKOE9A=="></script><script src="/assets/f36f220d9b6f06eed734.js" integrity="sha256-gERbeDHAE3txb+KP87BEkbIawMeMaMnLTZ5gGRXpYv8= sha512-2mJlwmLdySqA+3psIzKCkNG+ThSLeOAls13fO6sFLQveqomiys1HAei4MHx4eCYizZ1uzXqJWq6gmrmMpzHH+Q=="></script></body>

</html>
22:32:43 - error:

编辑:

// Log in through the site's registration form, then open the target page in a
// fresh tab and print details of its <p class="ng-binding"> element.
// NOTE(review): `await` requires this snippet to run inside an async function
// or an ES module; `puppeteer` must already be in scope.
const LONG_TIMEOUT_MS = 1000000;
const TARGET_URL = 'https://pokedex100.com/?d=dcgrtr4WmaRvW';
const COORD_SELECTOR = 'p[class="ng-binding"]';

const browser = await puppeteer.launch({ headless: false, timeout: LONG_TIMEOUT_MS });
let page = await browser.newPage();
page.setDefaultNavigationTimeout(LONG_TIMEOUT_MS);
await page.goto(TARGET_URL, { waitUntil: "domcontentloaded" });

// Fill in and submit the login form, then confirm the follow-up dialog.
await page.waitForSelector('input[id="register-email"]', { timeout: LONG_TIMEOUT_MS });
await page.type('input[id="register-email"]', "my_email_id");
await page.type('input[id="register-password"]', "my_login_password");
await page.click('button[class="btn btn-primary"]');
await page.waitForSelector('button[class="primary"]', { timeout: LONG_TIMEOUT_MS });
await page.click('button[class="primary"]');

// A second tab is opened on purpose: the author observed that the target
// content loads faster in a new page than in the one used for logging in.
page = await browser.newPage();
// Awaited so the 5 s pause actually happens before navigating.
await page.waitFor(5000);
page.setDefaultNavigationTimeout(LONG_TIMEOUT_MS);
await page.goto(TARGET_URL, { waitUntil: "domcontentloaded" });
await page.waitForSelector(COORD_SELECTOR, { timeout: LONG_TIMEOUT_MS });

console.log("target");
// target() must be called; logging `page.target` only prints the method itself.
console.log(page.target());

// The $eval callback runs inside the browser page, so console.log calls made
// there never reach the Node console. Return plain serializable values and
// log them on the Node side instead.
const details = await page.$eval(COORD_SELECTOR, (element) => ({
  innerHTML: element.innerHTML,
  textContent: element.textContent,
  nodeName: element.nodeName,
  nodeValue: element.nodeValue,
}));
console.log("element.innerHTML");
console.log(details.innerHTML);
console.log(details.textContent);
console.log(`${details.nodeName} ${details.nodeValue}`);

我之所以使用 page = browser.newPage(),是因为我观察到原页面加载目标内容需要较长时间,而打开一个新页面时加载速度更快。

javascript node.js
1个回答
0
投票

看起来页面的内容是由页面上的脚本动态加载的。request 只获取原始 HTML,不会执行任何脚本。您可以使用无头浏览器(例如 puppeteer)加载页面,等内容填充完成后再进行抓取。

© www.soinside.com 2019 - 2024. All rights reserved.