我正在尝试抓取下面代码中给出的 URL,以获取 class 为 ng-binding 的 p 标签中的坐标。
const request = require('request');
const cheerio = require('cheerio');
const logger = require('winston');

// Download the page and try to read the coordinates from <p class="ng-binding">.
// NOTE: `request` only fetches the markup the server returns and does not run
// any of the page's scripts, so elements that are rendered client-side will
// not be present in `html` and the selector below can legitimately match nothing.
request('https://pokedex100.com/discord/coord=2zHeG5fz71icW', (error, response, html) => {
  // Surface transport failures instead of silently doing nothing.
  if (error) {
    logger.error(`Request failed: ${error.message}`);
    return;
  }
  if (response.statusCode !== 200) {
    logger.error(`Unexpected status code: ${response.statusCode}`);
    return;
  }

  logger.info(html);
  const $ = cheerio.load(html);
  // .text() returns an empty string when no <p class="ng-binding"> exists.
  const cord = $('p.ng-binding').text();
  logger.error(cord);
});
我得到了下面的 HTML,这与我手动查看页面源代码时看到的不一样。有人可以指导我吗?
22:32:43 - info: <!DOCTYPE html><html>
<head>
<meta charset="utf-8" />
<meta content="width=device-width, initial-scale=1.0, maximum-scale=1, user-scalable=no" name="viewport">
<meta property="og:type" content="website" /> <meta property="og:site_name" content="Discord" /> <meta property="og:title" content="Discord - Free voice and text chat for gamers" /> <meta property="og:description" content="Step up your game with a modern voice & text chat app. Crystal clear voice, multiple server and channel support, mobile apps, and more. Get your free server now!" /><meta property="og:image" content="https://discordapp.com/assets/ee7c382d9257652a88c8f7b7f22a994d.png" /> <meta name="twitter:card" content="summary_large_image"> <meta name="twitter:site" content="@discordapp"> <meta name="twitter:creator" content="@discordapp">
<link rel="chrome-webstore-item" href="https://chrome.google.com/webstore/detail/lcbhdgefieegnkbopmgklhlpjjdgmbog">
<link rel="stylesheet" href="/assets/24cfd050a820092e88717a7b474a1087.css" integrity="sha256-WYgktx8T6Pkz7IBvCEpViHtcNBJDI9tykxV3SpLkz+s= sha512-fXegHnQBxgOX43MAMz9XZZpa4gxVLtgwj2vfRn/OOsMUGT+VJYqti68o0ok7QunHggCNlv1oNsM1DDl9o+jZJw=="><link rel="icon" href="/assets/07dca80a102d4149e9736d4b162cff6f.ico" /> <title>Discord</title>
</head>
<body>
<div id="app-mount"></div>
<script>window.__require = window.require</script>
<script>window.__OVERLAY__ = /overlay/.test(location.pathname)</script><script>!function(){if(null!=window.WebSocket){var n=function(n){try{var e=localStorage.getItem(n);return null==e?null:JSON.parse(e)}catch(n){return null}},e=n("token"),o=n("gatewayURL");if(e&&o){var r=null!=window.__require?"etf":"json",t=o+"/?encoding="+r+"&v=6";void 0!==window.Uint8Array&&(t+="&compress=zlib-stream"),console.log("[FAST CONNECT] "+t+", encoding: "+r+", version: 6");var a=new WebSocket(t);a.binaryType="arraybuffer";var s=Date.now(),i={open:!1,gateway:t,messages:[]};a.onopen=function(){console.log("[FAST CONNECT] connected in "+(Date.now()-s)+"ms"),i.open=!0},a.onclose=a.onerror=function(){window._ws=null},a.onmessage=function(n){i.messages.push(n)},window._ws={ws:a,state:i}}}}();</script><script src="/assets/ef85b442dc6e960fcdb2.js" integrity="sha256-YRcm2EGe1y248RLha42j5bJzgo7unYVPfEaRjIfI6OU= sha512-5rFJb426vcPzMhaOWbyhgHe5rP59EMHbQlYChDbJ5Ivjxkg3HBcF4qhU6d6OMxNaQ8ivLJGiReIVUNyQYKOE9A=="></script><script src="/assets/f36f220d9b6f06eed734.js" integrity="sha256-gERbeDHAE3txb+KP87BEkbIawMeMaMnLTZ5gGRXpYv8= sha512-2mJlwmLdySqA+3psIzKCkNG+ThSLeOAls13fO6sFLQveqomiys1HAei4MHx4eCYizZ1uzXqJWq6gmrmMpzHH+Q=="></script></body>
</html>
22:32:43 - error:
编辑:
// Log in through a real browser, then read the coordinates from
// <p class="ng-binding"> once the page's scripts have rendered it.
// Assumes `puppeteer` is already required and that this runs inside an
// async context (the snippet uses `await` at its top level).
const browser = await puppeteer.launch({ headless: false, timeout: 1000000 });
let page = await browser.newPage();
page.setDefaultNavigationTimeout(1000000);
await page.goto('https://pokedex100.com/?d=dcgrtr4WmaRvW', { waitUntil: "domcontentloaded" });

// Fill in and submit the login form, then confirm the follow-up prompt.
await page.waitForSelector('input[id="register-email"]', { timeout: 1000000 });
await page.type('input[id="register-email"]', "my_email_id");
await page.type('input[id="register-password"]', "my_login_password");
await page.click('button[class="btn btn-primary"]');
await page.waitForSelector('button[class="primary"]', { timeout: 1000000 });
await page.click('button[class="primary"]');

// Open a fresh tab for the target page: the author observed that the target
// loads faster in a newly opened page than in the one used for logging in.
page = await browser.newPage();
// Pause for 5 seconds before navigating. The original called
// page.waitFor(5000) without awaiting it, so the pause never took effect.
await new Promise((resolve) => setTimeout(resolve, 5000));
page.setDefaultNavigationTimeout(1000000);
await page.goto('https://pokedex100.com/?d=dcgrtr4WmaRvW', { waitUntil: "domcontentloaded" });
await page.waitForSelector('p[class="ng-binding"]', { timeout: 1000000 });

console.log("target");
// page.target is a method; call it and print the target's URL rather than
// logging the function object itself.
console.log(page.target().url());

// The $eval callback runs inside the browser page, so console.log there
// writes to the page's console, not to Node. Return the values and log them here.
const { innerHTML, textContent, nodeName, nodeValue } = await page.$eval(
  'p[class="ng-binding"]',
  (element) => ({
    innerHTML: element.innerHTML,
    textContent: element.textContent,
    nodeName: element.nodeName,
    nodeValue: element.nodeValue,
  }),
);
console.log("element.innerHTML");
console.log(innerHTML);
console.log(textContent);
console.log(`${nodeName} ${nodeValue}`);
我之所以使用 page = await browser.newPage(),是因为我观察到:在原来的页面中加载 target 需要较长时间,
而在新打开的窗口中加载速度更快。
看起来页面的内容是由页面上的脚本动态加载的。request
只会获取原始 HTML,并不会执行其中的任何脚本。您可以使用无头浏览器(例如 Puppeteer)加载页面,等内容填充完成之后再进行抓取。