我想根据某个 class 的元素创建一个链接数组,但得到的全是 null。如何从这个 HTML 中获取链接列表?
输入
const puppeteer = require('puppeteer');
/**
 * Opens `url` in a Puppeteer-controlled browser and collects the `data-url`
 * attribute of every element matching `.PropertyItem_propertyItem__Fla4I`.
 *
 * Elements that do not carry a `data-url` attribute contribute `null`
 * (that is what `getAttribute` yields for a missing attribute).
 *
 * @param {string} url - Page to load.
 * @returns {Promise<Array<string|null>>} The collected attribute values
 *   (also printed with `console.log`).
 */
async function getLink(url) {
  // NOTE(review): 'false' is a non-empty string and therefore truthy; it
  // presumably does not disable headless mode. Use the boolean `false` if a
  // visible browser window is intended -- confirm against the Puppeteer version.
  const browser = await puppeteer.launch({headless: 'false'});
  try {
    const page = await browser.newPage();
    await page.goto(url);
    const linksRaw = await page.$$eval(
      '.PropertyItem_propertyItem__Fla4I',
      links => links.map(link => link.getAttribute('data-url'))
    );
    console.log(linksRaw);
    return linksRaw;
  } finally {
    // Always release the browser, even when navigation or evaluation fails.
    await browser.close();
  }
}
// Run the scraper; report any failure explicitly instead of leaving the
// returned promise unhandled.
getLink('https://landa.app/m/feed').catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
输出
[
null, null, null, null,
null, null, null, null,
null, null, null, null,
null, null, null, null,
null, null, null
]
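在这样的页面上,这些元素并没有 `data-url` 属性(所以 `getAttribute` 返回 null);所需的数据以 JSON 形式嵌在 id 为 `__NEXT_DATA__` 的 `<script>` 标签中。
剩下的只是取出这个脚本的内容并解析它。
以下代码可以获取这些链接: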
/**
 * Builds the property-page URL for one entry of the feed's `properties` list.
 *
 * @param {object} property - Entry with an `address` object and a `ticker` string.
 * @returns {string} URL of the form
 *   `https://landa.app/m/property/<street>-<houseNo>-<city>-<state>-<zip>/<ticker>`.
 */
function buildPropertyUrl({ address, ticker }) {
  const street = address.street.replace(/\s+/g, '-'); // whitespace runs become '-'
  const houseNo = address.houseNumber;
  const city = address.city.name;
  const stateCode = address.state.code;
  const zip = address.zipCode;
  const tickerSlug = ticker.replace(/@/g, '_'); // '@' becomes '_'
  return `https://landa.app/m/property/${street}-${houseNo}-${city}-${stateCode}-${zip}/${tickerSlug}`;
}

/**
 * Loads `url`, reads the JSON embedded in the `script#__NEXT_DATA__` element
 * and derives one property URL per entry of `props.pageProps.properties`.
 *
 * Entries whose `address` is `undefined` or `null` are skipped.
 *
 * @param {string} url - Page to load.
 * @returns {Promise<string[]>} The generated URLs (also printed with `console.log`).
 */
async function getLink(url) {
  // NOTE(review): 'false' is a non-empty string and therefore truthy; it
  // presumably does not disable headless mode. Use the boolean `false` if a
  // visible browser window is intended -- confirm against the Puppeteer version.
  const browser = await puppeteer.launch({headless: 'false'});
  try {
    const page = await browser.newPage();
    await page.goto(url);
    // textContent returns the raw script text without depending on rendering.
    const raw = await page.$eval('script[id="__NEXT_DATA__"]', el => el.textContent.trim());
    // Inspect the parsed object to see its full structure.
    const { properties } = JSON.parse(raw).props.pageProps;
    const urls = properties
      .filter(property => property.address != null) // skip entries without an address
      .map(property => buildPropertyUrl(property));
    console.log(urls);
    return urls;
  } finally {
    // Always release the browser, even when navigation or parsing fails.
    await browser.close();
  }
}