这是我第一次在这里发帖,所以请温柔一点:)
我正在使用 node-crawler(见:https://www.npmjs.com/package/crawler)来抓取网站并查找所有内部链接。脚本可以运行,并把链接输出到控制台。我想做的是:检测爬取何时完成(即没有再发现新的链接),然后执行一些后续操作,例如输出所有链接,或写入文本文件、甚至 Google 表格。我尝试过 async,但有点力不从心。代码如下:
const Crawler = require("crawler");
// Shared dedupe list: every href we have already queued. (NOTE(review):
// name is a typo for "obsolete", and a Set would be O(1) — kept as-is here.)
let obselete = []; // Array of what was crawled already
// Single shared crawler instance; its 'drain' event fires when the queue empties.
let c = new Crawler();
// Queue `url` for crawling and recursively queue every link found on the page.
// Deduplicates via the shared `obselete` array. Relative hrefs are resolved by
// naive concatenation `${url}${href}` (NOTE(review): this mangles absolute
// links to other hosts — consider `new URL(href, url)` instead).
// The original was `async` but never awaited anything, so the keyword is dropped;
// callers that `await` it still work (awaiting undefined is a no-op).
const crawlAllUrls = function (url) {
  c.queue({
    uri: url,
    callback: function (err, res, done) {
      if (err) throw err;
      const $ = res.$;
      try {
        const anchors = $("a"); // every <a> element on the page
        Object.keys(anchors).forEach((item) => {
          if (anchors[item].type === 'tag') {
            let href = anchors[item].attribs.href;
            if (href) {
              // Trim BEFORE the dedupe check so " /x" and "/x" count as the
              // same link (the original checked first, then trimmed, and could
              // re-crawl the same URL).
              href = href.trim();
              if (!obselete.includes(href)) {
                console.log(`href is ${href}`);
                obselete.push(href);
                // Slow down the crawl: wait 5s before following each link.
                setTimeout(function () {
                  href.startsWith(url) ? crawlAllUrls(href) : crawlAllUrls(`${url}${href}`);
                }, 5000);
              }
            }
          }
        });
      } catch (e) {
        console.error(`Encountered an error crawling ${url}. Aborting crawl.`);
      } finally {
        // `done()` must run exactly once per request. The original called it
        // inside `catch` AND again after the try/catch, i.e. twice on the
        // error path — `finally` guarantees a single call on every path.
        done();
      }
    }
  });
};
// Fires once the crawler's queue is empty — i.e. the whole crawl is finished.
c.on('drain', () => {
  // For example, release a connection to database.
  console.log("Done");
});
// Start function
// Start function: kick off the crawl at the seed URL.
// The original took unused parameters `a`/`b` and bound the (always
// undefined) result to an unused `result` variable; both are dropped.
// Removing the unused params is backward-compatible — the only call site
// passes no arguments.
const start = async function () {
  await crawlAllUrls('http://www.five12.co.uk/');
};
start();
您可以使用 node:fs 模块来读写文件。只需修改 drain 事件的回调函数,在其中把结果写入文件即可。下面是一个例子:
const fs = require("node:fs");
c.on("drain", function () {
console.log("Done. Now writing result to a file");
// __dirname is a node variable for your current directory
fs.writeFile(`${__dirname}/results.txt`, JSON.stringify(obselete),
(err) => {
if (err) {
console.error(err);
} else {
console.log("File updated");
}
});
});