我想在 node.js 中使用爬虫来爬取一个网站中的所有链接（内部链接），并获取每个页面的标题。我在 npm 上看到了 crawler 这个插件，它的文档中给出了下面的例子：
const Crawler = require("crawler");

const c = new Crawler({
    maxConnections: 10,
    // Invoked once for every page that gets fetched.
    callback: (error, res, done) => {
        if (error) {
            console.log(error);
        } else {
            // res.$ is a Cheerio instance by default — a lean server-side
            // implementation of core jQuery.
            const $ = res.$;
            console.log($("title").text());
        }
        done();
    },
});

// Queue a single URL, handled by the default callback above.
c.queue('http://balenol.com');
但是我想爬取站点中所有的内部 URL：这个功能是插件内置的，还是需要我自己单独编写？我在插件文档中没有看到任何用于遍历网站上所有链接的选项——这可以实现吗？
以下代码片段会抓取它找到的每个 URL 中的所有 URL。
const Crawler = require("crawler");

// URLs already queued, so each page is crawled at most once.
// A Set gives O(1) membership checks instead of Array.includes's O(n).
const visited = new Set();

const c = new Crawler();

/**
 * Recursively crawl `url` and queue every URL found on the page.
 * Absolute hrefs are crawled directly; relative hrefs are resolved by
 * appending them to the current page's URL.
 * @param {string} url - Absolute URL to fetch.
 */
function crawlAllUrls(url) {
    console.log(`Crawling ${url}`);
    c.queue({
        uri: url,
        callback: function (err, res, done) {
            if (err) {
                // Never throw inside the callback: that would skip done()
                // and could crash the process. Report and release the slot.
                console.error(`Failed to fetch ${url}:`, err);
                done();
                return;
            }
            const $ = res.$;
            try {
                $('a').each((_, anchor) => {
                    let href = anchor.attribs.href;
                    if (!href) return;
                    // Trim BEFORE the dedupe check, otherwise ' /a' and '/a'
                    // are treated as different links and crawled twice.
                    href = href.trim();
                    if (visited.has(href)) return;
                    visited.add(href);
                    // Throttle: wait 5s before queueing each discovered link.
                    setTimeout(() => {
                        // Relative links are resolved against the current page's
                        // URL. (This may need extra checks that the result is the
                        // same site and a well-formed URL.)
                        href.startsWith('http') ? crawlAllUrls(href) : crawlAllUrls(`${url}${href}`);
                    }, 5000);
                });
            } catch (e) {
                console.error(`Encountered an error crawling ${url}. Aborting crawl.`);
            } finally {
                // Exactly once per request — the original called done() twice
                // on the error path, corrupting the crawler's slot accounting.
                done();
            }
        }
    });
}

crawlAllUrls('https://github.com/evyatarmeged/');
在上面的代码中，只需把下面这一行做如下修改，就可以只抓取该网站的内部链接（即以站点自身 URL 开头的链接）：
来自
href.startsWith('http') ? crawlAllUrls(href) : crawlAllUrls(`${url}${href}`)
到
href.startsWith(url) ? crawlAllUrls(href) : crawlAllUrls(`${url}${href}`)