I built a recursive crawl function to crawl web pages. The maxPages limit is 200, but the crawler keeps crawling and sending requests even after it has reached the maximum number of pages. When I check the crawledPages table in my MySQL database, it contains well over 200 crawled pages.
I tried to stop it with a condition check, but that failed. I also tried calling process.exit() whenever the maxPages limit is reached, but I don't want the whole server to shut down. How can I make the crawler stop at the limit without killing the server?
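My suspicion (I may be wrong) is that because request() is asynchronous, every recursive crawl() call checks crawledPagesCount before the in-flight callbacks have had a chance to increment it, so far more requests slip past the check than the limit allows. Here is a minimal standalone sketch of the timing I think is happening; fakeRequest is just a stand-in I made up for the real HTTP request:

const maxPages = 3;
let crawledPagesCount = 0;

// Stand-in for the asynchronous request() call: the callback fires later.
function fakeRequest(url, callback) {
  setTimeout(callback, 10);
}

function crawl(url) {
  // The check runs immediately, at call time...
  if (crawledPagesCount >= maxPages) return;

  fakeRequest(url, () => {
    // ...but the increment only happens here, after the "response" arrives,
    // so every link queued before this point already slipped past the check.
    crawledPagesCount++;
    console.log(`crawled ${url}, count is now ${crawledPagesCount}`);
    for (const link of ["a", "b", "c", "d"]) {
      crawl(`${url}/${link}`);
    }
  });
}

crawl("root");

If I trace through this, the count climbs well past maxPages, which matches what I see in the database.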
Here is the actual code:
const express = require("express");
const router = express.Router();
const { validateToken } = require("../middlewares/AuthMiddleware");
const { Crawledpage } = require("../models");
const request = require("request");
const cheerio = require("cheerio");
const urlModule = require("url");
router.get("/crawl", validateToken, (req, res) => {
const baseUrl = req.query.url;
const maxDepth = parseInt(req.query.depth || "3", 10); // Default to depth of 3 if not provided
const maxPages = parseInt(req.query.maxPages || "200", 10);
// Use an object to keep track of visited URLs to avoid infinite loops
const visitedUrls = {};
// Use an array to keep track of crawled pages
const crawledPages = [];
// Keep track of the number of crawled pages
let crawledPagesCount = 0;
// Define a recursive function to crawl each page
async function crawl(url, depth) {
// Exit the function if the maximum number of pages has been reached
if (crawledPagesCount >= maxPages) {
console.log(
`Maximum number of pages (${maxPages}) reached. Stopping crawling.`
);
return;
}
if (visitedUrls[url] || depth > maxDepth) {
return;
}
visitedUrls[url] = true;
request(url, async (error, response, html) => {
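// NOTE: everything inside this callback runs later, once the HTTP
// response arrives; crawl() itself has already returned by then.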
if (!error && response.statusCode == 200) {
const $ = cheerio.load(html);
const title = $("title").text();
const titleLength = title.length;
const metaDescription = $('meta[name="description"]').attr("content");
const metaDescriptionLength = metaDescription
? metaDescription.length
: 0;
const ogTitle = $('meta[property="og:title"]').attr("content");
const ogTitleCharactersCount = ogTitle ? ogTitle.length : 0;
const twitterTitle = $('meta[name="twitter:title"]').attr("content");
const twitterTitleCharactersCount = twitterTitle
? twitterTitle.length
: 0;
const ogDescription = $('meta[property="og:description"]').attr(
"content"
);
const ogDescriptionCharactersCount = ogDescription
? ogDescription.length
: 0;
const twitterDescription = $('meta[name="twitter:description"]').attr(
"content"
);
const twitterDescriptionCharactersCount = twitterDescription
? twitterDescription.length
: 0;
const ogImage = $('meta[property="og:image"]').attr("content");
const ogUrl = $('meta[property="og:url"]').attr("content");
const ogType = $('meta[property="og:type"]').attr("content");
const canonicalUrl = $('link[rel="canonical"]').attr("href");
const twitterCard = $('meta[name="twitter:card"]').attr("content");
const twitterSite = $('meta[name="twitter:site"]').attr("content");
const twitterCreator = $('meta[name="twitter:creator"]').attr(
"content"
);
const ogLocale = $('meta[property="og:locale"]').attr("content");
const ogImageWidth = $('meta[property="og:image:width"]').attr(
"content"
);
const ogImageHeight = $('meta[property="og:image:height"]').attr(
"content"
);
const num_of_h1 = $("h1").length;
const h1_characters_count = $("h1").text().length;
const num_of_h2 = $("h2").length;
const num_of_h3 = $("h3").length;
const num_of_h4 = $("h4").length;
const num_of_h5 = $("h5").length;
const num_of_h6 = $("h6").length;
const num_of_images = $("img").length;
const num_of_alt = $("img[alt]").length;
const currentPage = {
url,
title,
titleLength,
metaDescription,
metaDescriptionLength,
ogTitle,
ogTitleCharactersCount,
twitterTitle,
twitterTitleCharactersCount,
ogDescription,
ogDescriptionCharactersCount,
twitterDescription,
twitterDescriptionCharactersCount,
ogImage,
ogUrl,
ogType,
canonicalUrl,
twitterCard,
twitterSite,
twitterCreator,
ogLocale,
ogImageWidth,
ogImageHeight,
num_of_h1,
h1_characters_count,
num_of_h2,
num_of_h3,
num_of_h4,
num_of_h5,
num_of_h6,
num_of_images,
num_of_alt,
};
console.log(currentPage); // Output the current page data (for debugging purposes)
if (currentPage.url.startsWith(baseUrl)) {
crawledPages.push(currentPage);
await Crawledpage.create(currentPage);
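// This increment only runs after the request completes and the DB write
// above resolves, long after the maxPages check at the top of crawl().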
crawledPagesCount++;
}
// Follow links found on the page and crawl each linked page
$("a").each((i, el) => {
const href = $(el).attr("href");
if (href && !href.startsWith("#")) {
const linkedUrl = urlModule.resolve(url, href);
crawl(linkedUrl, depth + 1);
}
});
} else {
console.error(`Error crawling page: ${url}`);
}
});
}
crawl(baseUrl, 0);
res.send("Crawling in progress"); // Return a response to the client indicating that crawling has started
});
module.exports = router;
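Would the right fix be to update crawledPagesCount synchronously, before request() fires, so that concurrent crawl() calls see it immediately? Something like this untested sketch of just the crawl function (the cheerio parsing and Crawledpage.create would stay the same as above, minus the increment in the callback):

// Untested idea: reserve the page slot synchronously, before the async work,
// so parallel crawl() calls see the updated count right away.
async function crawl(url, depth) {
  if (crawledPagesCount >= maxPages || visitedUrls[url] || depth > maxDepth) {
    return;
  }
  visitedUrls[url] = true;
  crawledPagesCount++; // counted up front instead of inside the callback

  request(url, async (error, response, html) => {
    if (error || response.statusCode !== 200) {
      crawledPagesCount--; // release the reserved slot on failure
      console.error(`Error crawling page: ${url}`);
      return;
    }
    // ... same cheerio parsing and Crawledpage.create(currentPage) as above,
    // but WITHOUT crawledPagesCount++ here (the page is already counted),
    // and with crawledPagesCount-- for pages that don't start with baseUrl ...
  });
}

Or is there a cleaner pattern for capping a recursive async crawl? Any pointers appreciated.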