爬虫不会在最大页面限制处停止并继续爬行

问题描述 投票:0回答:0

我做了一个递归爬取功能来爬取网页。 maxPage限制为200,但是爬虫即使已经达到爬取页面的最大限制,也会继续爬取并发出请求。当我还检查 mySQL 数据库中的 crawledPages 表时,它有超过 200 个爬网页面。

我试图使用条件来阻止它,但我失败了。我也使用了 process.exit() 每当它达到 maxPage 限制但我不希望服务器被关闭。

这是代码:

const express = require("express");
const router = express.Router();
const { validateToken } = require("../middlewares/AuthMiddleware");
const { Crawledpage } = require("../models");
const request = require("request");
const cheerio = require("cheerio");
const urlModule = require("url");

router.get("/crawl", validateToken, (req, res) => {
  const baseUrl = req.query.url;
  const maxDepth = parseInt(req.query.depth || "3", 10); // Default to depth of 3 if not provided
  const maxPages = parseInt(req.query.maxPages || "200", 10);

  // Use an object to keep track of visited URLs to avoid infinite loops
  const visitedUrls = {};

  // Use an array to keep track of crawled pages
  const crawledPages = [];

  // Keep track of the number of crawled pages
  let crawledPagesCount = 0;

  // Define a recursive function to crawl each page
  async function crawl(url, depth) {
    // Exit the function if the maximum number of pages has been reached
    if (crawledPagesCount >= maxPages) {
      console.log(
        `Maximum number of pages (${maxPages}) reached. Stopping crawling.`
      );
      return;
    }
    if (visitedUrls[url] || depth > maxDepth) {
      return;
    }

    visitedUrls[url] = true;

    let currentRequest;
    currentRequest = request(url, async (error, response, html) => {
      if (!error && response.statusCode == 200) {
        const $ = cheerio.load(html);

        const title = $("title").text();
        const titleLength = title.length;

        const metaDescription = $('meta[name="description"]').attr("content");
        const metaDescriptionLength = metaDescription
          ? metaDescription.length
          : 0;

        const ogTitle = $('meta[property="og:title"]').attr("content");
        const ogTitleCharactersCount = ogTitle ? ogTitle.length : 0;

        const twitterTitle = $('meta[name="twitter:title"]').attr("content");
        const twitterTitleCharactersCount = twitterTitle
          ? twitterTitle.length
          : 0;

        const ogDescription = $('meta[property="og:description"]').attr(
          "content"
        );
        const ogDescriptionCharactersCount = ogDescription
          ? ogDescription.length
          : 0;

        const twitterDescription = $('meta[name="twitter:description"]').attr(
          "content"
        );
        const twitterDescriptionCharactersCount = twitterDescription
          ? twitterDescription.length
          : 0;

        const ogImage = $('meta[property="og:image"]').attr("content");

        const ogUrl = $('meta[property="og:url"]').attr("content");

        const ogType = $('meta[property="og:type"]').attr("content");

        const canonicalUrl = $('link[rel="canonical"]').attr("href");

        const twitterCard = $('meta[name="twitter:card"]').attr("content");

        const twitterSite = $('meta[name="twitter:site"]').attr("content");

        const twitterCreator = $('meta[name="twitter:creator"]').attr(
          "content"
        );

        const ogLocale = $('meta[property="og:locale"]').attr("content");

        const ogImageWidth = $('meta[property="og:image:width"]').attr(
          "content"
        );

        const ogImageHeight = $('meta[property="og:image:height"]').attr(
          "content"
        );

        const num_of_h1 = $("h1").length;
        const h1_characters_count = $("h1").text().length;

        const num_of_h2 = $("h2").length;
        const num_of_h3 = $("h3").length;
        const num_of_h4 = $("h4").length;
        const num_of_h5 = $("h5").length;
        const num_of_h6 = $("h6").length;

        const num_of_images = $("img").length;
        const num_of_alt = $("img[alt]").length;

        const currentPage = {
          url,
          title,
          titleLength,
          metaDescription,
          metaDescriptionLength,
          ogTitle,
          ogTitleCharactersCount,
          twitterTitle,
          twitterTitleCharactersCount,
          ogDescription,
          ogDescriptionCharactersCount,
          twitterDescription,
          twitterDescriptionCharactersCount,
          ogImage,
          ogUrl,
          ogType,
          canonicalUrl,
          twitterCard,
          twitterSite,
          twitterCreator,
          ogLocale,
          ogImageWidth,
          ogImageHeight,
          num_of_h1,
          h1_characters_count,
          num_of_h2,
          num_of_h3,
          num_of_h4,
          num_of_h5,
          num_of_h6,
          num_of_images,
          num_of_alt,
        };
        console.log(currentPage); // Output the current page data (for debugging purposes)

        if (currentPage.url.startsWith(baseUrl)) {
          crawledPages.push(currentPage);
          await Crawledpage.create(currentPage);
          crawledPagesCount++;
        }

        // Follow links found on the page and crawl each linked page
        $("a").each((i, el) => {
          const href = $(el).attr("href");
          if (href && !href.startsWith("#")) {
            const linkedUrl = urlModule.resolve(url, href);
            crawl(linkedUrl, depth + 1);
          }
        });
      } else {
        console.error(`Error crawling page: ${url}`);
      }
    });
  }

    crawl(baseUrl, 0);

  res.send("Crawling in progress"); // Return a response to the client indicating that crawling has started
});

module.exports = router;

database express recursion request web-crawler
© www.soinside.com 2019 - 2024. All rights reserved.