How to avoid extracting duplicate links in Python Scrapy?


I wrote a simple example with Python Scrapy to traverse all the links on a website. However, the extract_links method returns duplicate addresses. I have tried several approaches, but none of them worked.

Here is my code:

import json
import mimetypes
from typing import Any, Optional

import scrapy.linkextractors
import scrapy.spiders
from scrapy.http import Response

# https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types
ALLOWED_DOWNLOAD_TYPES = {
    "application/pdf",
    "application/msword",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
}


def is_download_type(typ: str) -> bool:
    return typ in ALLOWED_DOWNLOAD_TYPES


class QuotesSpider(scrapy.Spider):
    name = "quotes"

    allowed_domains = [
        "quotes.toscrape.com"
    ]

    start_urls = [
        "https://quotes.toscrape.com/"
    ]

    def __init__(self, name: Optional[str] = None, **kwargs: Any):
        super().__init__(name, **kwargs)

        if not mimetypes.inited:
            mimetypes.init()

        self.link_extractor = scrapy.linkextractors.LinkExtractor()

    def parse(self, response: Response, **kwargs: Any) -> Any:
        file_path = "output.txt"
        with open(file_path, "a") as output_file:
            for link in self.link_extractor.extract_links(response):
                typ, encoding = mimetypes.guess_type(link.url)
                if typ is not None:
                    if typ == "text/html":
                        yield response.follow(link, callback=self.parse)
                    if is_download_type(typ):
                        result = {"nofollow": link.nofollow, "url": link.url, "text": link.text}
                        json.dump(result, output_file)
                        output_file.write('\n')
                else:
                    yield response.follow(link, callback=self.parse)
python scrapy
1 Answer

You can use the dont_filter parameter of Scrapy's response.follow method to prevent extracting duplicate links. By default, the link extractor in Scrapy does not filter out duplicate links across pages; however, by explicitly specifying dont_filter=True when following links, you can avoid redundant requests.

Here is how you can modify your spider code to avoid extracting duplicate links:

import json
import mimetypes
from typing import Any, Optional

import scrapy.linkextractors
import scrapy.spiders
from scrapy.http import Response
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Define the allowed MIME types for downloading
ALLOWED_DOWNLOAD_TYPES = {
    "application/pdf",
    "application/msword",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
}

# Function to check if a MIME type is allowed for download
def is_download_type(typ: str) -> bool:
    return typ in ALLOWED_DOWNLOAD_TYPES

# Define the spider class
class QuotesSpider(scrapy.Spider):
    name = "quotes"

    allowed_domains = [
        "quotes.toscrape.com"
    ]

    start_urls = [
        "https://quotes.toscrape.com/"
    ]

    def __init__(self, name: Optional[str] = None, **kwargs: Any):
        super().__init__(name, **kwargs)

        if not mimetypes.inited:
            mimetypes.init()

        self.link_extractor = scrapy.linkextractors.LinkExtractor()

    def parse(self, response: Response, **kwargs: Any) -> Any:
        file_path = "output.txt"
        with open(file_path, "a") as output_file:
            for link in self.link_extractor.extract_links(response):
                typ, encoding = mimetypes.guess_type(link.url)
                if typ is not None:
                    if typ == "text/html":
                        # Follow the link without filtering duplicates
                        yield response.follow(link, callback=self.parse, dont_filter=True)
                    if is_download_type(typ):
                        result = {"nofollow": link.nofollow, "url": link.url, "text": link.text}
                        json.dump(result, output_file)
                        output_file.write('\n')
                else:
                    # Follow the link without filtering duplicates
                    yield response.follow(link, callback=self.parse, dont_filter=True)


# Execute the spider
if __name__ == "__main__":
    process = CrawlerProcess(get_project_settings())
    process.crawl(QuotesSpider)
    process.start()
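
As a follow-up, if the goal is simply to avoid processing the same URL more than once inside the spider itself, a minimal sketch is to keep a set of URLs the spider has already yielded and skip any link that is in it. The seen_urls attribute below is an illustrative helper name, not part of Scrapy's API:

import scrapy
import scrapy.linkextractors
from scrapy.http import Response
from typing import Any


class DedupQuotesSpider(scrapy.Spider):
    name = "quotes_dedup"

    start_urls = ["https://quotes.toscrape.com/"]

    def __init__(self, *args: Any, **kwargs: Any):
        super().__init__(*args, **kwargs)
        self.link_extractor = scrapy.linkextractors.LinkExtractor()
        # Hypothetical helper attribute: URLs this spider has already followed.
        self.seen_urls = set()

    def parse(self, response: Response, **kwargs: Any) -> Any:
        for link in self.link_extractor.extract_links(response):
            if link.url in self.seen_urls:
                # Skip links already extracted from an earlier page.
                continue
            self.seen_urls.add(link.url)
            yield response.follow(link, callback=self.parse)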