I wrote a simple example with Python Scrapy to walk all the links on a website. However, the extract_links method returns duplicate addresses. I have tried several approaches, but nothing has worked.
Here is my code:
import json
import mimetypes
from typing import Any, Optional

import scrapy.linkextractors
import scrapy.spiders
from scrapy.http import Response

# https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types
ALLOWED_DOWNLOAD_TYPES = {
    "application/pdf",
    "application/msword",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
}


def is_download_type(typ: str) -> bool:
    return typ in ALLOWED_DOWNLOAD_TYPES


class QuotesSpider(scrapy.Spider):
    name = "quotes"
    allowed_domains = [
        "quotes.toscrape.com"
    ]
    start_urls = [
        "https://quotes.toscrape.com/"
    ]

    def __init__(self, name: Optional[str] = None, **kwargs: Any):
        super().__init__(name, **kwargs)
        if not mimetypes.inited:
            mimetypes.init()
        self.link_extractor = scrapy.linkextractors.LinkExtractor()

    def parse(self, response: Response, **kwargs: Any) -> Any:
        file_path = "output.txt"
        with open(file_path, "a") as output_file:
            for link in self.link_extractor.extract_links(response):
                typ, encoding = mimetypes.guess_type(link.url)
                if typ is not None:
                    if typ == "text/html":
                        yield response.follow(link, callback=self.parse)
                    if is_download_type(typ):
                        result = {"nofollow": link.nofollow, "url": link.url, "text": link.text}
                        json.dump(result, output_file)
                        output_file.write('\n')
                else:
                    yield response.follow(link, callback=self.parse)
You can control duplicate filtering through the dont_filter parameter of Scrapy's response.follow method, but note that it works the opposite way from what you might expect: by default (dont_filter=False) Scrapy's scheduler passes every request through its duplicate filter, so following the same URL from several pages only crawls it once, while dont_filter=True bypasses that filter and forces redundant requests. The duplicates you are seeing come from extract_links itself: the extractor only deduplicates links within a single response, so a URL that appears on many pages is returned once per page, and the spider writes it to output.txt every time. Leave dont_filter at its default so requests stay deduplicated, and keep a set of URLs you have already written so each download link is recorded only once.
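The within-one-response behaviour is easy to verify in isolation. A minimal sketch (the page body and URLs here are made up for illustration):

import scrapy.linkextractors
from scrapy.http import HtmlResponse

# Two anchors pointing at the same URL inside one page
body = b'<a href="/page1">first</a><a href="/page1">second</a>'
response = HtmlResponse("https://quotes.toscrape.com/", body=body, encoding="utf-8")

links = scrapy.linkextractors.LinkExtractor().extract_links(response)
# Prints a single URL: the extractor deduplicates within this response,
# but the same URL would be returned again for every other page it appears on
print([link.url for link in links])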
Here is how to modify the spider so that duplicate links are no longer recorded:
import json
import mimetypes
from typing import Any, Optional

import scrapy.linkextractors
import scrapy.spiders
from scrapy.http import Response
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Define the allowed MIME types for downloading
ALLOWED_DOWNLOAD_TYPES = {
    "application/pdf",
    "application/msword",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
}


# Function to check if a MIME type is allowed for download
def is_download_type(typ: str) -> bool:
    return typ in ALLOWED_DOWNLOAD_TYPES


# Define the spider class
class QuotesSpider(scrapy.Spider):
    name = "quotes"
    allowed_domains = [
        "quotes.toscrape.com"
    ]
    start_urls = [
        "https://quotes.toscrape.com/"
    ]

    def __init__(self, name: Optional[str] = None, **kwargs: Any):
        super().__init__(name, **kwargs)
        if not mimetypes.inited:
            mimetypes.init()
        self.link_extractor = scrapy.linkextractors.LinkExtractor()
        # URLs already written to the output file; extract_links only
        # deduplicates within one response, so track them across pages here
        self.seen_urls = set()

    def parse(self, response: Response, **kwargs: Any) -> Any:
        file_path = "output.txt"
        with open(file_path, "a") as output_file:
            for link in self.link_extractor.extract_links(response):
                typ, encoding = mimetypes.guess_type(link.url)
                if typ is None or typ == "text/html":
                    # Leave dont_filter at its default (False) so the
                    # scheduler's duplicate filter drops repeated requests
                    yield response.follow(link, callback=self.parse)
                elif is_download_type(typ) and link.url not in self.seen_urls:
                    self.seen_urls.add(link.url)
                    result = {"nofollow": link.nofollow, "url": link.url, "text": link.text}
                    json.dump(result, output_file)
                    output_file.write('\n')


# Execute the spider
if __name__ == "__main__":
    process = CrawlerProcess(get_project_settings())
    process.crawl(QuotesSpider)
    process.start()
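After a run you can confirm that output.txt no longer contains repeated entries. A small sketch, assuming output.txt is in the working directory:

import json

seen = set()
with open("output.txt") as f:
    for line in f:
        url = json.loads(line)["url"]
        assert url not in seen, f"duplicate: {url}"
        seen.add(url)
print(f"{len(seen)} unique download links")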