Scrapy - Crawling an archived website and all of its subdirectories

Problem description

So I'm trying to use Scrapy to scrape data from an archived version of a website. Here is my code:

import scrapy
from scrapy.crawler import *
from scrapy.item import *
from scrapy.linkextractors import *
from scrapy.loader import *
from scrapy.spiders import *
from scrapy.utils.log import *
from scrapy.utils.project import *

try:
    from urllib.parse import urlparse
except ImportError:
    from urlparse import urlparse
    
class VSItem(Item):
    value = Field()

class vsSpider(scrapy.Spider):
    name = "lever"
    start_urls = [
        "https://web.archive.org/web/20051120125133/http://www.novi.k12.mi.us/default.aspx"
    ]
    rules = (
            Rule(
                LinkExtractor(allow="https:\/\/web.archive.org\/web\/\d{14}\/http:\/\/www.novi.k12.mi.us\/.*"),
                callback="parse"
                ),
            )

    def parse(self, response):
        for elem in response.xpath("/html"):
            it = VSItem()
            it["value"] = elem.css("input[name='__VIEWSTATE']").extract()
            yield it
 
process = CrawlerProcess(get_project_settings())

process.crawl(vsSpider)
process.start() # the script will block here until the crawling is finished

I set start_urls to https://web.archive.org/web/20051120125133/http://www.novi.k12.mi.us/ because that is the earliest archived version of the page.

This script extracts the element I want from the page listed, but then it stops there.

My question is: how do I automatically crawl every archived copy of the home page (/default.aspx) and of every subdirectory of the main site (so not just /default.aspx, but also /Schools/noviHigh/default.aspx and everything else)? Basically, I want to loop over every URL that matches /https:\/\/web.archive.org\/web\/\d{14}\/http:\/\/www.novi.k12.mi.us\/.*/g, where \d{14} is there because the date stamp has the form YYYYMMDDHHmmSS.
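For example, just to illustrate what I mean by matching URLs (the snippet below is only a sanity check written for this question, not part of the spider), the timestamp pattern can be tested with Python's re module:

import re

# Illustrative only: the kind of URL the crawler should follow,
# with the literal dots in the domain names escaped.
pattern = re.compile(r"https://web\.archive\.org/web/\d{14}/http://www\.novi\.k12\.mi\.us/.*")

# A Wayback Machine URL whose timestamp has the form YYYYMMDDHHmmSS (here 2005-11-20 12:51:33).
url = "https://web.archive.org/web/20051120125133/http://www.novi.k12.mi.us/default.aspx"
print(bool(pattern.match(url)))  # prints True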

python scrapy web-crawler
1 Answer
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.loader import ItemLoader
from scrapy.item import Item, Field
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

class VSItem(Item):
    value = Field()

class vsSpider(CrawlSpider):
    name = "lever"
    allowed_domains = ["web.archive.org"]  # Limit crawling to this domain
    start_urls = [
        "https://web.archive.org/web/20051120125133/http://www.novi.k12.mi.us/"
    ]
    rules = (
        Rule(
            LinkExtractor(allow=r"https://web\.archive\.org/web/\d{14}/http://www\.novi\.k12\.mi\.us/.*"),
            callback="parse_page",
            follow=True,  # keep following matching links on crawled pages, not only those on the start page
        ),
    )

    def parse_page(self, response):
        # Extract the __VIEWSTATE value from each archived page
        l = ItemLoader(item=VSItem(), response=response)
        l.add_css("value", "input[name='__VIEWSTATE']::attr(value)")
        yield l.load_item()

process = CrawlerProcess(get_project_settings())
process.crawl(vsSpider)
process.start()

Structured this way, the spider subclasses CrawlSpider instead of scrapy.Spider, which is what makes the rules attribute take effect (a plain Spider ignores rules), and the callback is renamed parse_page because CrawlSpider uses the default parse method internally for its rule processing. Scrapy will then automatically follow the rules and crawl every archived page, extracting the desired data from each one. Just make sure your regular expression and CSS selector actually match the relevant elements on the archived pages.
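For example (a minimal sketch, not from the original answer, assuming the spider is run as a standalone script rather than inside a Scrapy project), you can pass a FEEDS setting to CrawlerProcess so the extracted __VIEWSTATE values are written to a JSON file, and add a download delay to go easy on the Wayback Machine:

from scrapy.crawler import CrawlerProcess

# Standalone run: the filename "viewstates.json" and the delay value are example choices.
process = CrawlerProcess(settings={
    "FEEDS": {"viewstates.json": {"format": "json"}},  # export scraped items to JSON
    "DOWNLOAD_DELAY": 1.0,  # throttle requests to the archive
})
process.crawl(vsSpider)
process.start()  # blocks until the crawl finishes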
