So I am trying to use Scrapy to scrape data from an archived version of a website. Here is my code:
import scrapy
from scrapy.crawler import *
from scrapy.item import *
from scrapy.linkextractors import *
from scrapy.loader import *
from scrapy.spiders import *
from scrapy.utils.log import *
from scrapy.utils.project import *

try:
    from urllib.parse import urlparse
except ImportError:
    from urlparse import urlparse


class VSItem(Item):
    value = Field()


class vsSpider(scrapy.Spider):
    name = "lever"
    start_urls = [
        "https://web.archive.org/web/20051120125133/http://www.novi.k12.mi.us/default.aspx"
    ]
    rules = (
        Rule(
            LinkExtractor(allow="https:\/\/web.archive.org\/web\/\d{14}\/http:\/\/www.novi.k12.mi.us\/.*"),
            callback="parse"
        ),
    )

    def parse(self, response):
        for elem in response.xpath("/html"):
            it = VSItem()
            it["value"] = elem.css("input[name='__VIEWSTATE']").extract()
            yield it


process = CrawlerProcess(get_project_settings())
process.crawl(vsSpider)
process.start()  # the script will block here until the crawling is finished
I set start_urls to https://web.archive.org/web/20051120125133/http://www.novi.k12.mi.us/ because that is the earliest archived version of the page.

This script extracts the element I want from the page listed, but then it just stops there.

My question is: how do I automatically scrape every archived snapshot of the home page (/default.aspx) and of every subdirectory of the main site (so not just /default.aspx, but also /Schools/noviHigh/default.aspx and everything else)? Basically, loop over every URL that matches /https:\/\/web.archive.org\/web\/\d{14}\/http:\/\/www.novi.k12.mi.us\/.*/g; the \d{14} is there because the date stamps have the form YYYYMMDDHHmmSS.
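Your spider stops because the rules attribute is only honored by CrawlSpider; a plain scrapy.Spider silently ignores it, so only the start_urls page is ever fetched. Subclass CrawlSpider instead, and give the callback a different name, since CrawlSpider uses parse internally for its own link-following logic: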
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.loader import ItemLoader
from scrapy.item import Item, Field
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


class VSItem(Item):
    value = Field()


class vsSpider(CrawlSpider):
    name = "lever"
    allowed_domains = ["web.archive.org"]  # limit crawling to this domain
    start_urls = [
        "https://web.archive.org/web/20051120125133/http://www.novi.k12.mi.us/"
    ]
    rules = (
        Rule(
            LinkExtractor(allow=r"https://web.archive.org/web/\d{14}/http://www.novi.k12.mi.us/.*"),
            callback="parse_page",
            follow=True,  # keep following matching links from every crawled page; defaults to False when a callback is set
        ),
    )

    # Named parse_page rather than parse: CrawlSpider uses parse internally,
    # so overriding it would break the rule-following logic.
    def parse_page(self, response):
        l = ItemLoader(item=VSItem(), response=response)
        l.add_css("value", "input[name='__VIEWSTATE']::attr(value)")
        yield l.load_item()


process = CrawlerProcess(get_project_settings())
process.crawl(vsSpider)
process.start()
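If you run this as a standalone script outside a Scrapy project, get_project_settings() just returns the defaults and the scraped items only show up in the log output. A minimal sketch for persisting them to a file instead, assuming Scrapy 2.1 or newer (where the FEEDS setting exists) and a hypothetical output name viewstates.json:

from scrapy.crawler import CrawlerProcess

# Assumes Scrapy 2.1+; FEEDS superseded the older FEED_URI/FEED_FORMAT settings.
# "viewstates.json" is just an example output path.
process = CrawlerProcess(settings={
    "FEEDS": {"viewstates.json": {"format": "json"}},
})
process.crawl(vsSpider)
process.start()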
By structuring the code this way, Scrapy automatically follows the rules and crawls every matching archived page, extracting the data you want from each one. Just make sure your regex pattern and CSS selector actually match the relevant links and elements on the archived pages.
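One caveat on the allow pattern: the unescaped dots in www.novi.k12.mi.us match any character, not just a literal dot. That is usually harmless here, but it is cheap to tighten, and it is worth sanity-checking the pattern with plain re before kicking off a long crawl. A quick sketch:

import re

# Dots escaped so "." only matches a literal dot; \d{14} matches the
# YYYYMMDDHHmmSS timestamp in Wayback Machine URLs.
ARCHIVE_RE = re.compile(r"https://web\.archive\.org/web/\d{14}/http://www\.novi\.k12\.mi\.us/.*")

# A URL the crawl should follow...
assert ARCHIVE_RE.match("https://web.archive.org/web/20051120125133/http://www.novi.k12.mi.us/default.aspx")
# ...and one it should not (a different site archived the same day).
assert not ARCHIVE_RE.match("https://web.archive.org/web/20051120125133/http://example.com/")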