A similar question has been discussed here, but I can't get my code to work. The goal is for scrapy-playwright to generate a request/response for each URL in start_urls and to parse every response the same way. The CSV with the URLs is read into a list correctly, but start_requests does not generate any requests. See the commented code below.
import scrapy
import asyncio
from scrapy_playwright.page import PageMethod


class MySpider(scrapy.Spider):
    name = "Forum01"
    allowed_domains = ["example.com"]

    def start_requests(self):
        with open('FullLink.csv') as file:
            start_urls = [line.strip() for line in file]
        print(start_urls)  # When the spider runs, the list of URLs is printed correctly

        for u in self.start_urls:
            yield scrapy.Request(
                u,
                meta=dict(
                    playwright=True,
                    playwright_include_page=False,
                    playwright_page_methods=[
                        PageMethod("wait_for_selector", "div.modal-body > p")
                    ],  # End of methods
                ),  # End of meta
                callback=self.parse,
            )

    async def parse(self, response):  # Does not work with either sync or async
        for item in response.css('div.modal-content'):
            yield {
                'title': item.css('h1::text').get(),
                'info': item.css('.row+ p::text').get(),
            }
Do you know how to feed the URLs to the spider correctly? Thanks!
In the for loop you are iterating over an empty sequence (the class attribute self.start_urls, which you never set) instead of the list extracted from the CSV file. See the comment below.
import scrapy
import asyncio
from scrapy_playwright.page import PageMethod


class MySpider(scrapy.Spider):
    name = "Forum01"
    allowed_domains = ["example.com"]

    def start_requests(self):
        with open('FullLink.csv') as file:
            start_urls = [line.strip() for line in file]
        print(start_urls)

        for u in start_urls:  # <-- changed self.start_urls to the local start_urls
            yield scrapy.Request(
                u,
                meta=dict(
                    playwright=True,
                    playwright_include_page=False,
                    playwright_page_methods=[
                        PageMethod("wait_for_selector", "div.modal-body > p")
                    ],
                ),
                callback=self.parse,
            )

    async def parse(self, response):
        for item in response.css('div.modal-content'):
            yield {
                'title': item.css('h1::text').get(),
                'info': item.css('.row+ p::text').get(),
            }
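Alternatively, if you prefer to keep iterating over self.start_urls, you can populate that attribute before the crawl starts, for example in __init__. This is only a minimal sketch, assuming FullLink.csv contains one URL per line with no header row:

import scrapy


class MySpider(scrapy.Spider):
    name = "Forum01"
    allowed_domains = ["example.com"]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Fill the instance attribute so that any code iterating over
        # self.start_urls (including start_requests) sees the CSV contents.
        with open('FullLink.csv') as file:
            self.start_urls = [line.strip() for line in file if line.strip()]

Either way works; the important point is that the list you read from the CSV and the sequence you iterate over must be the same object.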