我正在抓取罗马、米兰和贝加莫三座城市的餐厅评论。每座城市都有一个专用的 URL,其页面上列出了 30 家或更多餐厅。爬虫开始抓取罗马的餐厅,但从未切换到其他城市:它正确地抓取了罗马的所有餐厅和评论,随后爬虫就关闭了。
罗马的各家餐厅是并发抓取的,我希望起始网址列表也有同样的行为,但实际上只有第一个网址被处理。
class ReviewSpider2(scrapy.Spider):
    """Spider that walks restaurant listing pages and yields one item per review.

    Flow: ``start_requests`` -> ``parse_restaurants`` (listing page) ->
    ``parse_restaurant`` (restaurant page, re-entered for each "next" page).
    """
    name= 'reviews2'

    def start_requests(self):
        """Yield one request per entry of ``urls``, handled by ``parse_restaurants``."""
        # NOTE(review): there are no commas between the three literals below.
        # Python joins adjacent string literals into a single string, so this
        # list has exactly ONE element and the loop issues exactly one request.
        urls = [
            'https://www.tripadvisor.it/Restaurants-g187791-Rome_Lazio.html'
            'https://www.tripadvisor.it/Restaurants-g187849-Milan_Lombardy.html'
            'https://www.tripadvisor.it/Restaurants-g187830-Bergamo_Province_of_Bergamo_Lombardy.html'
        ]
        for url in urls:
            yield scrapy.Request(url, callback = self.parse_restaurants)

    def parse_restaurants(self, response):
        """Follow every distinct '/Restaurant_Review...' link found on a listing page."""
        # set() removes repeated hrefs extracted from the same page before following them.
        all_restaurants = list(set(response.xpath("//div[contains(@data-test,'_list_item')]//div/div/div/span/a[starts-with(@href,'/Restaurant_Review')]/@href").extract()))
        for restaurant in all_restaurants:
            # The extracted hrefs start with '/', so prefix the site root to get an absolute URL.
            url = 'https://www.tripadvisor.it' + restaurant
            yield response.follow(url, callback = self.parse_restaurant)

    def parse_restaurant(self, response):
        """Yield one ReviewscraperItem per review container, then follow the next page.

        ReviewscraperItem is defined elsewhere (not shown here); this method fills
        its 'restaurant_name', 'rating', 'quote', 'address' and 'review' fields.
        """
        all_reviews_containers = response.xpath('//div[@class="rev_wrap ui_columns is-multiline"]/div[2]')
        # NOTE(review): xpath() presumably returns an (possibly empty) selector list
        # rather than None, which would make this check always true - confirm.
        if all_reviews_containers is not None:
            for review_container in all_reviews_containers:
                items = ReviewscraperItem()
                # Name and address are read from the whole response, so they are
                # the same for every review yielded from this page.
                items['restaurant_name'] = response.css('.HjBfq::text').extract_first()
                # This placeholder is overwritten a few lines below by rating_classes.get(...).
                items['rating'] = 0
                # Maps the full class attribute of the rating element to a 1-5 score.
                rating_classes = {
                    'ui_bubble_rating bubble_50': 5,
                    'ui_bubble_rating bubble_40': 4,
                    'ui_bubble_rating bubble_30': 3,
                    'ui_bubble_rating bubble_20': 2,
                    'ui_bubble_rating bubble_10': 1
                }
                # Class attribute of the first <span> inside the review container.
                rating_class = review_container.css('span::attr(class)').extract_first()
                # .get() yields None when the class string is not one of the keys above.
                items['rating'] = rating_classes.get(rating_class)
                items['quote'] = review_container.css('.noQuotes::text').extract_first()
                items['address'] = response.xpath("//span/span/a[@class='AYHFM']/text()").extract_first()
                items['review'] = review_container.css('.partial_entry::text').extract_first()
                yield items
        # Check whether the "next page" button is disabled (i.e. there are no pages left).
        if response.xpath('//a[@class = "nav next ui_button primary disabled"]').extract_first() is None:
            # NOTE(review): if the page has no enabled "next" link either,
            # extract_first() returns None and this concatenation raises TypeError.
            next_page = 'https://www.tripadvisor.it' + response.xpath('//a[@class = "nav next ui_button primary"]/@href').extract_first()
            yield response.follow(url=next_page, callback = self.parse_restaurant)
您的 URL 列表中缺少逗号,请参阅代码中的注释:
import scrapy
class ReviewSpider2(scrapy.Spider):
    """Request each city's restaurant listing page and print the restaurant URLs.

    This is a demonstration spider: ``parse_restaurants`` only prints the
    absolute restaurant URLs; the request that would crawl them is left
    disabled in a comment.
    """

    name = 'reviews2'
    allowed_domains = ['tripadvisor.it']

    # Browser-like headers attached to every listing-page request.
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.5",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "DNT": "1",
        "Host": "www.tripadvisor.it",
        "Pragma": "no-cache",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "TE": "trailers",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
    }

    def start_requests(self):
        """Yield one listing-page request per city, sent with ``self.headers``."""
        # The commas between the entries matter. Written without them, e.g.
        #     urls = [
        #         'https://www.tripadvisor.it/Restaurants-g187791-Rome_Lazio.html'
        #         'https://www.tripadvisor.it/Restaurants-g187849-Milan_Lombardy.html'
        #         'https://www.tripadvisor.it/Restaurants-g187830-Bergamo_Province_of_Bergamo_Lombardy.html'
        #     ]
        # Python concatenates the adjacent string literals into one string,
        # exactly like ['1' '2' '3'] == ['123'], so the list holds a single URL.
        city_listing_urls = [
            'https://www.tripadvisor.it/Restaurants-g187791-Rome_Lazio.html',
            'https://www.tripadvisor.it/Restaurants-g187849-Milan_Lombardy.html',
            'https://www.tripadvisor.it/Restaurants-g187830-Bergamo_Province_of_Bergamo_Lombardy.html'
        ]
        for listing_url in city_listing_urls:
            # Send the browser-like headers with each request.
            yield scrapy.Request(
                url=listing_url,
                headers=self.headers,
                callback=self.parse_restaurants,
            )

    def parse_restaurants(self, response):
        """Print the absolute URL of every restaurant linked from a listing page."""
        # No list(set(...)) de-duplication here: Scrapy has a built-in
        # duplicate filter, so wrapping the hrefs in a set is unnecessary.
        restaurant_hrefs = response.xpath("//div[contains(@data-test,'_list_item')]//div/div/div/span/a[starts-with(@href,'/Restaurant_Review')]/@href").getall()
        for href in restaurant_hrefs:
            restaurant_url = 'https://www.tripadvisor.it' + href
            print(restaurant_url)
            # Disabled on purpose for this demonstration; enable to crawl each restaurant:
            # yield response.follow(restaurant_url, callback=self.parse_restaurant)