Scrapy - only crawls the first url in the url list

Problem description (0 votes, 1 answer)

I am scraping reviews of restaurants in Rome, Milan and Bergamo. For each of these cities there is a dedicated URL listing 30 or more restaurants. The scraper starts crawling the Rome restaurants but never switches to the other cities: it correctly scrapes all of Rome's restaurants and reviews, and then the spider closes.

The Rome restaurants are scraped concurrently, and I expected the start URLs to be handled in the same way, but only the first URL is ever considered.

class ReviewSpider2(scrapy.Spider):

    name= 'reviews2'


    def start_requests(self):
        urls = [
        'https://www.tripadvisor.it/Restaurants-g187791-Rome_Lazio.html'
        'https://www.tripadvisor.it/Restaurants-g187849-Milan_Lombardy.html'
        'https://www.tripadvisor.it/Restaurants-g187830-Bergamo_Province_of_Bergamo_Lombardy.html'
        ]
        for url in urls:
            yield scrapy.Request(url, callback = self.parse_restaurants)
        
    def parse_restaurants(self, response):    
        all_restaurants = list(set(response.xpath("//div[contains(@data-test,'_list_item')]//div/div/div/span/a[starts-with(@href,'/Restaurant_Review')]/@href").extract()))
        for restaurant in all_restaurants:
            url = 'https://www.tripadvisor.it' + restaurant
            yield response.follow(url, callback = self.parse_restaurant)
    def parse_restaurant(self, response):

        all_reviews_containers = response.xpath('//div[@class="rev_wrap ui_columns is-multiline"]/div[2]')
        if all_reviews_containers is not None:
            for review_container in all_reviews_containers:
                items = ReviewscraperItem()
                items['restaurant_name'] = response.css('.HjBfq::text').extract_first()
                items['rating'] = 0
                rating_classes = {
                    'ui_bubble_rating bubble_50': 5,
                    'ui_bubble_rating bubble_40': 4,
                    'ui_bubble_rating bubble_30': 3,
                    'ui_bubble_rating bubble_20': 2,
                    'ui_bubble_rating bubble_10': 1
                }
                rating_class = review_container.css('span::attr(class)').extract_first()
                items['rating'] = rating_classes.get(rating_class)
                items['quote'] = review_container.css('.noQuotes::text').extract_first()
                items['address'] = response.xpath("//span/span/a[@class='AYHFM']/text()").extract_first()
                items['review'] = review_container.css('.partial_entry::text').extract_first()
                yield items
            #check if the next page button is disabled (there are no pages left)
            if response.xpath('//a[@class = "nav next ui_button primary disabled"]').extract_first() is None:
                next_page = 'https://www.tripadvisor.it' + response.xpath('//a[@class = "nav next ui_button primary"]/@href').extract_first()
                yield response.follow(url=next_page, callback = self.parse_restaurant)
python html web-scraping xpath scrapy
1 Answer (0 votes)

You are missing some commas; see the comments:

import scrapy


class ReviewSpider2(scrapy.Spider):
    name = 'reviews2'
    allowed_domains = ['tripadvisor.it']
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.5",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "DNT": "1",
        "Host": "www.tripadvisor.it",
        "Pragma": "no-cache",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "TE": "trailers",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
    }

    def start_requests(self):
        # missing commas:
        # urls = [
        #     'https://www.tripadvisor.it/Restaurants-g187791-Rome_Lazio.html'
        #     'https://www.tripadvisor.it/Restaurants-g187849-Milan_Lombardy.html'
        #     'https://www.tripadvisor.it/Restaurants-g187830-Bergamo_Province_of_Bergamo_Lombardy.html'
        # ]
        urls = [
            'https://www.tripadvisor.it/Restaurants-g187791-Rome_Lazio.html',
            'https://www.tripadvisor.it/Restaurants-g187849-Milan_Lombardy.html',
            'https://www.tripadvisor.it/Restaurants-g187830-Bergamo_Province_of_Bergamo_Lombardy.html'
        ]
        # e.g. ['1' '2' '3'] evaluates to ['123'] because adjacent string literals are joined
        for url in urls:
            # use headers
            yield scrapy.Request(url, callback=self.parse_restaurants, headers=self.headers)

    def parse_restaurants(self, response):
        # unnecessary because Scrapy has a built in duplicate filter:
        # all_restaurants = list(set(response.xpath("//div[contains(@data-test,'_list_item')]//div/div/div/span/a[starts-with(@href,'/Restaurant_Review')]/@href").extract()))
        all_restaurants = response.xpath("//div[contains(@data-test,'_list_item')]//div/div/div/span/a[starts-with(@href,'/Restaurant_Review')]/@href").getall()

        for restaurant in all_restaurants:
            url = 'https://www.tripadvisor.it' + restaurant
            print(url)
            # yield response.follow(url, callback = self.parse_restaurant)
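To make the cause explicit: Python concatenates adjacent string literals at compile time, so a list written without commas collapses into a single merged string and the spider only ever receives one (malformed) start URL. A minimal illustration:

urls_missing_commas = [
    'https://www.tripadvisor.it/Restaurants-g187791-Rome_Lazio.html'
    'https://www.tripadvisor.it/Restaurants-g187849-Milan_Lombardy.html'
]
print(len(urls_missing_commas))   # 1 -- Python joined the two literals into one string

urls_fixed = [
    'https://www.tripadvisor.it/Restaurants-g187791-Rome_Lazio.html',
    'https://www.tripadvisor.it/Restaurants-g187849-Milan_Lombardy.html',
]
print(len(urls_fixed))            # 2 -- one request per city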
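Regarding the other comment in the answer: Scrapy's scheduler deduplicates requests by default via its duplicate filter, so wrapping the extracted hrefs in list(set(...)) is redundant. The sketch below (spider and method names are illustrative, not taken from the original code) shows the default behaviour and the dont_filter flag that bypasses it when a repeat fetch is genuinely needed; note also that response.follow resolves relative hrefs, so prefixing the domain by hand is not required either.

import scrapy


class DedupExampleSpider(scrapy.Spider):
    name = 'dedup_example'  # illustrative name, not part of the original spider

    def parse_restaurants(self, response):
        # Duplicate hrefs are harmless here: the scheduler drops any request
        # whose fingerprint has already been seen during this crawl.
        for href in response.xpath("//a[starts-with(@href, '/Restaurant_Review')]/@href").getall():
            # response.follow resolves the relative href against response.url
            yield response.follow(href, callback=self.parse_restaurant)

        # To deliberately re-fetch an already-seen URL, bypass the filter:
        # yield scrapy.Request(response.url, callback=self.parse_restaurants, dont_filter=True)

    def parse_restaurant(self, response):
        pass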