Scrapy:已抓取但未抓取任何数据

问题描述 投票:1回答:1

我写了以下代码,以给定城市名称来刮擦Booking.com。理想情况下,该程序应找出城市中所有可用的酒店,并刮除每个酒店的所有评论。不幸的是,它只会刮擦一些酒店,并且只会刮擦那些酒店的前75条评论。您能告诉我我在做什么错吗?

import scrapy
from scrapy import Spider
from scrapy.loader import ItemLoader
from booking_spider.items import BookingSpiderItem


class PerhotelrevSpider(Spider):
    name = 'perhotelrev'
    allowed_domains = ['booking.com']
    #start_urls = ['https://booking.com/reviews/us/hotel/maison-st-charles-quality-inn-suites.html?/']
    start_urls = ['https://www.booking.com/searchresults.html?ss=New%20Orleans&']
    #handle_httpstatus_list = [301, 302]
    def parse(self, response):
        all_hotels = response.xpath('.//*[@class="sr-hotel__title  \n"]')
        for ahotel in all_hotels:
            hotel_name = ahotel.xpath('.//*[@class="sr-hotel__name\n"]/text()').extract_first().replace('\n','')
            hotel_url = ahotel.xpath('.//*[@class="hotel_name_link url"]/@href').extract_first().replace('\n','')
            full_hotel_url = 'https://www.booking.com'+str(hotel_url)
            request = scrapy.Request(full_hotel_url, callback = self.parse_hotels)
            request.meta['adict'] = {'HotelName':hotel_name}
            yield request

        next_page = response.xpath('.//*[@class="bui-pagination__item bui-pagination__next-arrow"]/a/@href').extract_first()
        if next_page is not None:
            next_page_url = response.urljoin(next_page)
            yield scrapy.Request(next_page_url, callback=self.parse)

    def parse_hotels(self, response):
        adict = response.meta['adict']
        hotel_name = adict['HotelName']
        #hotel_name = response.xpath('.//*[@class="hp__hotel-name"]/text()')[1].extract().replace('\n','')
        image_urls = response.xpath('.//*[@class="b_nha_hotel_small_images hp_thumbgallery_with_counter"]/a/@href').extract()
        all_facilities = response.xpath('.//*[@class="facilitiesChecklistSection"]/ul/li/span/text()').extract()
        all_facilities = [x.replace('\n','') for x in all_facilities]
        important_facility = response.xpath('.//*[@class="important_facility "]/@data-name-en').extract()
        #print(hotel_name)
        all_review_url = response.xpath('.//*[@class="show_all_reviews_btn"]/@href').extract_first()

        adict = {   'HotelName':hotel_name,
                    'ImageUrls':image_urls,
                    'Facilities':all_facilities,
                    'MainFacilities':important_facility
                }

        if all_review_url is not None:
            review_url = "https://booking.com"+all_review_url
            request = scrapy.Request(review_url, callback=self.parse_review)
            request.meta['adict'] = adict
            yield request


    def parse_review(self, response):
        allreviewsinpage = response.xpath('.//*[@itemprop="review"]')
        adict = response.meta['adict']
        hotel_name = adict['HotelName']
        image_urls = adict['ImageUrls']
        all_facilities = adict['Facilities']
        important_facility = adict['MainFacilities']

        for eachreview in allreviewsinpage:
            username = eachreview.xpath('.//p[@class="reviewer_name"]/*[@itemprop="name"]/text()').extract_first()
            usercountry = eachreview.xpath('.//*[@itemprop="nationality"]/*[@itemprop="name"]/text()').extract_first()
            numreviewgiven = eachreview.xpath('.//*[@class="review_item_user_review_count"]/text()').extract_first()
            useragegroup = eachreview.xpath('.//*[@class="user_age_group"]/text()').extract_first()
            heading = eachreview.xpath('.//*[@class="review_item_header_content\n"]/*[@itemprop="name"]/text()').extract_first()
            neg_rev = eachreview.xpath('.//p[@class="review_neg "]/*[@itemprop="reviewBody"]/text()').extract_first()
            pos_rev = eachreview.xpath('.//p[@class="review_pos "]/*[@itemprop="reviewBody"]/text()').extract_first()
            tagging = eachreview.xpath('.//ul[@class="review_item_info_tags"]/*[@class="review_info_tag "]/text()').extract()
            stayedin = eachreview.xpath('.//p[@class="review_staydate "]/text()').extract_first()
            givenscore = eachreview.xpath('.//span[@class="review-score-badge"]/text()').extract_first()


            l = ItemLoader(item=BookingSpiderItem(), selector=response)
            l.add_value('HotelName',hotel_name)
            #l.add_value('ImageUrls',image_urls)
            l.add_value('Facilities',all_facilities)
            l.add_value('MainFacilities',important_facility)
            l.add_value('UserName',username)
            l.add_value('UserCountry',usercountry)
            l.add_value('NumReviewGiven',numreviewgiven)
            l.add_value('UserAgeGroup',useragegroup)
            l.add_value('Heading',heading)
            l.add_value('NegativeReview',neg_rev)
            l.add_value('PositiveReview',pos_rev)
            l.add_value('SelfTag',tagging)
            l.add_value('StayDate',stayedin)
            l.add_value('GivenScore',givenscore)
            yield l.load_item()
        next_page = response.xpath('.//*[@class="page_link review_next_page"]/a/@href').extract_first()
        if next_page is not None:
            next_page_url = response.urljoin(next_page)
            yield scrapy.Request(next_page_url, callback=self.parse_review)
python scrapy
1个回答
0
投票

[嗨@Argha,我查看了您的代码,想知道您是否最近运行它。 Booking.com现在正在使用一个弹出窗口列出所有评论,然后我

© www.soinside.com 2019 - 2024. All rights reserved.