import scrapy
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from scrapy.selector import Selector
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
class YapoSpider(scrapy.Spider):
    """Scrapy spider that uses a shared headless-Chrome Selenium driver to
    render yapo.cl rental listings, since the ad links are inserted
    dynamically and are absent from the raw HTTP response.
    """

    name = 'yapo'
    allowed_domains = ['yapo.cl']
    start_urls = ['https://www.yapo.cl/region-metropolitana/inmuebles/inmuebles/arrendar?tipo-inmueble=departamento,casa&pagina=1']

    def __init__(self):
        # One headless Chrome instance reused for every page this spider renders.
        # NOTE(review): super().__init__() is never called — verify Scrapy's
        # base Spider initialisation is not needed here.
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        self.driver = webdriver.Chrome(options= chrome_options)

    def parse(self, response):
        """Render the listing page in Selenium, scroll down so lazily-loaded
        ad cards appear, then yield a Request per ad link found.
        """
        self.driver.get(response.url)
        # parse speed: pixels advanced per scroll step / delay between steps
        incremento = 50
        velocidad = 0.5
        # scroll height: measured once up front; lazy loading may grow the
        # document afterwards, so this can undershoot — TODO confirm.
        altura_total = self.driver.execute_script("return document.body.scrollHeight")
        for posicion in range(0, altura_total, incremento):
            # scroll one step, then pause to let content load
            self.driver.execute_script(f"window.scrollTo(0, {posicion});")
            time.sleep(velocidad)
        # bottom page: force a final jump to the measured end of the document
        self.driver.execute_script(f"window.scrollTo(0, {altura_total});")
        # Selector Scrapy: parse the Selenium-rendered HTML with Scrapy's selector
        sel = Selector(text=self.driver.page_source)
        for href in sel.xpath("//a[contains(@class,'card inmo subcategory-1240 category-1000 has-cover is-visible')]/@href").extract():
            url = response.urljoin(href)
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        # NOTE(review): this response comes from Scrapy's downloader, not from
        # Selenium, so dynamically-inserted elements (like this Angular
        # `ng-star-inserted` heading) are presumably missing and `title` ends
        # up None — confirm by inspecting response.text.
        title = response.xpath("//h1[@class='my-2 title order-1 ng-star-inserted']/text()").extract_first()
        yield {'title': title}

    def closed(self):
        # Shut the browser down when the spider finishes.
        # NOTE(review): Scrapy invokes spider.closed(reason); this signature
        # takes no extra argument and may raise TypeError — verify.
        self.driver.quit()
我有一个蜘蛛,其中我将 selenium 与 scrapy 结合起来,因为每个广告的 href 都是动态加载的,所以我用 selenium 滚动页面来获取它们。当我进行 scrapy 抓取时,每个广告都返回响应 200,但我没有从广告中获取任何数据,我不知道我做错了什么。我正在测试代码以获取每个广告的标题,但我需要获取广告的所有数据,特别是纬度和经度。
虽然我永远不会建议使用 scrapy 和 selenium,但您只需在
parse_dir_contents
方法中添加几行代码就可以解决您的问题。
您没有获得最终请求的详细信息的原因是:这些响应不是通过 selenium 处理的,因此没有加载动态内容。为了在当前策略下解决此问题,您还需要在
parse_dir_contents
回调方法中调用
self.driver.get(response.url)
,并像在 parse 方法中所做的那样,把页面源码转换为 scrapy 选择器。这样 Selenium 会渲染动态内容,您就可以访问完整的 html。
例如:
import scrapy
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from scrapy.selector import Selector
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
class YapoSpider(scrapy.Spider):
    """Scrapy spider that renders yapo.cl rental listings with a shared
    headless-Chrome Selenium driver, because both the listing cards and the
    ad detail pages are populated dynamically and are missing from the raw
    HTTP responses.
    """

    name = 'yapo'
    allowed_domains = ['yapo.cl']
    start_urls = ['https://www.yapo.cl/region-metropolitana/inmuebles/inmuebles/arrendar?tipo-inmueble=departamento,casa&pagina=1']

    def __init__(self, *args, **kwargs):
        # Run Scrapy's base Spider initialisation and forward any spider
        # arguments (e.g. from `scrapy crawl -a ...`); the original skipped
        # super().__init__(), which Spider subclasses are expected to call.
        super().__init__(*args, **kwargs)
        # One headless Chrome instance reused for every rendered page.
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        self.driver = webdriver.Chrome(options=chrome_options)

    def parse(self, response):
        """Render the listing page in Selenium, scroll to the bottom so the
        lazily-loaded ad cards appear, then yield a Request per ad link.
        """
        self.driver.get(response.url)
        incremento = 50    # pixels advanced per scroll step
        velocidad = 0.5    # seconds to wait after each step
        posicion = 0
        altura_total = self.driver.execute_script("return document.body.scrollHeight")
        while posicion < altura_total:
            self.driver.execute_script(f"window.scrollTo(0, {posicion});")
            time.sleep(velocidad)
            posicion += incremento
            # Lazy loading can grow the document while we scroll, so refresh
            # the target height instead of trusting the initial measurement.
            altura_total = self.driver.execute_script("return document.body.scrollHeight")
        # Final jump to the true bottom of the (now fully grown) page.
        self.driver.execute_script(f"window.scrollTo(0, {altura_total});")
        # Parse the Selenium-rendered HTML with Scrapy's selector.
        sel = Selector(text=self.driver.page_source)
        for href in sel.xpath("//a[contains(@class,'card inmo subcategory-1240 category-1000 has-cover is-visible')]/@href").extract():
            url = response.urljoin(href)
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        """Re-render the ad page in Selenium — the Scrapy response alone
        lacks the dynamic content — and extract the ad title.
        """
        self.driver.get(response.url)
        selector = Selector(text=self.driver.page_source)
        title = selector.xpath("//h1[@class='my-2 title order-1 ng-star-inserted']/text()").extract_first()
        yield {'title': title}

    def closed(self, reason=None):
        """Quit the shared browser when the spider closes.

        Scrapy calls ``spider.closed(reason)``; the original zero-argument
        signature would raise TypeError at shutdown and leak the Chrome
        process. ``reason`` defaults to None so direct calls still work.
        """
        self.driver.quit()