import scrapy
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from scrapy.selector import Selector
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
class YapoSpider(scrapy.Spider):
    """Scrapy spider that uses a shared headless-Chrome Selenium driver to
    render yapo.cl rental listings, since the ad links are inserted
    dynamically and are absent from the raw HTTP response.
    """

    name = 'yapo'
    allowed_domains = ['yapo.cl']
    start_urls = ['https://www.yapo.cl/region-metropolitana/inmuebles/inmuebles/arrendar?tipo-inmueble=departamento,casa&pagina=1']

    def __init__(self):
        # One headless Chrome instance reused for every page this spider renders.
        # NOTE(review): super().__init__() is never called — verify Scrapy's
        # base Spider initialisation is not needed here.
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        self.driver = webdriver.Chrome(options= chrome_options)

    def parse(self, response):
        """Render the listing page in Selenium, scroll down so lazily-loaded
        ad cards appear, then yield a Request per ad link found.
        """
        self.driver.get(response.url)
        # parse speed: pixels advanced per scroll step / delay between steps
        incremento = 50
        velocidad = 0.5
        # scroll height: measured once up front; lazy loading may grow the
        # document afterwards, so this can undershoot — TODO confirm.
        altura_total = self.driver.execute_script("return document.body.scrollHeight")
        for posicion in range(0, altura_total, incremento):
            # scroll one step, then pause to let content load
            self.driver.execute_script(f"window.scrollTo(0, {posicion});")
            time.sleep(velocidad)
        # bottom page: force a final jump to the measured end of the document
        self.driver.execute_script(f"window.scrollTo(0, {altura_total});")
        # Selector Scrapy: parse the Selenium-rendered HTML with Scrapy's selector
        sel = Selector(text=self.driver.page_source)
        for href in sel.xpath("//a[contains(@class,'card inmo subcategory-1240 category-1000 has-cover is-visible')]/@href").extract():
            url = response.urljoin(href)
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        # NOTE(review): this response comes from Scrapy's downloader, not from
        # Selenium, so dynamically-inserted elements (like this Angular
        # `ng-star-inserted` heading) are presumably missing and `title` ends
        # up None — confirm by inspecting response.text.
        title = response.xpath("//h1[@class='my-2 title order-1 ng-star-inserted']/text()").extract_first()
        yield {'title': title}

    def closed(self):
        # Shut the browser down when the spider finishes.
        # NOTE(review): Scrapy invokes spider.closed(reason); this signature
        # takes no extra argument and may raise TypeError — verify.
        self.driver.quit()
我有一个蜘蛛,其中我将 selenium 与 scrapy 结合起来,因为每个广告的 href 都是动态加载的,所以我用 selenium 滚动页面来获取它们。当我进行 scrapy 抓取时,每个广告都返回响应 200,但我没有从广告中获取任何数据,我不知道我做错了什么。我正在测试代码以获取每个广告的标题,但我需要获取广告的所有数据,特别是纬度和经度。
虽然我永远不会建议使用 scrapy 和 selenium,但您只需在
parse_dir_contents
方法中添加几行代码就可以解决您的问题。
您没有获得最终请求的详细信息的原因是:这些响应不是通过 selenium 处理的,因此没有加载动态内容。为了在当前策略下解决此问题,您还需要在
parse_dir_contents
回调方法中调用
self.driver.get(response.url)
,并像在 parse 方法中所做的那样,把页面源码转换为 scrapy 选择器。这样 Selenium 会渲染动态内容,您就可以访问完整的 html。
例如:
import scrapy
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from scrapy.selector import Selector
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
class YapoSpider(scrapy.Spider):
    """Scrapy spider that renders yapo.cl rental listings with a shared
    headless-Chrome Selenium driver, because both the listing cards and the
    ad detail pages are populated dynamically and are missing from the raw
    HTTP responses.
    """

    name = 'yapo'
    allowed_domains = ['yapo.cl']
    start_urls = ['https://www.yapo.cl/region-metropolitana/inmuebles/inmuebles/arrendar?tipo-inmueble=departamento,casa&pagina=1']

    def __init__(self, *args, **kwargs):
        # Run Scrapy's base Spider initialisation and forward any spider
        # arguments (e.g. from `scrapy crawl -a ...`); the original skipped
        # super().__init__(), which Spider subclasses are expected to call.
        super().__init__(*args, **kwargs)
        # One headless Chrome instance reused for every rendered page.
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        self.driver = webdriver.Chrome(options=chrome_options)

    def parse(self, response):
        """Render the listing page in Selenium, scroll to the bottom so the
        lazily-loaded ad cards appear, then yield a Request per ad link.
        """
        self.driver.get(response.url)
        incremento = 50    # pixels advanced per scroll step
        velocidad = 0.5    # seconds to wait after each step
        posicion = 0
        altura_total = self.driver.execute_script("return document.body.scrollHeight")
        while posicion < altura_total:
            self.driver.execute_script(f"window.scrollTo(0, {posicion});")
            time.sleep(velocidad)
            posicion += incremento
            # Lazy loading can grow the document while we scroll, so refresh
            # the target height instead of trusting the initial measurement.
            altura_total = self.driver.execute_script("return document.body.scrollHeight")
        # Final jump to the true bottom of the (now fully grown) page.
        self.driver.execute_script(f"window.scrollTo(0, {altura_total});")
        # Parse the Selenium-rendered HTML with Scrapy's selector.
        sel = Selector(text=self.driver.page_source)
        for href in sel.xpath("//a[contains(@class,'card inmo subcategory-1240 category-1000 has-cover is-visible')]/@href").extract():
            url = response.urljoin(href)
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        """Re-render the ad page in Selenium — the Scrapy response alone
        lacks the dynamic content — and extract the ad title.
        """
        self.driver.get(response.url)
        selector = Selector(text=self.driver.page_source)
        title = selector.xpath("//h1[@class='my-2 title order-1 ng-star-inserted']/text()").extract_first()
        yield {'title': title}

    def closed(self, reason=None):
        """Quit the shared browser when the spider closes.

        Scrapy calls ``spider.closed(reason)``; the original zero-argument
        signature would raise TypeError at shutdown and leak the Chrome
        process. ``reason`` defaults to None so direct calls still work.
        """
        self.driver.quit()