我需要有关如何在 Python 中使用 Scrapy 从以下页面提取数据的帮助
https://fincaraiz.com.co/apartamentos/arriendos?ubicacion=cali
我需要提取每个项目的链接。例如,将鼠标悬停在第一个项目的照片上时,会显示它的详情链接:
https://fincaraiz.com.co/inmueble/apartamento-en-arriendo/florida-blanca/bogota/6738284
这个页面是动态加载内容的,所以当我用 Scrapy 发出请求时,得到的响应里只有 HTML、CSS、JavaScript 等内容,
并不包含公寓本身的数据。
所以我无法应用 XPath,因为响应不包含数据,而是动态加载的。
如何在不使用 Selenium、Scrapy Splash 或其他外部库的情况下抓取它?
之前的答案需要费力地为您使用的每个不同网址手动收集对应的 JSON。我找到了一个更简单的解决方案,它允许您传入多个 start_urls,并根据所需的数据整理 JSON。
我集成了 scrapy_playwright 而不是 scrapy_splash,因为 splash 只会返回空白页面,不知为何始终无法加载——无论我把等待时间设置得多长都一样。
Playwright 需要相当长的时间来加载单个页面,但 scrapy 速度很快,因为它是异步的,因此您可以快速检索页面中的所有 json。
这是脚本:
import scrapy
from scrapy_playwright.page import PageCoroutine
from scrapy import http
# Browser-like request headers attached to the per-listing JSON requests.
headers = {
    # Client identification and content negotiation.
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:97.0) Gecko/20100101 Firefox/97.0",
    "Accept": "*/*",
    "Accept-Language": "en-GB,en;q=0.5",
    # NOTE(review): advertising "br" presumably requires brotli decoding
    # support on the client side - confirm it is installed.
    "Accept-Encoding": "gzip, deflate, br",
    # Page the request claims to originate from.
    "Referer": "https://fincaraiz.com.co/finca-raiz/venta/apartamento-en-arriendo/florida-blanca/bogota?pagina=1",
    "Connection": "keep-alive",
    # Fetch metadata describing a same-origin CORS request.
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "TE": "trailers",
}
class ListingsSpider(scrapy.Spider):
    """Collect listing ids from rendered result pages, then fetch each listing's JSON.

    Each result page is requested through scrapy-playwright so the
    JavaScript-generated listing cards exist in the response.  The trailing
    path segment of each card's first link is used as the listing id, and one
    JSON request per id is sent to ``parse_listing``.
    """

    name = 'listings'

    # Result pages 1..40.  The earlier `range(1, 40)` combined with
    # `pages + 1` produced pages 2..40 and silently skipped the first page.
    start_urls = [
        f'https://fincaraiz.com.co/finca-raiz/venta/apartamento-en-arriendo/florida-blanca/bogota?pagina={page}'
        for page in range(1, 41)
    ]

    def start_requests(self):
        """Yield one Playwright-rendered request per URL in ``start_urls``."""
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                meta=dict(
                    playwright=True,
                    # NOTE(review): newer scrapy-playwright releases appear to
                    # name these `playwright_page_methods` / `PageMethod` -
                    # confirm against the installed version.
                    playwright_page_coroutines=[
                        # Screenshot to check that you're on the page; every
                        # request writes to the same "image.png" path.
                        PageCoroutine("screenshot", path="image.png", full_page=True),
                        # Wait 30000 ms so the dynamic content can load.
                        PageCoroutine("wait_for_timeout", 30000),
                    ],
                ),
            )

    def parse(self, response):
        """Extract listing ids from a result page and request each listing's JSON.

        The href of the first link inside each child ``div`` of
        ``#listingContainer`` is split on ``/``; its last segment is the id
        used to build the JSON URL.
        """
        cards = response.xpath("//div[@id='listingContainer']/div")
        for card in cards:
            href = card.xpath(".//a//@href").get()
            if href is None:
                # Card without a link: nothing to extract.  This is the only
                # failure the former bare `except:` was guarding against.
                continue
            listing_id = href.split("/")[-1]
            # Send the JSON request to another parser.
            # NOTE(review): the `build` segment and the project slug in this
            # path are hard-coded; presumably the endpoint resolves on the
            # trailing id alone - confirm.
            yield http.JsonRequest(
                url=f'https://fincaraiz.com.co/_next/data/build/es/proyecto-de-vivienda/mirador-del-puerto/puerto-colombia/puerto-colombia/{listing_id}.json',
                callback=self.parse_listing,
                headers=headers,
            )

    def parse_listing(self, response):
        """Yield the decoded JSON body of a listing response under the key ``'json'``."""
        yield {
            'json': response.json(),
        }
输出:
{'json': {'pageProps': {'additional': {'benefit': None, 'deliveryTime': None, 'downPayment': None, 'finishes': None, 'schedule': None, 'financing': None, 'units': None, 'tourUrl': None}, 'capacity': {'name': 'Sin especificar', 'id': 0, 'slug': 'UNDEFINED'}, 'dates': {'renovated': '2022-02-19T20:01:47.783000+00:00', 'deleted': '0001-01-01', 'expired': '2022-05-20T21:16:31.453000+00:00', 'moderated': '2021-10-09T23:25:09+00:00', 'created': '2021-10-09T23:24:31.770000+00:00', 'published': '2022-02-19T21:20:44.207000+00:00', 'updated': '2022-02-19T21:20:44.207000+00:00'}, 'description': 'Espectacular casa Esquinera para uso residencial o comercial, 1 piso:Sala,Comedor,Estudio,Cocina,Deposito,Baño y garaje 2 carros, 2Piso:5 habiraciones y 2 baños, 3Piso: Zona lavadero,lavadoras,1 habiracion y cocina con terraza (El apto del 3 piso esta en obra gris), a 2 cuadras de plaza del quirigua y 20 min portal 80, alimentadores y servicio publico a 1 cuadra, en frente posee parqueadero y zona verde.', ... ...
...
...
...
当您在网站加载时检查网络面板时,您可以找到它用于动态加载内容的API调用。然后,您只需复制curl请求并将其转换为python即可复制api调用:
import requests
# Headers sent with the listing-search API call.
headers = {
    'authority': 'api.fincaraiz.com.co',
    'accept': '*/*',
    'origin': 'https://fincaraiz.com.co',
    'referer': 'https://fincaraiz.com.co/',
}

# Search payload: apartments for rent, restricted by the 'cali' location path.
json_data = {
    'filter': {
        'offer': {
            'slug': [
                'rent',
            ],
        },
        'property_type': {
            'slug': [
                'apartment',
            ],
        },
        'location_path': 'cali',
    },
    'fields': {
        'exclude': [],
        'facets': [
            'rooms.slug',
            'baths.slug',
            'locations.countries.slug',
            'locations.states.slug',
            'locations.cities.slug',
            'locations.neighbourhoods.slug',
            'locations.groups.slug',
            'locations.groups.subgroups.slug',
            'offer.slug',
            'property_type.slug',
            'categories.slug',
            'stratum.slug',
            'age.slug',
            'media.floor_plans.with_content',
            'media.photos.with_content',
            'media.videos.with_content',
            'products.slug',
            'is_new',
        ],
        'include': [
            'area',
            'baths.id',
            'baths.name',
            'baths.slug',
            'client.client_type',
            'client.company_name',
            'client.first_name',
            'client.fr_client_id',
            'client.last_name',
            'client.logo.full_size',
            'garages.name',
            'is_new',
            'locations.cities.fr_place_id',
            'locations.cities.name',
            'locations.cities.slug',
            'locations.countries.fr_place_id',
            'locations.countries.name',
            'locations.countries.slug',
            'locations.groups.name',
            'locations.groups.slug',
            'locations.groups.subgroups.name',
            'locations.groups.subgroups.slug',
            'locations.neighbourhoods.fr_place_id',
            'locations.neighbourhoods.name',
            'locations.neighbourhoods.slug',
            'locations.states.fr_place_id',
            'locations.states.name',
            'locations.states.slug',
            'locations.location_point',
            'max_area',
            'max_price',
            'media.photos.list.image.full_size',
            'media.photos.list.is_main',
            'media.videos.list.is_main',
            'media.videos.list.video',
            'media.logo.full_size',
            'min_area',
            'min_price',
            'offer.name',
            'price',
            'products.configuration.tag_id',
            'products.configuration.tag_name',
            'products.label',
            'products.name',
            'products.slug',
            'property_id',
            'property_type.name',
            'fr_property_id',
            'fr_parent_property_id',
            'rooms.id',
            'rooms.name',
            'rooms.slug',
            'stratum.name',
            'title',
        ],
        'limit': 25,
        'offset': 0,  # set to 25 to get the second page, 50 for the 3rd page etc.
        'ordering': [],
        'platform': 41,
        'with_algorithm': False,
    },
}

# requests never times out by default, so an explicit timeout keeps the
# script from hanging indefinitely if the API does not answer.
response = requests.post(
    'https://api.fincaraiz.com.co/document/api/1.0/listing/search',
    headers=headers,
    json=json_data,
    timeout=30,
)
# Fail loudly on a 4xx/5xx status instead of trying to decode an error body.
response.raise_for_status()
data = response.json()
对于第二个网址(https://fincaraiz.com.co/apartamentos/arriendos/florida-blanca/zona-occidente/bogota?pagina=1):
import requests
# Headers sent with the listing-search API call.
headers = {
    'authority': 'api.fincaraiz.com.co',
    'accept': '*/*',
    'origin': 'https://fincaraiz.com.co',
    'referer': 'https://fincaraiz.com.co/',
}

# Search payload: apartments for rent, restricted to one neighbourhood slug.
json_data = {
    'filter': {
        'offer': {
            'slug': [
                'rent',
            ],
        },
        'property_type': {
            'slug': [
                'apartment',
            ],
        },
        'locations': {
            'neighbourhoods': {
                'slug': [
                    'colombia-cundinamarca-bogot\xE1-3632371-florida-blanca',
                ],
            },
        },
    },
    'fields': {
        'exclude': [],
        'facets': [
            'rooms.slug',
            'baths.slug',
            'locations.countries.slug',
            'locations.states.slug',
            'locations.cities.slug',
            'locations.neighbourhoods.slug',
            'locations.groups.slug',
            'locations.groups.subgroups.slug',
            'offer.slug',
            'property_type.slug',
            'categories.slug',
            'stratum.slug',
            'age.slug',
            'media.floor_plans.with_content',
            'media.photos.with_content',
            'media.videos.with_content',
            'products.slug',
            'is_new',
        ],
        'include': [
            'area',
            'baths.id',
            'baths.name',
            'baths.slug',
            'client.client_type',
            'client.company_name',
            'client.first_name',
            'client.fr_client_id',
            'client.last_name',
            'client.logo.full_size',
            'garages.name',
            'is_new',
            'locations.cities.fr_place_id',
            'locations.cities.name',
            'locations.cities.slug',
            'locations.countries.fr_place_id',
            'locations.countries.name',
            'locations.countries.slug',
            'locations.groups.name',
            'locations.groups.slug',
            'locations.groups.subgroups.name',
            'locations.groups.subgroups.slug',
            'locations.neighbourhoods.fr_place_id',
            'locations.neighbourhoods.name',
            'locations.neighbourhoods.slug',
            'locations.states.fr_place_id',
            'locations.states.name',
            'locations.states.slug',
            'locations.location_point',
            'max_area',
            'max_price',
            'media.photos.list.image.full_size',
            'media.photos.list.is_main',
            'media.videos.list.is_main',
            'media.videos.list.video',
            'media.logo.full_size',
            'min_area',
            'min_price',
            'offer.name',
            'price',
            'products.configuration.tag_id',
            'products.configuration.tag_name',
            'products.label',
            'products.name',
            'products.slug',
            'property_id',
            'property_type.name',
            'fr_property_id',
            'fr_parent_property_id',
            'rooms.id',
            'rooms.name',
            'rooms.slug',
            'stratum.name',
            'title',
        ],
        'limit': 25,
        'offset': 0,
        'ordering': [],
        'platform': 41,
        'with_algorithm': True,
    },
}

# requests never times out by default, so an explicit timeout keeps the
# script from hanging indefinitely if the API does not answer.
response = requests.post(
    'https://api.fincaraiz.com.co/document/api/1.0/listing/search',
    headers=headers,
    json=json_data,
    timeout=30,
)
# Fail loudly on a 4xx/5xx status instead of trying to decode an error body.
response.raise_for_status()
data = response.json()
# The matching listings sit under hits -> hits; bind them to a name so the
# lookup is not a discarded expression when this runs as a script.
listings = data['hits']['hits']
data['hits']['hits'] 中共有 11 条房源记录。
这个脚本还在吗?可以分享给我吗?先谢谢了!