从昨天开始,我遇到了一个问题:我的 Facebook Marketplace 抓取工具不再获取任何数据。由于 Scrapy 的功能,我目前正在使用它来抓取,是我哪里写错了吗?输出已在我的 gist 中分享。
下面是当前代码:
from scrapy import Spider
import logging
class Facebook(Spider):
    """Spider that scrapes Facebook Marketplace search results from HTML.

    Product data is read from the page markup using an XPath tied to the
    results container's inline style.

    NOTE(review): the listings are reported to be delivered as JSON inside a
    script tag rather than as markup, in which case the XPath below matches
    nothing and no items are produced -- confirm against a live response.
    """

    name = 'facebook'
    start_urls = ["https://www.facebook.com/marketplace/112047398814697/search?query=funko&sortBy=creation_time_descend&radius=500"]

    def parse(self, response):
        """Yield one item dict per product element found in ``response``.

        Each item has the keys ``name``, ``image``, ``price`` and ``url``.
        A field is ``None`` when the matching element or attribute is absent,
        so one malformed card does not abort the whole crawl.
        """
        # get all HTML product elements
        products = response.xpath('//div[@style="max-width:1872px"]/div[2]/div')
        if not products:
            # Make the "no data" case visible instead of finishing silently.
            logging.warning("No product elements matched on %s", response.url)
            return

        # iterate over the list of products
        for product in products:
            span_texts = product.css('span::text').getall()
            yield {
                "name": product.css("h2::text").get(),
                # .attrib is an empty dict when nothing matched, so use .get()
                "image": product.css("img").attrib.get("src"),
                # the price is taken from the third span text of the card
                "price": span_texts[2] if len(span_texts) > 2 else None,
                "url": product.css("a").attrib.get("href"),
            }
我已经使用 selenium 和 requests-html 进行了测试,但它们并不像预期的那样工作。
数据现在以 JSON 形式存在于脚本标签中,这就是您无法提取任何产品详细信息的原因。您需要获取 JSON 字符串,然后将其转换为字典以从中访问必要的详细信息。您可以在下面找到代码片段。
from scrapy import Spider
import logging
import json
class Facebook(Spider):
    """Spider that scrapes Facebook Marketplace search results.

    The listings are read from a JSON payload embedded in a script tag of
    the page rather than from rendered HTML elements.
    """

    name = 'facebook'
    start_urls = ["https://www.facebook.com/marketplace/112047398814697/search?query=funko&sortBy=creation_time_descend&radius=500"]

    def parse(self, response):
        """Yield one item dict per marketplace listing in ``response``.

        Each item has the keys ``name``, ``image``, ``price`` and ``url``.
        If the embedded JSON is missing, cannot be decoded, or does not have
        the expected nesting, a warning is logged and nothing is yielded.
        Feed entries without a ``listing`` are skipped; missing photo or
        price details yield ``None`` for that field.
        """
        # Get json string from the HTML
        nodes = response.xpath("//script[contains(text(),'MarketplaceFeedListingStoryObject')]/text()")
        if not nodes:
            logging.warning("No marketplace JSON script found on %s", response.url)
            return

        try:
            data_dict = json.loads(nodes[0].get())
        except ValueError as exc:  # json.JSONDecodeError subclasses ValueError
            logging.warning("Could not decode marketplace JSON on %s: %r", response.url, exc)
            return

        try:
            # The path mirrors the nesting of the embedded payload; any change
            # in that layout surfaces here as a lookup error.
            products = data_dict['require'][0][3][0]['__bbox']['require'][0][3][1]['__bbox']['result']['data']['marketplace_search']['feed_units']['edges']
        except (KeyError, IndexError, TypeError) as exc:
            logging.warning("Unexpected marketplace JSON layout on %s: %r", response.url, exc)
            return

        # iterate over the list of products
        for product in products:
            node = product.get('node') or {}
            listing = node.get('listing')
            if not listing:
                # Edge without listing data; nothing to extract from it.
                continue

            photo = (listing.get('primary_listing_photo') or {}).get('image') or {}
            price = (listing.get('listing_price') or {}).get('formatted_amount')
            prod_id = node.get('story_key')
            url = f"https://www.facebook.com/marketplace/item/{prod_id}" if prod_id else None

            yield {
                "name": listing.get('marketplace_listing_title'),
                "image": photo.get('uri'),
                "price": price,
                "url" : url
            }
输出(前5条记录的样本)
[{"name": "Personagens, pel\u00facias lindas e de qualidade da Black", "image": "https://scontent.fhyd14-2.fna.fbcdn.net/v/t45.5328-4/429928588_24793716480274856_8546570146970955724_n.jpg?stp=c0.43.261.261a_dst-jpg_p261x260&_nc_cat=108&ccb=1-7&_nc_sid=247b10&_nc_ohc=ceFbab6P7S0AX8eOVPa&_nc_ht=scontent.fhyd14-2.fna&oh=00_AfAZIwkb0bEG3bv3TaxDT54uoZdCfYjMVRRR7TEizsZLTw&oe=65EB7194", "price": "R$45", "url": "https://www.facebook.com/marketplace/item/24969789585998853"},
{"name": "Funko Harley Quinn", "image": "https://scontent.fhyd14-1.fna.fbcdn.net/v/t45.5328-4/429821610_7200991503350116_4095918823174063593_n.jpg?stp=c0.43.261.261a_dst-jpg_p261x260&_nc_cat=102&ccb=1-7&_nc_sid=247b10&_nc_ohc=mlyHaWXBChAAX8C6U9Q&_nc_ht=scontent.fhyd14-1.fna&oh=00_AfAv9C9spT7Wwt7VkdvbdW0nVVleLLOF508EXLwmqBcVHg&oe=65EA148C", "price": "R$150", "url": "https://www.facebook.com/marketplace/item/7226874327390390"},
{"name": "Funko Homem Aranha", "image": "https://scontent.fhyd14-2.fna.fbcdn.net/v/t45.5328-4/430690561_7027936467316381_2198370265159815215_n.jpg?stp=c0.7.261.261a_dst-jpg_p261x260&_nc_cat=108&ccb=1-7&_nc_sid=247b10&_nc_ohc=rNzEmNcU4j8AX9C_vAG&_nc_ht=scontent.fhyd14-2.fna&oh=00_AfCNX6a4MpResENwtLkRB54cQoruAE1nelGVC0QcHzdpFw&oe=65EACB55", "price": "R$120", "url": "https://www.facebook.com/marketplace/item/7181301185323208"},
{"name": "Pop! Funko Television The Green Hornet and Kato (2029 Fall Convention - Limited edition)", "image": "https://scontent.fhyd14-1.fna.fbcdn.net/v/t45.5328-4/425299286_7285507438200685_1213036715024683971_n.jpg?stp=c43.0.260.260a_dst-jpg_p261x260&_nc_cat=110&ccb=1-7&_nc_sid=247b10&_nc_ohc=TK7r0aRoKaIAX88Gu2D&_nc_ht=scontent.fhyd14-1.fna&oh=00_AfArqT76xeIJrVTqF_bU60oZYfeNfVZ22NNI9QhYmPB_WQ&oe=65E9D22E", "price": "R$170", "url": "https://www.facebook.com/marketplace/item/7513165018740640"},
{"name": "DESAPEGO DE FUNKO POP LOOSES", "image": "https://scontent.fhyd14-1.fna.fbcdn.net/v/t45.5328-4/428183170_7061261334001976_307640594589911567_n.jpg?stp=c0.43.261.261a_dst-jpg_p261x260&_nc_cat=110&ccb=1-7&_nc_sid=247b10&_nc_ohc=Nj2toiYU698AX8SIDFj&_nc_ht=scontent.fhyd14-1.fna&oh=00_AfDdKvO0WkpdvgS3_Z1Okt6kk95N4cnKD8Imww-ZjxY6uw&oe=65EB8BC4", "price": "$150", "url": "https://www.facebook.com/marketplace/item/7455762907800771"}]