这是一个简单的 Scrapy 爬虫,任何人都可以拿来测试。
from scrapy.utils.response import open_in_browser
import scrapy
import json
class TestSpider(scrapy.Spider):
    """Minimal spider that requests one Shopee product-detail API URL.

    A single request is sent to the ``/api/v4/pdp/get_pc`` endpoint with the
    cookies stored in ``shopee_cookies`` and two extra request headers; the
    response is then opened in the local web browser for inspection.
    """

    name = "test-spider"
    allowed_domains = ["shopee.ph"]

    # Cookie jar as a JSON array of cookie objects, parsed once at class
    # creation into a list of dicts. The literal is triple-quoted so it can
    # span several lines (one cookie per line); the added whitespace sits
    # between JSON tokens and does not change the parsed value.
    # NOTE(review): these look like real session cookies hard-coded in the
    # source — presumably they should be loaded from outside the code; confirm.
    shopee_cookies = json.loads(
        '''[
        {"name": "csrftoken", "value": "RvxBdTixvBfdTR3xfQwbcYippqz8jEbF", "domain": "shopee.ph", "path": "/", "expires": -1, "httpOnly": false, "secure": false, "sameSite": "Lax"},
        {"name": "_gcl_au", "value": "1.1.1251411089.1692464842", "domain": ".shopee.ph", "path": "/", "expires": 1700240842, "httpOnly": false, "secure": false, "sameSite": "Lax"},
        {"name": "SPC_SI", "value": "sTLbZAAAAABwY1ZrR1NNU+WdNgAAAAAAdzlCYXIyVVQ=", "domain": ".shopee.ph", "path": "/", "expires": 1692551246.336331, "httpOnly": true, "secure": true, "sameSite": "Lax"},
        {"name": "_fbp", "value": "fb.1.1692464842990.689078803", "domain": ".shopee.ph", "path": "/", "expires": 1700240846, "httpOnly": false, "secure": false, "sameSite": "Lax"},
        {"name": "SPC_R_T_IV", "value": "NnVEbThnRjREMnNMZVpGVQ==", "domain": ".shopee.ph", "path": "/", "expires": 1727024846.336348, "httpOnly": false, "secure": true, "sameSite": "Lax"},
        {"name": "SPC_T_ID", "value": "fn/OKngQO3doGdfFGyo/6mzLiviELHkKEbWM9J+x/ezTl/baT96grQer6ILrYX9tj3Kqs71Jg+hCimaK/XauidJXrd6HdPd2Smbxbu/fEStjOJi5g9/ucMmbBwuyh5M6H3TOGdpUop/9Q/zdpNj6MyxZaODnNsT5XprfsQxjB5g=", "domain": ".shopee.ph", "path": "/", "expires": 1727024846.336355, "httpOnly": true, "secure": true, "sameSite": "Lax"},
        {"name": "SPC_T_IV", "value": "NnVEbThnRjREMnNMZVpGVQ==", "domain": ".shopee.ph", "path": "/", "expires": 1727024846.336362, "httpOnly": true, "secure": true, "sameSite": "Lax"},
        {"name": "SPC_F", "value": "jiOtuCSNUaap3U4BHHfzhDihWwFht32f", "domain": ".shopee.ph", "path": "/", "expires": 1727024843.162052, "httpOnly": false, "secure": true, "sameSite": "Lax"},
        {"name": "REC_T_ID", "value": "dc8a2570-3eb2-11ee-ac9b-2cea7fce6c95", "domain": ".shopee.ph", "path": "/", "expires": 1727024843.16206, "httpOnly": true, "secure": true, "sameSite": "Lax"},
        {"name": "SPC_R_T_ID", "value": "fn/OKngQO3doGdfFGyo/6mzLiviELHkKEbWM9J+x/ezTl/baT96grQer6ILrYX9tj3Kqs71Jg+hCimaK/XauidJXrd6HdPd2Smbxbu/fEStjOJi5g9/ucMmbBwuyh5M6H3TOGdpUop/9Q/zdpNj6MyxZaODnNsT5XprfsQxjB5g=", "domain": ".shopee.ph", "path": "/", "expires": 1727024846.33634, "httpOnly": false, "secure": true, "sameSite": "Lax"},
        {"name": "_QPWSDCXHZQA", "value": "4a585493-a7a0-4f0e-d696-687295d3a4c3", "domain": "shopee.ph", "path": "/", "expires": 1692496379, "httpOnly": false, "secure": false, "sameSite": "Lax"},
        {"name": "IDE", "value": "AHWqTUm1b5ZflCqDTn6cpHDjyoeqH6iLfXcCOOm4YNaP8CHTsAZ7F_Daq4-zO-bsGIk", "domain": ".doubleclick.net", "path": "/", "expires": 1727024843.787698, "httpOnly": true, "secure": true, "sameSite": "None"},
        {"name": "AMP_TOKEN", "value": "%24NOT_FOUND", "domain": ".shopee.ph", "path": "/", "expires": 1692468444, "httpOnly": false, "secure": false, "sameSite": "Lax"},
        {"name": "_ga", "value": "GA1.2.833255521.1692464843", "domain": ".shopee.ph", "path": "/", "expires": 1727024844.498551, "httpOnly": false, "secure": false, "sameSite": "Lax"},
        {"name": "_gid", "value": "GA1.2.1347861977.1692464844", "domain": ".shopee.ph", "path": "/", "expires": 1692551244, "httpOnly": false, "secure": false, "sameSite": "Lax"},
        {"name": "_dc_gtm_UA-61918643-6", "value": "1", "domain": ".shopee.ph", "path": "/", "expires": 1692464904, "httpOnly": false, "secure": false, "sameSite": "Lax"},
        {"name": "shopee_webUnique_ccd", "value": "raj%2F3ukNopIWTrFjVLQeGA%3D%3D%7C1%2BjiV3ga9OlzuAELTZtedUY5BlP1ZNVH5ybZJx2D4KNA9dGTvtFakjnNZvR64zKNG6yBDfEXdabTE%2FRKow%3D%3D%7CsWIQ7u7pR4F3BD7E%7C08%7C3", "domain": "shopee.ph", "path": "/", "expires": 1692496381, "httpOnly": false, "secure": false, "sameSite": "Lax"},
        {"name": "ds", "value": "065598fda3b7cca4e5e241e446a075e9", "domain": "shopee.ph", "path": "/", "expires": 1692496381, "httpOnly": false, "secure": false, "sameSite": "Lax"},
        {"name": "SPC_EC", "value": "RTJYa2Q5WEV4UDNnN3VGWr68rFv1FRJEeVkpwAzlu09WhtwSxFE1cZlwpQYRhhR56REixPuKfekz6oioE4EaDK12bvALil+QZ5B0EfG42psIFWNDe1moiErTZndyu1502KUlh5+OQoUWCvm1XkVY+2Iy7Jk5qyPI2J655JeZwv0=", "domain": ".shopee.ph", "path": "/", "expires": 1727024846.336291, "httpOnly": true, "secure": true, "sameSite": "Lax"},
        {"name": "SPC_ST", "value": ".ek1DVmo5aGJjaVBxcklYU5o4/3v/8ndPeV2/fwtzWYUh1kWOopWvn7SFoQXWuS37Rs+J+Ym7U8OwOG73JbiFRWyOOo1GhKBgwhUeeWfE+q9XPDZXACC33t7qphoBu5hyWvR/G+WkpSUbIkmGPzprCIvhw7Qwyt8UFxk/4bA+47QQQUiDcPfHIq/sJqmVMEqH3Al6nCTDeEh/JCDLALRvNQ==", "domain": ".shopee.ph", "path": "/", "expires": 1727024846.336324, "httpOnly": true, "secure": true, "sameSite": "Lax"},
        {"name": "SPC_CLIENTID", "value": "amlPdHVDU05VYWFwgvlavxoisbqjmacw", "domain": ".shopee.ph", "path": "/", "expires": 1727024846.336374, "httpOnly": false, "secure": false, "sameSite": "Lax"},
        {"name": "_ga_CB0044GVTM", "value": "GS1.1.1692464843.1.0.1692464846.57.0.0", "domain": ".shopee.ph", "path": "/", "expires": 1727024846.367333, "httpOnly": false, "secure": false, "sameSite": "Lax"}
        ]'''
    )

    def start_requests(self):
        """Yield the single API request, carrying the stored cookies and headers."""
        yield scrapy.Request(
            "https://shopee.ph/api/v4/pdp/get_pc?shop_id=237078553&item_id=6929743700",
            cookies=self.shopee_cookies,
            headers={"x-api-source":"pc","af-ac-enc-dat":"null"},
            callback=self.parse_item,
        )

    def parse_item(self, response):
        """Open the received response in the local web browser for inspection."""
        open_in_browser(response)
请随意测试,我已经提供了 cookie(该请求需要 cookie)。如您所见,这段代码在 2023 年 8 月初左右确实可以正常工作。此前我一直无法让它跑通,但多亏了这个答案,我成功获取到了产品数据,您甚至可以在那里看到我的评论。下面是我之前截的图,可以证明它在八月初确实有效。
如您所见,数据就在那里并且运行良好。多亏了请求头
{"x-api-source":"pc","af-ac-enc-dat":"null"}
使其发挥作用。然而,截至我写下这段文字的 2023 年 8 月 20 日,它似乎已经不行了。我不知道为什么,但我认为 API 发生了一些变化。我花了一整天的时间试图弄清楚原因并反复调整请求头,但没有成功。下面就是我现在得到的结果。
我现在的输出:
{"is_customized":false,"is_login":true,"platform":0,"action_type":2,"error":90309999,"tracking_id":"24d95bd5-40e5-44cd-b30b-885711481170","report_extra_info":""}
这是我用于测试的实际产品页面链接。当您打开“检查元素”->“网络”选项卡时,可以在那里看到这个 API。请注意,我现在得到的输出与我当初找到那个解决方案之前得到的输出相同,也就是说问题又回来了。那么问题来了,有没有办法让它再次正常工作呢?我觉得是请求头中的某些内容我没有设置正确,但我不确定该如何解决,所以才在这里寻求帮助。
正如我在另一篇文章的评论中所说,看来他们的反爬虫方法已经改变了。
如果api响应有问题,可以按照以下流程排查问题。
# Template of the API request as a curl command. Every <value> is a
# placeholder to be replaced with the real header value before running.
curl 'https://shopee.ph/api/v4/pdp/get_pc?shop_id=237078553&item_id=6929743700' \
-H '744a5de7:<value>' \
-H 'authority:<value>' \
-H 'accept:<value>' \
-H 'accept-language:<value>' \
-H 'af-ac-enc-dat:<value>' \
-H 'af-ac-enc-sz-token:<value>' \
-H $'afe5f082:<value>' \
-H 'content-type:<value>' \
-H 'cookie:<value>' \
-H 'dd46895e:<value>' \
-H 'referer:<value>' \
-H 'sec-ch-ua:<value>' \
-H 'sec-ch-ua-mobile:<value>' \
-H 'sec-ch-ua-platform:<value>' \
-H 'sec-fetch-dest:<value>' \
-H 'sec-fetch-mode:<value>' \
-H 'sec-fetch-site:<value>' \
-H 'sz-token:<value>' \
-H 'user-agent:<value>' \
-H 'x-api-source:<value>' \
-H 'x-csrftoken:<value>' \
-H 'x-requested-with:<value>' \
-H 'x-sap-ri:<value>' \
-H 'x-shopee-language:<value>' \
-H 'x-sz-sdk-version:<value>' \
--compressed
与我之前尝试时相比,请求中新增了一些内容:名称随机的请求头,如 744a5de7、afe5f082、dd46895e。
经测试,他们的api的过期时间是60~120秒。
所以我需要在 1 分钟内获得完整的 api 请求。
-H $'24d10ef2: \
-H '5025235d: \
-H '744a5de7:\
-H 'af-ac-enc-dat:\
-H 'cookie:\
-H 'user-agent:\
-H 'x-csrftoken:\
-H 'x-sap-ri:\
名称随机的请求头、af-ac-enc-dat、cookie、user-agent、x-csrftoken、x-sap-ri 是必需的。
令人惊讶的是,这一直有效到八月初。
当我尝试时,“af-ac-enc-dat”是客户端使用 cookie 和时间创建的值。 它不应该改变。 af-ac-enc-dat 是由 cookie 制作的,cookie 也无法更改。
所以你需要弄清楚
在我看来,当你获得cookie时,你是通过登录来获得它的。
根据我的经验,即使您通过代理进行抓取,在一段时间后抓取也会失败。
当你访问它时,它会说检测到异常流量并滑动以证明你不是机器人。
所以我尝试使用非登录 cookie 进行抓取。
可以通过多个 cookie 集和多个代理进行抓取。
我很努力地尝试抓取,但始终无法突破他们不断变化的防御。
所以我在七月初就放弃了。
希望你能解决这个问题。