我正在尝试使用 API(一个无限滚动站点)从 Behance 提取数据。当您滚动时,新产品和请求将开始出现。现在我知道哪些请求包含产品数据,但问题是这些请求是相同的。我无法编写 for 循环来自动创建新请求并提取数据。下面是请求(cURL 转换为 Python 请求)。
import requests
headers = {
'accept': '*/*',
'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
'content-type': 'application/json',
'^cookie': 'gk_suid=27885050; gki=disable_graphql_metrics_to_timestream: false, test_cross_auth: false, feature_search_gk_debug: false, list_keyboard_nav: false, feature_image_search_with_project_title: false, updated_user_search_cards: false, feature_profile_work_sticky_hire_cta: false,; bcp=1f130e91-c6c8-47e5-99cf-673dad470fd3; dialog_dismissals=following_tags_ftux_tooltip; OptanonAlertBoxClosed=2024-05-01T15:07:33.232Z; OptanonConsent=groups=C0001^%^3A1^%^2CC0002^%^3A1^%^2CC0003^%^3A1^%^2CC0004^%^3A1; kndctr_9E1005A551ED61CA0A490D45_AdobeOrg_identity=CiYxMjA1MjMzODY3NTc5NDA5OTM5MTM5NDY3MzM4NDQ5NzQxMjYyOFITCM-vyaXzMRABGAEqBElSTDEwAPABz6_JpfMx; kndctr_9E1005A551ED61CA0A490D45_AdobeOrg_consent=general=in; AMCV_9E1005A551ED61CA0A490D45^%^40AdobeOrg=MCMID^|12052338675794099391394673384497412628; _cs_mk_aa=0.31804581114055797_1714585870715; gpv=behance.net:search:projects; kndctr_9E1005A551ED61CA0A490D45_AdobeOrg_cluster=irl1^',
'origin': 'https://www.behance.net',
'priority': 'u=1, i',
'referer': 'https://www.behance.net/',
'^sec-ch-ua': '^\\^Chromium^\\^;v=^\\^124^\\^, ^\\^Google',
'sec-ch-ua-mobile': '?0',
'^sec-ch-ua-platform': '^\\^Windows^\\^^',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
'x-bcp': '1f130e91-c6c8-47e5-99cf-673dad470fd3',
'x-newrelic-id': 'VgUFVldbGwsFU1BRDwUBVw==',
'x-requested-with': 'XMLHttpRequest',
}
data = '^^{^\\^query^\\^:^\\^^\\^\\n'
response = requests.post('https://www.behance.net/v3/graphql', headers=headers, data=data)
名为“graphql”的请求正是我需要的:每次滚动到底部时,都会出现若干同名请求,但其中一些并不包含产品数据(参见图片,高亮显示的那个才是我需要的)。有什么关于如何区分这两种请求的提示吗?
您可以使用此示例如何发出 POST 请求来获取更多页面:
import base64
import requests
def encode(s):
    """Base64-encode the string *s* and return the result as a str.

    Used to build Behance's pagination cursor, e.g. encode("48") -> "NDg=".
    """
    return base64.b64encode(s.encode()).decode()
# GraphQL request body for Behance project search.
# - "query" is the exact query text the Behance frontend sends (kept verbatim).
# - "variables.after" is the pagination cursor: a Base64-encoded offset
#   ("NDg=" is base64 of "48"); the loop below overwrites it per page.
# - "variables.first" is the page size (48 projects per request).
payload = {
    "query": "\n query GetProjectSearchResults($query: query, $filter: SearchResultFilter, $first: Int!, $after: String) {\n search(query: $query, type: PROJECT, filter: $filter, first: $first, after: $after) {\n pageInfo {\n hasNextPage\n endCursor\n }\n nodes {\n ... on Project {\n __typename\n ...projectSearchFields\n }\n }\n metaContent {\n totalEntityCount\n toolCard {\n cta {\n text\n url\n }\n description\n links {\n text\n url\n type\n }\n slug\n title\n }\n schoolCard {\n cta {\n text\n url\n }\n description\n slug\n }\n csam {\n isCSAMViolation\n description\n helpResource\n reportingOption\n }\n followableTag {\n isFollowing\n tag {\n id\n title\n }\n }\n }\n }\n }\n\n \n fragment projectSearchFields on Project {\n id\n colors {\n r\n g\n b\n }\n isMatureReviewSubmitted\n linkedAssetsCount\n name\n premium\n isPrivate\n publishedOn\n isFounder\n isFeatured\n modifiedOn\n canBeAddedToMoodboard\n adminFlags {\n mature_lock\n privacy_lock\n dmca_lock\n flagged_lock\n privacy_violation_lock\n trademark_lock\n spam_lock\n eu_ip_lock\n }\n features {\n featuredOn\n url\n name\n ribbon {\n image\n image2x\n image3x\n }\n }\n slug\n stats {\n views {\n all\n }\n appreciations {\n all\n }\n comments {\n all\n }\n }\n url\n fields {\n label\n }\n linkedAssets {\n ...sourceLinkFields\n }\n sourceFiles {\n ...sourceFileWithCoverFields\n }\n matureAccess\n hasMatureContent\n owners {\n ...OwnerFields\n images {\n size_50 {\n url\n }\n size_100 {\n url\n }\n size_115 {\n url\n }\n size_138 {\n url\n }\n size_230 {\n url\n }\n size_276 {\n url\n }\n }\n }\n covers {\n size_original {\n url\n }\n size_max_808 {\n url\n }\n size_808 {\n url\n }\n size_404 {\n url\n }\n size_202 {\n url\n }\n size_230 {\n url\n }\n size_115 {\n url\n }\n size_original_webp {\n url\n }\n size_max_808_webp {\n url\n }\n size_808_webp {\n url\n }\n size_404_webp {\n url\n }\n size_202_webp {\n url\n }\n size_230_webp {\n url\n }\n size_115_webp {\n url\n }\n }\n }\n\n \n fragment sourceFileWithCoverFields on SourceFile {\n __typename\n sourceFileId\n projectId\n userId\n title\n assetId\n renditionUrl\n mimeType\n size\n category\n licenseType\n unitAmount\n currency\n tier\n hidden\n extension\n hasUserPurchased\n description\n cover {\n coverUrl\n coverX\n coverY\n coverScale\n }\n }\n\n \n fragment sourceLinkFields on LinkedAsset {\n __typename\n name\n premium\n url\n category\n licenseType\n }\n\n \n fragment OwnerFields on User {\n displayName\n hasPremiumAccess\n id\n isFollowing\n isProfileOwner\n location\n locationUrl\n url\n username\n isMessageButtonVisible\n availabilityInfo {\n availabilityTimeline\n isAvailableFullTime\n isAvailableFreelance\n hiringTimeline {\n key\n label\n }\n }\n creatorPro {\n isActive\n initialSubscriptionDate\n }\n }\n\n\n ",
    "variables": {"after": "NDg=", "filter": {}, "first": 48},
}
# GraphQL endpoint the Behance page calls on each infinite-scroll load.
url = "https://www.behance.net/v3/graphql"

# NOTE(review): the X-BCP header and the "bcp" cookie carry the same session
# token here — presumably the server requires them to match; the value was
# captured from a live browser session. TODO confirm how long it stays valid.
headers = {
    "X-Requested-With": "XMLHttpRequest",
    "X-BCP": "96ee8700-3ce5-4445-96b2-ab0e1a76a63a",
}
cookies = {
    "bcp": "96ee8700-3ce5-4445-96b2-ab0e1a76a63a",
}
# Fetch successive result pages by advancing the Base64-encoded offset cursor.
for page_index in range(4):  # <-- increase number of pages here
    # Each page holds 48 projects, so the cursor is the running item offset.
    payload["variables"]["after"] = encode(str(page_index * 48))
    response = requests.post(url, headers=headers, cookies=cookies, json=payload)
    page_data = response.json()
    # for example print names:
    for project in page_data["data"]["search"]["nodes"]:
        print(project["name"])
    print("-" * 80)
打印:
...
Modern Expo - animated film
Wavelink-1
Elysium
--------------------------------------------------------------------------------
FORMA FURNITURE
Le Mochi Glacé
2024 국민의 선택
pfp avatars collection
Illustrations for interactive books
NUMBERS ARE NOT AN OPINION
...