import requests
from bs4 import BeautifulSoup
# Fetch the tweet page with a plain GET and dump both the response object
# and the parsed HTML, to see what the server returns to a non-browser client.
url = 'https://twitter.com/GeniResearch/status/1755137950403219646'
# requests waits indefinitely unless a timeout is given, so a stalled
# connection would otherwise hang the script.
response = requests.get(url, timeout=30)
print(response)
soup = BeautifulSoup(response.content, 'html.parser')
print(soup)
为什么我在发送 GET 请求时会收到下面的结果?
当我打印 soup 时,我得到:不再支持此浏览器。请切换到支持的浏览器以继续使用 twitter.com。您可以在我们的帮助中心查看支持的浏览器列表。
我想从上述 URL 中抓取文本内容。
外部 API 请求可以返回推文数据,但网页的页面源代码中并不包含这些数据。我们需要先从页面源代码中获取 guest_token,然后在 API 请求中传递它。
import requests
from bs4 import BeautifulSoup
import re
# Two-step fetch of a tweet:
#   1. Load the tweet page with browser-like headers and pull the guest token
#      out of the inline script that sets the "gt" cookie.
#   2. Call the TweetResultByRestId GraphQL endpoint, sending that token in
#      the "x-guest-token" header.
url = 'https://twitter.com/GeniResearch/status/1755137950403219646'

# Headers for the initial page load.
headers = {
    'authority': 'twitter.com',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8,ur;q=0.7',
    'cache-control': 'no-cache',
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
}

# requests waits indefinitely unless a timeout is given.
guest_token_response = requests.get(url, headers=headers, timeout=30)
guest_token_soup = BeautifulSoup(guest_token_response.content, 'html.parser')

# Locate the inline <script> that mentions document.cookie (dot escaped so it
# only matches a literal ".").
pattern = re.compile(r'document\.cookie')
token_script = guest_token_soup.find('script', string=pattern)

# Take whatever follows the last ';document.cookie="gt=' up to the next ';'.
# If no script matched, or the marker is absent, leave the token empty rather
# than crashing or sending an arbitrary chunk of script text as a token.
guest_token = ''
if token_script is not None:
    _, marker, remainder = token_script.text.rpartition(';document.cookie="gt=')
    if marker:
        guest_token = remainder.split(';')[0]

if guest_token:
    # Headers for the API call; the guest token goes in "x-guest-token".
    # NOTE(review): the bearer value is hard-coded here — presumably the one
    # used by the public web client; confirm it is still valid.
    headers = {
        'authority': 'api.twitter.com',
        'accept': '*/*',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8,ur;q=0.7',
        'authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA',
        'cache-control': 'no-cache',
        'content-type': 'application/json',
        'origin': 'https://twitter.com',
        'pragma': 'no-cache',
        'referer': 'https://twitter.com/',
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
        'x-guest-token': guest_token,
        'x-twitter-active-user': 'yes',
        'x-twitter-client-language': 'en-GB',
    }

    # Query-string parameters; each value is a pre-serialized JSON string.
    # The tweet id inside "variables" matches the id at the end of `url`.
    params = {
        'variables': '{"tweetId":"1755137950403219646","withCommunity":false,"includePromotedContent":false,"withVoice":false}',
        'features': '{"creator_subscriptions_tweet_preview_api_enabled":true,"c9s_tweet_anatomy_moderator_badge_enabled":true,"tweetypie_unmention_optimization_enabled":true,"responsive_web_edit_tweet_api_enabled":true,"graphql_is_translatable_rweb_tweet_is_translatable_enabled":true,"view_counts_everywhere_api_enabled":true,"longform_notetweets_consumption_enabled":true,"responsive_web_twitter_article_tweet_consumption_enabled":true,"tweet_awards_web_tipping_enabled":false,"freedom_of_speech_not_reach_fetch_enabled":true,"standardized_nudges_misinfo":true,"tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled":true,"rweb_video_timestamps_enabled":true,"longform_notetweets_rich_text_read_enabled":true,"longform_notetweets_inline_media_enabled":true,"responsive_web_graphql_exclude_directive_enabled":true,"verified_phone_label_enabled":true,"responsive_web_graphql_skip_user_profile_image_extensions_enabled":false,"responsive_web_graphql_timeline_navigation_enabled":true,"responsive_web_enhance_cards_enabled":false}',
        'fieldToggles': '{"withArticleRichContentState":true}',
    }

    response = requests.get(
        'https://api.twitter.com/graphql/pq4JqttrkAz73WE6s2yUqg/TweetResultByRestId',
        params=params,
        headers=headers,
        timeout=30,
    )
    print(response)
    # NOTE(review): the body is run through the HTML parser only to print it;
    # if the endpoint returns JSON (presumably it does — confirm),
    # response.json() would give structured access to the tweet fields.
    soup = BeautifulSoup(response.content, 'html.parser')
    print(soup)
else:
    # Make the failure visible instead of exiting silently.
    print('guest token not found in page source; skipping API request')