网络抓取 Twitter 帖子

问题描述 投票:0回答:1
import requests
from bs4 import BeautifulSoup

# Permalink of the tweet whose text the asker wants to extract.
url = 'https://twitter.com/GeniResearch/status/1755137950403219646'

# Plain GET: no custom headers, cookies or authentication are sent.
response = requests.get(url)
# Printing the Response object shows its HTTP status code.
print(response)
# Parse whatever body came back as HTML and dump the parsed document.
soup = BeautifulSoup(response.content, 'html.parser')
print(soup)

为什么我在发送 GET 请求时会收到 `<Response [400]>`?我查阅了 Twitter API 文档:400 状态代码表示请求无效或无法以其他方式提供服务,随附的错误消息会进一步说明原因。未经身份验证的请求会被视为无效,并产生此响应。

当我打印 soup 时,我得到:不再支持此浏览器。请切换到支持的浏览器以继续使用 twitter.com。您可以在我们的帮助中心查看支持的浏览器列表。

我想从上述 URL 中抓取文本内容。

python web-scraping beautifulsoup twitter
1个回答
0
投票

推文数据是由外部 API 请求返回的,网站的页面源代码中并不包含这些数据。我们需要先从页面源代码中获取 guest_token,然后在发送 API 请求时把它一并传递过去。

import requests
from bs4 import BeautifulSoup
import re

# Browser-like request headers for fetching the tweet's HTML page.
headers = {
    'authority': 'twitter.com',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8,ur;q=0.7',
    'cache-control': 'no-cache',
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
}

# Fetch the tweet page; an inline <script> in it is expected to set the
# guest-token cookie ("gt=..."). A timeout keeps the script from hanging
# forever on an unresponsive server.
guest_token_response = requests.get(
    'https://twitter.com/GeniResearch/status/1755137950403219646',
    headers=headers,
    timeout=30,
)
guest_token_soup = BeautifulSoup(guest_token_response.content, 'html.parser')

# Raw string with an escaped dot so only a literal "document.cookie" matches.
pattern = re.compile(r'document\.cookie')
token_script = guest_token_soup.find('script', string=pattern)

# Pull the value of the "gt" cookie out of the script text. Both failure
# modes (no matching <script>, or a script without a "gt=" cookie) yield an
# empty token instead of an AttributeError or unrelated script text.
token_match = (
    re.search(r'document\.cookie="gt=([^;"]*)', token_script.text)
    if token_script is not None
    else None
)
guest_token = token_match.group(1) if token_match else ''

if not guest_token:
    # Say why nothing else will happen instead of exiting silently.
    print('Could not find a guest token (gt cookie) in the page source.')

# Query the GraphQL endpoint for the tweet, authenticating as a guest with
# the token obtained from the page source. Skipped when no token was found.
if guest_token:
    # Headers mimicking the web client's API call; 'x-guest-token' carries
    # the guest token alongside the bearer token.
    headers = {
        'authority': 'api.twitter.com',
        'accept': '*/*',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8,ur;q=0.7',
        'authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA',
        'cache-control': 'no-cache',
        'content-type': 'application/json',
        'origin': 'https://twitter.com',
        'pragma': 'no-cache',
        'referer': 'https://twitter.com/',
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
        'x-guest-token': guest_token,
        'x-twitter-active-user': 'yes',
        'x-twitter-client-language': 'en-GB',
    }

    # Query-string parameters; each value is a JSON document sent as text.
    # 'variables' selects the tweet by its ID.
    params = {
        'variables': '{"tweetId":"1755137950403219646","withCommunity":false,"includePromotedContent":false,"withVoice":false}',
        'features': '{"creator_subscriptions_tweet_preview_api_enabled":true,"c9s_tweet_anatomy_moderator_badge_enabled":true,"tweetypie_unmention_optimization_enabled":true,"responsive_web_edit_tweet_api_enabled":true,"graphql_is_translatable_rweb_tweet_is_translatable_enabled":true,"view_counts_everywhere_api_enabled":true,"longform_notetweets_consumption_enabled":true,"responsive_web_twitter_article_tweet_consumption_enabled":true,"tweet_awards_web_tipping_enabled":false,"freedom_of_speech_not_reach_fetch_enabled":true,"standardized_nudges_misinfo":true,"tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled":true,"rweb_video_timestamps_enabled":true,"longform_notetweets_rich_text_read_enabled":true,"longform_notetweets_inline_media_enabled":true,"responsive_web_graphql_exclude_directive_enabled":true,"verified_phone_label_enabled":true,"responsive_web_graphql_skip_user_profile_image_extensions_enabled":false,"responsive_web_graphql_timeline_navigation_enabled":true,"responsive_web_enhance_cards_enabled":false}',
        'fieldToggles': '{"withArticleRichContentState":true}',
    }

    # A timeout keeps the script from hanging on an unresponsive server.
    response = requests.get(
        'https://api.twitter.com/graphql/pq4JqttrkAz73WE6s2yUqg/TweetResultByRestId',
        params=params,
        headers=headers,
        timeout=30,
    )
    print(response)

    # The endpoint is expected to answer with JSON rather than HTML, so
    # decode it as JSON instead of running it through an HTML parser.
    try:
        tweet_data = response.json()
    except ValueError:
        # Body was not valid JSON (e.g. an error page): show it verbatim.
        print(response.text)
    else:
        print(tweet_data)
© www.soinside.com 2019 - 2024. All rights reserved.