# Importing libraries and packages.
import snscrape.modules.twitter as sntwitter
import time
import pandas as pd

# Rows of scraped tweet data; each entry is [date, id, content, username].
manutd_rows = []

# Cap on how many tweets to scrape.
MAX_TWEETS = 10000

# Use TwitterSearchScraper to pull tweets matching the query and collect
# the attributes to be returned.
for i, tweet in enumerate(
    sntwitter.TwitterSearchScraper('Man Utd since:2020-12-31 until:2021-01-02').get_items()
):
    # NOTE: the original condition `i > 10000` collected 10,001 tweets
    # (i runs 0..10000 before breaking); `>=` stops at exactly MAX_TWEETS.
    if i >= MAX_TWEETS:
        break
    manutd_rows.append([tweet.date, tweet.id, tweet.content, tweet.user.username])

# Creating a dataframe from the tweets list above.
ManUtd_df = pd.DataFrame(manutd_rows, columns=['Datetime', 'Tweet Id', 'Text', 'Username'])
我希望在这些日期范围内每天抓取 10,000 条推文,我如何对其进行编码,以便抓取器循环遍历该范围内指定的每个日期并检索最多 10000 条推文?
实现日期范围过滤很容易,只需对 `get_items()` 返回的结果(生成器)按 `date` 属性进行 `filter` 即可。这是一个工作示例:
import snscrape.modules.twitter as sntwitter
import itertools
import multiprocessing.dummy as mp # multithreading
import datetime
start_date = datetime.datetime(2023,2,15,tzinfo=datetime.timezone.utc)
def get_tweets(username, n_tweets=100):
    """Return recent tweets from *username* posted on or after ``start_date``.

    Takes at most ``n_tweets`` items from the scraper, keeps those whose
    ``date`` is on/after the module-level ``start_date``, and returns a list
    of ``(username, date, url, rawContent)`` tuples. The result is a plain
    list so it stays pickle'able for the worker pool.
    """
    scraper = sntwitter.TwitterSearchScraper(f'from:{username}')
    candidates = itertools.islice(scraper.get_items(), n_tweets)
    return [
        (username, t.date, t.url, t.rawContent)  # keep only attributes needed
        for t in candidates
        if t.date >= start_date
    ]
# Accounts to scrape.
user_names = ['kevin2kelly','briansolis','PeterDiamandis','Richard_Florida']

# Fan the queries out over a small thread pool — scraping is I/O bound,
# so threads overlap the network waits.
with mp.Pool(4) as pool:
    per_user_tweets = pool.map(get_tweets, user_names)

# Flatten the per-user lists into one combined result list.
results = list(itertools.chain.from_iterable(per_user_tweets))