Suppose I have a CSV with three columns: tidytext, Location, and vader_senti.
I have already been able to get the number of positive, neutral, and negative *texts, not words*, per country, using the following code:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyser = SentimentIntensityAnalyzer()
fields = ['tidytext', 'Location', 'vader_senti']  # the three columns named above
data_vis = pd.read_csv(r"csviamcrpreprocessed.csv", usecols=fields)

# Score each text with VADER and return the four scores as a Series
def print_sentiment_scores(text):
    vadersenti = analyser.polarity_scores(str(text))
    return pd.Series([vadersenti['pos'], vadersenti['neg'], vadersenti['neu'], vadersenti['compound']])

data_vis[['vadersenti_pos', 'vadersenti_neg', 'vadersenti_neu', 'vadersenti_compound']] = data_vis['tidytext'].apply(print_sentiment_scores)

# Label each text by its compound score: > 0.3 is positive, < 0.23 is
# negative, and anything in between is neutral
data_vis['vader_senti'] = 'neutral'
data_vis.loc[data_vis['vadersenti_compound'] > 0.3, 'vader_senti'] = 'positive'
data_vis.loc[data_vis['vadersenti_compound'] < 0.23, 'vader_senti'] = 'negative'

# 0/1 indicator columns, one per sentiment class
data_vis['vader_possentiment'] = 0
data_vis.loc[data_vis['vadersenti_compound'] > 0.3, 'vader_possentiment'] = 1
data_vis['vader_negsentiment'] = 0
data_vis.loc[data_vis['vadersenti_compound'] < 0.23, 'vader_negsentiment'] = 1
data_vis['vader_neusentiment'] = 0
data_vis.loc[(data_vis['vadersenti_compound'] <= 0.3) & (data_vis['vadersenti_compound'] >= 0.23), 'vader_neusentiment'] = 1

# Count texts per sentiment class within each country
sentimentbylocation = data_vis.groupby(["Location"])['vader_senti'].value_counts()
sentimentbylocation
sentimentbylocation gives me the following results:
Location     vader_senti
Afghanistan  negative       151
             positive        25
             neutral          2
Albania      negative         6
             positive         1
Algeria      negative       116
             positive        13
             neutral          4
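As a side note, a minimal sketch assuming the code above has run: the same Series can be pivoted into one row per country with pandas' unstack, which makes the classes easier to compare side by side.

sentimentbylocation.unstack(fill_value=0)
#              negative  neutral  positive
# Location
# Afghanistan       151        2        25
# Albania             6        0         1
# Algeria           116        4        13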
To get the most common positive words, I used this code:
import string
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

# Tokenize each line, lowercase the tokens, and drop stopwords and punctuation
def process_text(text):
    tokens = []
    for line in text:
        toks = tokenizer.tokenize(line)
        toks = [t.lower() for t in toks if t.lower() not in stopwords_list]
        tokens.extend(toks)
    return tokens

tokenizer = TweetTokenizer()
punct = list(string.punctuation)
stopwords_list = stopwords.words('english') + punct + ['rt', 'via', '...', '…', '’', '—', '—:', '‚', 'â']

# Frequency distribution over the tokens of all positive texts
pos_lines = list(data_vis[data_vis.vader_senti == 'positive'].tidytext)
pos_tokens = process_text(pos_lines)
pos_freq = nltk.FreqDist(pos_tokens)
pos_freq.most_common()
Running this code gives me the most common words and the number of times they appear, such as:
[('good', 1212),
 ('amazing', 123),
 ...]
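For reference, FreqDist.most_common also accepts a count, so the list can be capped at the top N words; a small sketch, with 20 as a purely hypothetical cut-off:

top_words = [w for w, _ in pos_freq.most_common(20)]  # keep only the 20 most frequent tokens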
However, what I want to see is how many of these positive words appear per country.
For example, here is a sample CSV: https://drive.google.com/file/d/112k-6VLB3UyljFFUbeo7KhulcrMedR-l/view?usp=sharing
Create a column for each of the most_common words, then do a groupby on Location and use agg to apply a sum to each count:
import re

words = [i[0] for i in pos_freq.most_common()]
# Lowercase all texts so the counts are case-insensitive
data_vis.tidytext = data_vis.tidytext.str.lower()
# str.count() treats its argument as a regex, so escape each word and add
# \b word boundaries so that e.g. 'good' does not also match 'goodbye'
for i in words:
    data_vis[i] = data_vis.tidytext.str.count(r'\b' + re.escape(i) + r'\b')
# Sum the per-text counts within each country
funs = {i: 'sum' for i in words}
grouped = data_vis.groupby('Location').agg(funs)
Based on the sample CSV, using ['good', 'amazing'] as the most_common example, the result would be:
grouped
#                good  amazing
# Location
# Australia         0        1
# Belgium           6        4
# Japan             2        1
# Thailand          2        0
# United States     1        0
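As a follow-up sketch under the same assumptions (it reuses data_vis, tokenizer, and words from above), the same table can be built without creating one column per word, by exploding the token lists and counting per country:

tokens = data_vis.tidytext.str.lower().apply(tokenizer.tokenize)
counts = (
    data_vis.assign(token=tokens)
            .explode('token')               # one row per token occurrence
            .query('token in @words')       # keep only the common positive words
            .groupby('Location')['token']
            .value_counts()
            .unstack(fill_value=0)          # countries as rows, words as columns
)

This avoids adding one column per word to the original DataFrame, which can matter when most_common() returns thousands of distinct tokens.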