
问题描述 投票:0回答:1


# Load the pre-trained GoogleNews vectors from a binary word2vec-format file.
wv = gensim.models.KeyedVectors.load_word2vec_format("E:\\GoogleNews-vectors-negative300.bin", binary=True)

#Pre-Processor Function
# The same token categories are passed as both `omit` and `normalize`.
# NOTE(review): in the source this call was never closed and the duplicated
# 'url' entry appeared in both lists. Only the two visible keyword arguments
# are kept here; any further options the original script passed (tokenizer,
# annotations, ...) are not visible and must be restored by the author.
pre_processor = TextPreProcessor(
    omit=['url', 'email', 'percent', 'money', 'phone', 'user',
          'time', 'date', 'number'],

    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
               'time', 'date', 'number'],
)






#Averaging Words Vectors to Create Sentence Embedding
def word_averaging(wv, words):
    """Average the vectors of ``words`` and scale the result to unit length.

    Args:
        wv: Keyed-vectors object. This function reads ``wv.vocab`` for
            membership tests, ``wv.vector_size`` for the fallback vector and
            looks individual words up with ``wv[word]``.
        words: Iterable of tokens. Items that already are ``np.ndarray``
            instances are used as vectors directly.

    Returns:
        The mean vector passed through ``gensim.matutils.unitvec`` and cast
        to ``float32``. When no item yields a vector, a warning is logged and
        an all-zero vector of length ``wv.vector_size`` is returned instead.
    """
    # NOTE(review): the bodies of the two branches below were missing in the
    # source (a syntax error). Appending the array itself / the looked-up
    # vector is the reconstruction assumed here; confirm whether the original
    # used raw or pre-normalized vectors.
    vectors = []

    for word in words:
        if isinstance(word, np.ndarray):
            vectors.append(word)
        elif word in wv.vocab:
            vectors.append(wv[word])

    if not vectors:
        logging.warning("cannot compute similarity with no input %s", words)
        # FIXME: remove these examples in pre-processing
        return np.zeros(wv.vector_size,)

    return gensim.matutils.unitvec(np.array(vectors).mean(axis=0)).astype(np.float32)

def word_averaging_list(wv, text_list):
    """Stack the ``word_averaging`` vector of every entry of ``text_list``.

    Args:
        wv: Keyed-vectors object forwarded unchanged to ``word_averaging``.
        text_list: Iterable whose entries are each passed to
            ``word_averaging`` as its ``words`` argument.

    Returns:
        The per-entry vectors combined row-wise with ``np.vstack``.
    """
    sentence_vectors = []
    for tokens in text_list:
        sentence_vectors.append(word_averaging(wv, tokens))
    return np.vstack(sentence_vectors)

#Secondary Word-Averaging Method
def get_mean_vector(word2vec_model, words):
    """Return the element-wise mean vector of the in-vocabulary ``words``.

    Args:
        word2vec_model: Object exposing ``vocab`` (used for membership tests)
            and supporting lookup with a list of words,
            ``word2vec_model[list_of_words]``.
        words: Iterable of tokens.

    Returns:
        ``np.mean(..., axis=0)`` over the looked-up vectors, or an empty list
        when none of ``words`` is in the vocabulary.
    """
    # remove out-of-vocabulary words
    known_words = [word for word in words if word in word2vec_model.vocab]
    if not known_words:
        # Callers detect this case with ``len(vec) > 0``.
        return []
    return np.mean(word2vec_model[known_words], axis=0)

#Loading data
# Each workbook is parsed once and then sliced, instead of being read from
# disk separately for the tweets and for the labels.
train_frame = pandas.read_excel('E:\\train.xlsx')
raw_train_tweets = train_frame.iloc[:, 1]  # all train tweets
train_labels = train_frame.iloc[:, 2:13]  # corresponding train labels (11 emotions)

test_frame = pandas.read_excel('E:\\test.xlsx')
raw_test_tweets = test_frame.iloc[:300, 1]  # first 300 test tweets
test_gold_labels = test_frame.iloc[:300, 2:13]  # corresponding test labels (11 emotions)
print("please wait")

# Pre-process every raw tweet, then turn each processed tweet into one
# averaged sentence vector.
# NOTE(review): in the source both `for` loops had no body and
# `train_tweets` / `test_tweets` were never assigned before being used below.
# Filling them through `pre_processor.pre_process_doc` is an assumption —
# confirm against the original script.
train_tweets = [pre_processor.pre_process_doc(tweets) for tweets in raw_train_tweets]
test_tweets = [pre_processor.pre_process_doc(tweets) for tweets in raw_test_tweets]

train_array = word_averaging_list(wv, train_tweets)
test_array = word_averaging_list(wv, test_tweets)

#Secondary Vectorizing
# NOTE(review): the bodies of the `if len(vec) > 0:` checks were missing in
# the source; collecting the non-empty vectors into the two lists below is an
# assumption — confirm what the original script did with `vec`.
# Tweets for which get_mean_vector returns an empty result are skipped, so
# these lists can be shorter than the corresponding label frames.
train_mean_vectors = []
for tweet in train_tweets:
    vec = get_mean_vector(wv, tweet)
    if len(vec) > 0:
        train_mean_vectors.append(vec)

test_mean_vectors = []
for tweet in test_tweets:
    vec = get_mean_vector(wv, tweet)
    if len(vec) > 0:
        test_mean_vectors.append(vec)

# Multi-label classification: logistic regression wrapped in LabelPowerset.
clf = LabelPowerset(LogisticRegression(C=1, class_weight=None))
# The source called predict() on a classifier that was never fitted; train it
# on the averaged train vectors and their label matrix first.
# NOTE(review): `.values` hands the labels over as a plain array — confirm
# this matches the input format the original training call used.
clf.fit(train_array, train_labels.values)
predicted = clf.predict(test_array)
print("Logistic Regression Accuracy =     ", accuracy_score(test_gold_labels, predicted))
print("F1 score = ", f1_score(test_gold_labels, predicted, average="micro"))
print("Hamming loss = ", hamming_loss(test_gold_labels, predicted))

下面逐步给出我的结果。首先,我用下面的代码查看 Google 预训练模型词表的一部分。该模型是我从 this link 下载的 Google 预训练模型。

from itertools import islice
# Print the 20 entries found at iteration positions 13030-13049 of wv.vocab
# (islice's stop index is exclusive) as a quick look at the loaded vocabulary.
print(list(islice(wv.vocab, 13030, 13050)))


['Memorial_Hospital', 'Seniors', 'memorandum', 'elephant', 'Trump', 'Census', 'pilgrims', 'De', 'Dogs', '###-####_ext', 'chaotic', 'forgive', 'scholar', 'Lottery', 'decreasing', 'Supervisor', 'fundamentally', 'Fitness', 'abundance', 'Hold']


0       “Worry is a down payment on a problem you may ...
1       Whatever you decide to do make sure it makes y...
2       @Max_Kellerman  it also helps that the majorit...
3       Accept the challenges so that you can literall...
4       My roommate: it's okay that we can't spell bec...
6833    @nicky57672 Hi! We are working towards your hi...
6834    @andreamitchell said @berniesanders not only d...
6835    @isthataspider @dhodgs i will fight this guy! ...
6836    i wonder how a guy can broke his penis while h...
6837     I'm highly animated even though I'm decomposing.
Name: Tweet, Length: 6838, dtype: object


anger  anticipation  disgust  fear  joy  love  optimism  pessimism  \
0         0             1        0     0    0     0         1          0   
1         0             0        0     0    1     1         1          0   
2         1             0        1     0    1     0         1          0   
3         0             0        0     0    1     0         1          0   
4         1             0        1     0    0     0         0          0   
...     ...           ...      ...   ...  ...   ...       ...        ...   
6833      0             0        0     0    0     0         0          0   
6834      0             1        0     0    0     0         0          0   
6835      1             0        1     0    0     0         0          1   
6836      0             0        0     0    0     0         0          0   
6837      0             0        0     0    0     0         0          1   

      sadness  surprise  trust  
0           0         0      1  
1           0         0      0  
2           0         0      0  
3           0         0      0  
4           0         0      0  
...       ...       ...    ...  
6833        0         0      0  
6834        0         1      0  
6835        0         0      0  
6836        0         1      0  
6837        0         0      0  

[6838 rows x 11 columns]


0      @Adnan__786__ @AsYouNotWish Dont worry Indian ...
1      Academy of Sciences, eschews the normally sobe...
2                      I blew that opportunity -__- #mad
3                 This time in 2 weeks I will be 30... 😥
4      #Deppression is real. Partners w/ #depressed p...
295                  you begin to irritate me, primitive
296    @Malala Happy day to you Malala 👏🏻😃I seem to s...
297    Hey The Success Novel thanks for the follow! (...
298    Easy day at school today; last rehearsal befor...
299    @bookish_yogi It's been a little Twitter saga ...
Name: Tweet, Length: 300, dtype: object


anger  anticipation  disgust  fear  joy  love  optimism  pessimism  \
0        1             1        0     0    0     0         1          0   
1        0             0        1     0    0     0         0          0   
2        1             0        1     0    0     0         0          0   
3        0             0        0     0    1     0         0          0   
4        0             0        0     1    0     0         0          0   
..     ...           ...      ...   ...  ...   ...       ...        ...   
295      1             0        1     0    0     0         0          0   
296      0             0        0     0    1     1         1          0   
297      0             0        0     0    1     0         1          0   
298      0             0        0     0    1     0         1          0   
299      1             0        1     1    0     0         0          0   

     sadness  surprise  trust  
0          0         0      1  
1          0         0      0  
2          1         0      0  
3          1         0      0  
4          1         0      0  
..       ...       ...    ...  
295        0         0      0  
296        0         0      0  
297        0         0      1  
298        0         0      0  
299        0         0      0  

[300 rows x 11 columns]


[['“', 'worry', 'is', 'a', 'down', 'payment', 'on', 'a', 'problem', 'you', 'may', 'never', 'have', "'", '.', 'joyce', 'meyer', '.', 'motivation', 'leadership', 'worry'], ['whatever', 'you', 'decide', 'to', 'do', 'make', 'sure', 'it', 'makes', 'you', 'happy', '.'], ['it', 'also', 'helps', 'that', 'the', 'majority', 'of', 'nfl', 'coaching', 'is', 'inept', '.', 'some', 'of', 'bill', 'o', "'", 'brien', "'", 's', 'play', 'calling', 'was', 'wow', ',', '!', 'gopats'],...........['i', 'will', 'fight', 'this', 'guy', '!', 'do', 'not', 'insult', 'the', 'lions', 'like', 'that', '!', 'but', 'seriously', 'they', 'kinda', 'are', '.', 'wasted', 'some', 'of', 'the', 'best', 'players'], ['i', 'wonder', 'how', 'a', 'guy', 'can', 'broke', 'his', 'penis', 'while', 'having', 'sex', '?', 'serious'], ['i', 'am', 'highly', 'animated', 'even', 'though', 'i', 'am', 'decomposing', '.']]


0       [“, worry, is, a, down, payment, on, a, proble...
1       [whatever, you, decide, to, do, make, sure, it...
2       [it, also, helps, that, the, majority, of, nfl...
3       [accept, the, challenges, so, that, you, can, ...
4       [my, roommate, :, it, ', s, okay, that, we, ca...
6833    [hi, !, we, are, working, towards, your, highl...
6834    [said, not, only, did, not, play, up, hrc, in,...
6835    [i, will, fight, this, guy, !, do, not, insult...
6836    [i, wonder, how, a, guy, can, broke, his, peni...
6837    [i, am, highly, animated, even, though, i, am,...
Length: 6838, dtype: object


[[ 4.93713953e-02 -3.80963739e-03  2.69042160e-02  1.20123737e-01
  -6.49027973e-02  3.87404412e-02  1.04131535e-01 -6.84523731e-02
   6.69738054e-02  7.30312392e-02 -4.77092117e-02 -9.83719155e-02
   2.29239091e-03  2.25124136e-02 -7.63789043e-02  6.14396073e-02
   9.35590491e-02  1.06499337e-01 -2.69374326e-02 -1.41343296e-01


Logistic Regression Accuracy =  0.25
F1 score =  0.5894897182025896
Hamming loss =  0.16333333333333333
python nlp classification word2vec emotion

尚不清楚您的 TextPreProcessor 和 SocialTokenizer 类具体做了什么。您应该编辑问题,贴出它们的代码,或者展示若干处理后的文本示例,以确认其行为符合预期。(例如:显示 all_tweets 的前几项和后几项。)

`all_tweets = train_tweets.append(test_tweets)` 这一行很可能达不到您的预期。(它会把整个 test_tweets 列表作为最后一个元素追加到 train_tweets 中,而 append() 的返回值是 None,这个 None 又被赋值给了 all_tweets。这样一来,Word2Vec 模型可能是空的——您应启用 INFO 级别的日志来观察训练进度并检查输出是否有异常,并在训练之后添加代码打印模型的一些详细信息,以确认确实进行了有效的训练。)

您确定 train_tweets 的格式适合传给管道的 .fit() 方法吗?(送入 Word2Vec 训练的文本似乎已经通过 .split() 做了分词,但 pandas.Series 类型的 train_tweets 中的文本可能从未被分词。)


© www.soinside.com 2019 - 2024. All rights reserved.