我是机器学习的新手,并且想到了对Twitter数据进行情感分析的想法。我正在使用Tweepy获取100条推文,这将是我的测试集,并且有两列(文本,标签)。而我的训练集是一个csv文件,包含3列(文本,标签,主题)。到目前为止,我已经为训练集和测试集创建了一个数据框,并希望使用朴素贝叶斯分类器,但是有点卡在需要应用Count Vectorizer和Tfidf的地方。谢谢。
import pandas as pd
import twitter
import re
from nltk.tokenize import word_tokenize
from string import punctuation
from nltk.corpus import stopwords
import nltk
import string
import traceback
import pandas as pd
# initialize api instance
# NOTE(review): the credentials below are placeholders ('123456789') and must
# be replaced with real Twitter API keys before this script can run.
twitter_api = twitter.Api(consumer_key='123456789',
consumer_secret='123456789',
access_token_key='123456789',
access_token_secret='123456789')
# test authentication
# Prints the authenticated account's profile; raises if the keys are invalid,
# aborting the script before any tweets are fetched.
print(twitter_api.VerifyCredentials())
def buildTestSet(search_keyword):
    """Fetch up to 100 recent tweets matching *search_keyword*.

    Returns a DataFrame with columns ``text`` (the tweet body) and
    ``label`` (``None`` — the test set is unlabeled).  On any API failure
    the traceback is printed and an *empty* DataFrame with the same
    columns is returned, so callers never receive ``None`` (the original
    fell off the ``except`` branch and returned ``None`` implicitly).
    """
    try:
        tweets_fetched = twitter_api.GetSearch(search_keyword, count=100)
        print("Fetched " + str(len(tweets_fetched)) + " tweets for the term " + search_keyword)
        data = [{"text": status.text, "label": None} for status in tweets_fetched]
        df = pd.DataFrame(data)
        print(df)
        return df
    except Exception:
        # Best-effort boundary: report the failure but keep the pipeline alive.
        print("Unfortunately, something went wrong..")
        traceback.print_exc()
        return pd.DataFrame(columns=["text", "label"])
def buildTrainSet(fileName):
    """Load the labeled training set from a CSV file.

    The file is expected to hold the columns (text, label, topic).  The
    ``unicode_escape`` decoding is kept for compatibility with the
    original data file.  On failure the traceback is printed and an
    empty DataFrame with those columns is returned instead of ``None``
    (the original returned ``None`` implicitly, which later crashed the
    vectorizer with a NoneType error).
    """
    try:
        df2 = pd.read_csv(fileName, encoding='unicode_escape')
        print(df2)
        return df2
    except Exception:
        traceback.print_exc()
        return pd.DataFrame(columns=["text", "label", "topic"])
def cleaningSet(words):
    """Normalize the ``text`` column of a tweet DataFrame.

    For each row: lowercase, replace URLs with ``URL``, replace
    ``@username`` with ``AT_USER``, strip the ``#`` from hashtags, then
    remove remaining punctuation.  Returns a list of cleaned strings
    (one per row) suitable for ``CountVectorizer``.

    Bug fixes vs. the original:
    * the original had no ``return`` statement, so callers received
      ``None`` (the NoneType error described in the post);
    * it iterated ``iterrows()`` tuples and stringified them instead of
      cleaning the actual text;
    * it stripped punctuation *before* the URL/@/# regexes, so those
      patterns could never match — the order is corrected here.

    Returns [] when *words* is None (e.g. an upstream fetch failed).
    """
    if words is None:
        return []
    cleaned = []
    for raw in words["text"].astype(str):
        text = raw.lower()
        text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', text)   # URLs -> token
        text = re.sub(r'@[^\s]+', 'AT_USER', text)                        # usernames -> token
        text = re.sub(r'#([^\s]+)', r'\1', text)                          # drop '#' of hashtags
        # Punctuation last, so the patterns above can still see ':' '/' '@' '#'.
        text = text.translate(str.maketrans('', '', string.punctuation))
        cleaned.append(text)
    return cleaned
# --- Script entry point: fetch, clean, vectorize, train, predict ------------
search_term = input("Enter a search keyword: ")
testDataSet = buildTestSet(search_term)
trainDataSet = buildTrainSet("TweetDataFile.csv")

# cleaningSet returns a list of cleaned tweet strings, one per row.
testDataSet_clean = cleaningSet(testDataSet)
trainDataSet_clean = cleaningSet(trainDataSet)

categories = ['positive', 'negative', 'neutral']
try:
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn.naive_bayes import MultinomialNB

    # Bag-of-words counts, vocabulary learned on the training text only.
    count_vector = CountVectorizer()
    x_train_tf = count_vector.fit_transform(trainDataSet_clean)
    print(x_train_tf.shape)

    # Re-weight the raw counts with TF-IDF.
    tfidf_transformer = TfidfTransformer()
    x_train_tfidf = tfidf_transformer.fit_transform(x_train_tf)
    print(x_train_tfidf.shape)

    # BUG FIX: fit() must be called on an *instance* of MultinomialNB, and
    # y must be one label per training row (the 'label' column of the CSV),
    # not the 3-element `categories` list.
    clf = MultinomialNB().fit(x_train_tfidf, trainDataSet["label"])

    # Transform the test set with the vocabulary/IDF learned on training
    # (transform, not fit_transform, so the feature space matches).
    x_test_tf = count_vector.transform(testDataSet_clean)
    x_test_tfidf = tfidf_transformer.transform(x_test_tf)
    predicted = clf.predict(x_test_tfidf)
    print(predicted)
except Exception:
    traceback.print_exc()
编辑1:我把变量 _testDataSet_clean_ 和 _trainDataSet_clean_ 注释掉之后,程序可以运行,直到分类器再次给出另一个错误为止(参见图2)。形状被打印为 (3, 3)。

从错误消息可以看到,它说 `trainDataSet_clean` 是 `NoneType`。原因在于 `cleaningSet` 函数没有 `return` 语句:经过上述清洗处理后,它隐式返回 `None`,所以 `trainDataSet_clean` 就变成了 `NoneType`。请在 `cleaningSet` 中返回清洗后的文本,或者贴出完整的代码以便进一步排查。