我正在尝试训练朴素贝叶斯模型进行情感分析,但我是Python新手,因为我一直在R中工作。
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder # Add this line
# Load the preprocessed TF-IDF matrix
tfidf_df = pd.read_excel('/Users/anisabakiu/Downloads/tfidf_r.xlsx')
# Load the original DataFrame with labels
df = pd.read_excel('/Users/anisabakiu/Downloads/all-review_label.xlsx')
# Drop NaN values if any
merged_df = pd.merge(tfidf_df, df[['review_id', 'label']], on='review_id').dropna()
# Encode labels if needed
label_encoder = LabelEncoder() # Instantiate LabelEncoder
merged_df['label'] = label_encoder.fit_transform(merged_df['label']) # Encode labels
# Define features (X) and target variable (y)
X = merged_df.drop(['label', 'review_id'], axis=1)
y = merged_df['label']
# Define a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()
# Create a Naive Bayes classifier
naive_bayes = MultinomialNB()
# Create a pipeline combining TF-IDF vectorizer and Naive Bayes classifier
pipeline = Pipeline([
('tfidf', tfidf_vectorizer),
('clf', naive_bayes),
])
# Perform cross-validation with 10 folds
cv_scores = cross_val_score(pipeline, X, y, cv=10)
# Fit the Naive Bayes model on the entire dataset
naive_bayes_model = pipeline.fit(X, y)
# Make predictions using the trained model
y_pred = naive_bayes_model.predict(X)
# Evaluate the model
accuracy = accuracy_score(y, y_pred)
print('Accuracy:', accuracy)
# Print confusion matrix
conf_matrix = confusion_matrix(y, y_pred)
print('Confusion Matrix:')
print(conf_matrix)
这段代码对我来说似乎是正确的,但我收到此错误:
raise KeyError(key) from err KeyError: 'label'
,我不明白。
你能帮我看看出了什么问题吗?
请检查“label”的拼写...它也区分大小写...还要检查它是否是列名之一...似乎没有名为“label”的列