我正在开发一个用于区分垃圾邮件(spam)与正常邮件(ham)的 NLP 分类应用程序,并且已经训练出了一个准确率很高的模型。
我不确定该如何保存这个模型,以便之后可以用它对任意文本进行分类。
我是 NLP/ML 的新手,非常感谢您的帮助。附注:您可能需要把步骤给我讲得详细一些。
我知道需要使用 pickle 模块,但除此之外我不知道该怎么做。
提前为冗长的代码道歉,但我不想遗漏任何内容,完整的代码也有助于大家更好地理解我的问题。
`
import os
import pandas as pd
import re
import nltk
from string import punctuation
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.utils import resample
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from numpy.random import RandomState
from sklearn import preprocessing
# Fetch the NLTK resources the rest of the script relies on.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Root directory that is walked recursively for email files.
path = r"C:\Users\Username\Desktop\Project"
HamData = []
SpamData = []


def _read_emails(folder, filenames):
    """Return the text of every file named in *filenames* inside *folder*.

    Each file is decoded as ISO-8859-1 and its lines are joined with a
    single space, so one list entry holds one whole email.
    """
    emails = []
    for filename in filenames:
        # os.path.join picks the right separator for the current platform
        # instead of hard-coding a backslash.
        with open(os.path.join(folder, filename), encoding="iso8859-1") as handle:
            emails.append(" ".join(handle.readlines()))
    return emails


for root, folders, files in os.walk(path):
    # The test runs on the textual form of the directory's file list, so a
    # single matching file name routes every file of that directory.
    listing = str(files)
    if 'ham' in listing:
        HamData.extend(_read_emails(root, files))
    elif 'spam' in listing:
        SpamData.extend(_read_emails(root, files))
# AllEmails = HamData + SpamData
# labels = ["ham"]*len(HamData) + ["spam"]*len(SpamData)
# raw_df = pd.DataFrame(
# {"email": AllEmails,
# "label": labels}
# )
def create_dataframe(HamData, SpamData):
    """Build one labelled DataFrame from the ham and spam email texts.

    Each input becomes a single-column frame (``email``) tagged with a
    ``target`` column: 0 for ham, 1 for spam.  Ham rows come first and the
    rows are not shuffled.  Every part keeps its own 0-based index, so index
    labels repeat across the two classes.
    """
    labelled_parts = [
        pd.DataFrame(texts, columns=['email']).assign(target=label)
        for label, texts in enumerate((HamData, SpamData))
    ]
    return pd.concat(labelled_parts)
# Email addresses.  The separator class is written ``[._-]`` so the hyphen is
# literal: ``[.-_]`` is a range from "." to "_" that also contains digits and
# upper-case letters, which made the nested quantifiers backtrack
# exponentially on long upper-case/digit runs.
_EMAIL_RE = re.compile(
    r'([A-Za-z0-9]+[._-])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Za-z]{2,})'
)
# http:// and https:// URLs.
_URL_RE = re.compile(r'(http|https)://[a-zA-Z0-9\\./]+')
# Runs of digits.
_DIGITS_RE = re.compile(r'\d+')
# HTML tags; ``+?`` keeps each match as short as possible.
_HTML_TAG_RE = re.compile(r'<[^<]+?>')
# Anything that is neither an ASCII letter nor whitespace.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')


def clean_regex(string):
    """Normalise one email body into lower-case, letters-only text.

    The steps run in this order:

    1. email addresses and URLs are replaced by a single space;
    2. digits and HTML tags are deleted;
    3. every character that is not an ASCII letter or whitespace is
       deleted (this also removes all punctuation);
    4. leftover ``nbsp`` entity names are deleted;
    5. the result is lower-cased.

    Whitespace is never collapsed, so word boundaries survive.

    Args:
        string: the value to clean; it is converted with ``str()`` first,
            so non-string values are accepted.

    Returns:
        The cleaned text as a ``str``.
    """
    text = str(string)
    text = _EMAIL_RE.sub(' ', text)
    text = _URL_RE.sub(' ', text)
    text = _DIGITS_RE.sub('', text)
    text = _HTML_TAG_RE.sub('', text)
    # A real regex substitution: str.replace() treats its argument as literal
    # text and therefore never matched the character class.
    text = _NON_LETTER_RE.sub('', text)
    # "&nbsp;" has lost its "&" and ";" by now; drop the remaining name.
    text = text.replace('nbsp', '')
    return text.lower()
# Assemble the labelled corpus, then derive each cleaning stage as its own column.
raw_df = create_dataframe(HamData, SpamData)

# Stage 1: regex clean-up of the raw email text.
raw_df['RegEx'] = raw_df['email'].apply(clean_regex)
print(raw_df['RegEx'])

# Stage 2: keep only whitespace-separated words of 3 to 12 characters.
raw_df['Sec_Clean'] = raw_df['RegEx'].apply(
    lambda x: ' '.join(item for item in x.split() if 3 <= len(item) <= 12)
)
print(raw_df['Sec_Clean'])

# Stage 3: tokenize the filtered text, one row at a time.
raw_df['Tokenized'] = raw_df.apply(
    lambda row: nltk.word_tokenize(row['Sec_Clean']),
    axis=1,
)
print(raw_df['Tokenized'])

# Shared WordNet lemmatizer instance.
lem = nltk.WordNetLemmatizer()
def lemmatizer(t_text):
    """Return a list with the lemmatized form of every token in *t_text*.

    Each token is passed to ``lem.lemmatize`` (the module-level lemmatizer)
    with its default arguments; the order of the tokens is preserved.
    """
    return list(map(lem.lemmatize, t_text))
# Lemmatize every token list, then show the new column and the whole frame.
raw_df['Lemmatize'] = raw_df['Tokenized'].apply(lemmatizer)
print(raw_df['Lemmatize'])
print(raw_df)
def removeStopwords(token):
    """Return the words of *token* that are not English stop words.

    Args:
        token: iterable of word strings.

    Returns:
        A list with the surviving words in their original order.
    """
    # A set gives constant-time membership tests; testing against the plain
    # list scanned the whole stop-word list for every single word.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    return [word for word in token if word not in stop_words]
# Drop stop words from every lemmatized token list and display the outcome.
raw_df['StopWordRemoval'] = raw_df['Lemmatize'].apply(removeStopwords)
print(raw_df['StopWordRemoval'])
print(raw_df)
def downsamp_maj(df, target):
    """Balance a two-class frame by down-sampling the larger class.

    Rows whose *target* column equals 1 and rows where it equals 0 are
    compared by count.  The larger group is sampled without replacement
    (fixed seed 123) down to the size of the smaller one; when both groups
    have the same size, class 0 is the one that gets sampled.

    Returns:
        A new frame with the sampled majority rows first, followed by the
        minority rows, under a fresh 0..n-1 index.
    """
    ones = df[df[target] == 1]
    zeros = df[df[target] == 0]

    # Class 1 is the majority only when it is strictly larger.
    if len(ones) > len(zeros):
        majority, minority = ones, zeros
    else:
        majority, minority = zeros, ones

    majority_downsampled = resample(
        majority,
        replace=False,            # sample without replacement
        n_samples=len(minority),  # match the minority class size
        random_state=123,         # reproducible results
    )
    return pd.concat([majority_downsampled, minority], ignore_index=True)
# Balance the two classes, then report how many emails of each class remain.
balanced_emails = downsamp_maj(raw_df, 'target')
s = balanced_emails.target.value_counts()
# value_counts() orders its result by frequency, not by label, so the counts
# are looked up by label (1 = spam, 0 = ham) rather than by position; a class
# that is missing altogether is reported as 0 instead of raising IndexError.
print('There are', s.get(1, 0), 'spam emails and', s.get(0, 0), 'ham emails in the dataset')
def tt_split(df, text_col='email', target_col='target', test_size=0.3, random_state=5):
    """Split one text column and one label column into train and test parts.

    Args:
        df: frame holding the text and label columns.
        text_col: name of the column used as model input.  The default is
            the ``email`` column; pass another column name to split a
            different column instead.
        target_col: name of the label column.
        test_size: share of the rows reserved for the test part.
        random_state: seed that makes the split reproducible.

    Returns:
        ``(xtrain, xtest, ytrain, ytest)``.
    """
    xtrain, xtest, ytrain, ytest = train_test_split(
        df[text_col],
        df[target_col],
        test_size=test_size,
        random_state=random_state,
    )
    return xtrain, xtest, ytrain, ytest
# Split the balanced emails into training and test parts.
xtrain, xtest, ytrain, ytest = tt_split(balanced_emails)
def tfid_vec(train, test, transformer=None):
    """Turn the training and test texts into TF-IDF matrices.

    Args:
        train: training texts; the vectorizer is fitted on these only.
        test: test texts; transformed with the vocabulary learned from
            *train*.
        transformer: optional vectorizer to fit and use.  When omitted, a
            ``TfidfVectorizer`` covering unigrams and bigrams with
            ``min_df=5`` is created internally.  That internal instance is
            not returned, so pass your own object when the fitted
            vectorizer is needed afterwards, for example to transform new
            text or to save it next to the model.

    Returns:
        ``(train_tfidf, test_tfidf)``.
    """
    if transformer is None:
        # Default: n-grams up to length 2, ignoring terms seen in < 5 documents.
        transformer = TfidfVectorizer(ngram_range=(1, 2), min_df=5)
    train_tfidf = transformer.fit_transform(train)
    test_tfidf = transformer.transform(test)
    return train_tfidf, test_tfidf
# Vectorize both splits, then fit the two classifiers on the training matrix.
xtrain_tfidf, xtest_tfidf = tfid_vec(xtrain, xtest)

mymodelNB = MultinomialNB()
mymodelNB.fit(xtrain_tfidf, ytrain)

mymodelKNN = KNeighborsClassifier()
mymodelKNN.fit(xtrain_tfidf, ytrain)
def evaluate_model(model, xtest, ytest):
    """Print accuracy and a per-class report for a fitted classifier.

    Args:
        model: fitted estimator exposing ``predict``.
        xtest: test feature matrix passed to ``model.predict``.
        ytest: true labels for *xtest* (0 = ham, 1 = spam).

    Returns:
        ``(score, model_name)``: the accuracy and the estimator's class name.
    """
    ypreds = model.predict(xtest)
    model_name = type(model).__name__
    print('\n---------', model_name, '---------\n')

    score = metrics.accuracy_score(ytest, ypreds)
    print('\n\nAccuracy score of ', score)

    # NOTE: the confusion-matrix plot (plot_matrix) is disabled.  The bare
    # plt.figure() that preceded it was dropped because it opened a figure
    # that was never drawn on or closed.

    # target_names are matched to the labels in the order given by `labels`:
    # 0 is ham and 1 is spam, so 'Ham' must come first.
    print(metrics.classification_report(
        ytest,
        ypreds,
        labels=[0, 1],
        target_names=['Ham', 'Spam'],
    ))
    return score, model_name
# Evaluate the Naive Bayes model on the held-out TF-IDF matrix.
score, model_name = evaluate_model(mymodelNB, xtest_tfidf, ytest)