如何 pickle(序列化保存)一个训练好的 NLP 模型,以便之后用任意数据对它进行测试

我正在创建一个 NLP SPAM/HAM 分类应用程序,并且我有一个训练有素的模型,具有很高的准确性


我是 NLP/ML 的新手,非常感谢您的帮助。附言:您可能需要用尽量通俗易懂的方式向我解释。

我知道你必须使用 pickle 命令,但除此之外我不知道该怎么做



import os
import pandas as pd
import re
import nltk
from string import punctuation
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.utils import resample
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score  
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from numpy.random import RandomState
from sklearn import preprocessing


# Root folder that is walked recursively for ham/spam email files.
path = r"C:\Users\Username\Desktop\Project"

HamData = []
SpamData = []


def _read_email(file_path):
    """Return the content of one email file, its lines joined with spaces."""
    with open(file_path, encoding="iso8859-1") as handle:
        return " ".join(handle.readlines())


for root, folders, files in os.walk(path):
    # A directory is classified as a whole from the names of the files it
    # holds; 'ham' is tested first, exactly as before.
    listing = str(files)
    if 'ham' in listing:
        bucket = HamData
    elif 'spam' in listing:
        bucket = SpamData
    else:
        continue
    for file in files:
        # os.path.join picks the right separator for the host OS instead of
        # the previous hard-coded Windows backslash.
        bucket.append(_read_email(os.path.join(root, file)))

# AllEmails = HamData + SpamData
# labels = ["ham"]*len(HamData) + ["spam"]*len(SpamData)

# raw_df = pd.DataFrame(
#         {"email": AllEmails,
#         "label": labels}
#         )

def create_dataframe(HamData,SpamData):
    """Combine ham and spam emails into one labelled DataFrame.

    Parameters
    ----------
    HamData : list of str
        Raw text of the ham emails.
    SpamData : list of str
        Raw text of the spam emails.

    Returns
    -------
    pandas.DataFrame
        Columns ``email`` (raw text) and ``target`` (1 for ham, 0 for spam),
        ham rows first, with a fresh 0..n-1 index.
    """
    ham_dataframe = pd.DataFrame(HamData, columns=['email'])
    # Label encoding follows the target_names=['Spam', 'Ham'] ordering used
    # when the classification report is printed: 0 = spam, 1 = ham.
    ham_dataframe['target'] = 1

    spam_dataframe = pd.DataFrame(SpamData, columns=['email'])
    spam_dataframe['target'] = 0

    # ignore_index avoids duplicate index labels (both frames start at 0).
    raw_df = pd.concat([ham_dataframe, spam_dataframe], ignore_index=True)
    return raw_df

def clean_regex(string):
    """Normalise one raw email into lower-case, letters-only text.

    The value is converted with ``str()`` first, so non-string cells do not
    raise. Steps, in order:

    1. email addresses are replaced by a space;
    2. http/https URLs are replaced by a space;
    3. HTML ``<tags>`` are removed;
    4. every character that is not an ASCII letter or whitespace is removed
       (this covers digits and punctuation);
    5. leftover ``nbsp`` entity names are replaced by a space;
    6. the text is lower-cased.

    Parameters
    ----------
    string : object
        Raw email text.

    Returns
    -------
    str
        The cleaned text.
    """
    text = str(string)

    # Email addresses. The separator class is written [._-] so the hyphen is
    # literal: the former [.-_] was a *range* that also matched digits and
    # upper-case letters, which made the nested quantifier backtrack
    # exponentially on long alphanumeric runs.
    text = re.sub(
        r'(?:[A-Za-z0-9]+[._-])*[A-Za-z0-9]+'
        r'@[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)*\.[A-Za-z]{2,}',
        ' ',
        text,
    )

    # URLs (pattern unchanged).
    text = re.sub(r'(http|https)://[a-zA-Z0-9\\./]+', ' ', text)

    # HTML tags; +? is non-greedy so each tag is matched on its own.
    text = re.sub(r'<[^<]+?>', '', text)

    # Drop everything that is not an ASCII letter or whitespace. This was
    # previously attempted with str.replace, which treats the pattern as a
    # literal string and therefore never matched. Whitespace is kept so that
    # words stay separated. Digits and punctuation are removed here as well.
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # '&nbsp;' stands for a space, so substitute one rather than gluing the
    # neighbouring words together.
    text = text.replace('nbsp', ' ')

    return text.lower()

# Assemble the working frame from the emails collected above.
raw_df = create_dataframe(HamData, SpamData)

# Pass 1: regex-based scrubbing of every raw email.
raw_df['RegEx'] = raw_df['email'].apply(clean_regex)


def _keep_mid_length_words(text):
    """Keep only whitespace-separated words that are 3 to 12 characters long."""
    return ' '.join(word for word in text.split() if 3 <= len(word) <= 12)


# Pass 2: discard very short and very long words.
raw_df['Sec_Clean'] = raw_df['RegEx'].apply(_keep_mid_length_words)


def _tokenize_row(row):
    """Tokenize the 'Sec_Clean' text of one DataFrame row with NLTK."""
    return nltk.word_tokenize(row['Sec_Clean'])


# Pass 3: row-wise word tokenization.
raw_df['Tokenized'] = raw_df.apply(_tokenize_row, axis=1)

# Shared lemmatizer instance used by lemmatizer() below.
lem = nltk.WordNetLemmatizer()

def lemmatizer(t_text):
    """Return a list with every token of *t_text* lemmatized.

    Uses the module-level ``lem`` lemmatizer with its default settings.
    """
    return list(map(lem.lemmatize, t_text))

# Lemmatize each row's token list.
raw_df['Lemmatize'] = raw_df['Tokenized'].apply(lemmatizer)


# Lazily-filled cache so the stop-word corpus is loaded once, not once per email.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them on first use."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(nltk.corpus.stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def removeStopwords(token):
    """Return the tokens of *token* that are not English stop words.

    Parameters
    ----------
    token : iterable of str
        Tokens of a single document.

    Returns
    -------
    list of str
        The tokens that are not in NLTK's English stop-word list, in their
        original order.
    """
    # Set membership is O(1); the previous list lookup scanned the whole
    # stop-word list for every token.
    stop_words = _english_stopwords()
    return [word for word in token if word not in stop_words]

# Strip English stop words from each lemmatized token list.
raw_df['StopWordRemoval'] = raw_df['Lemmatize'].apply(removeStopwords)


def downsamp_maj(df,target):
    """Balance a binary dataset by downsampling the majority class.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the label column.
    target : str
        Name of the label column; rows are selected where it equals 0 or 1.

    Returns
    -------
    pandas.DataFrame
        The downsampled majority rows followed by all minority rows, with a
        fresh index. Rows whose label is neither 0 nor 1 are not included.
    """
    class_1 = df[df[target] == 1]
    class_0 = df[df[target] == 0]

    # Exactly one branch runs. Previously the "otherwise" code sat inside the
    # first branch (missing else), so it either asked resample() for more rows
    # than were available or never balanced anything.
    if len(class_1) > len(class_0):
        df_majority, df_minority = class_1, class_0
    else:
        df_majority, df_minority = class_0, class_1

    df_majority_downsampled = resample(df_majority,
                                       replace=False,                # sample without replacement
                                       n_samples=len(df_minority),   # match the minority class
                                       random_state=123)             # reproducible results

    # Combine the minority class with the downsampled majority class.
    return pd.concat([df_majority_downsampled, df_minority], ignore_index=True)

balanced_emails = downsamp_maj(raw_df,'target')

# value_counts() is ordered by frequency, not by label, so the counts are
# looked up by label instead of by position; .get(..., 0) also copes with a
# class that is absent.
# NOTE(review): assumes 0 = spam and 1 = ham, as implied by
# target_names=['Spam','Ham'] in evaluate_model; confirm.
s = balanced_emails.target.value_counts()
print('There are', s.get(0, 0),'spam emails and', s.get(1, 0),'ham emails in the dataset')

def tt_split(df, text_col='email', target_col='target', test_size=0.3, random_state=5):
    """Split a labelled DataFrame into train and test portions.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding the feature and label columns.
    text_col : str, optional
        Column used as the feature. Defaults to the raw ``'email'`` text; pass
        another column (for example a cleaned-text column) to train on that
        instead.
    target_col : str, optional
        Column used as the label. Defaults to ``'target'``.
    test_size : float, optional
        Fraction of rows placed in the test set. Defaults to 0.3.
    random_state : int, optional
        Seed passed to ``train_test_split``. Defaults to 5.

    Returns
    -------
    tuple
        ``(xtrain, xtest, ytrain, ytest)``.
    """
    xtrain, xtest, ytrain, ytest = train_test_split(
        df[text_col], df[target_col], test_size=test_size, random_state=random_state)
    return xtrain, xtest, ytrain, ytest

# Split the balanced emails into train and test features/labels.
xtrain,xtest,ytrain,ytest = tt_split(balanced_emails)

def tfid_vec(train,test):
    """Turn train and test text into TF-IDF feature matrices.

    A vectorizer covering unigrams and bigrams (``min_df=5``) is fitted on
    *train* only and then applied to *test*.

    Returns
    -------
    tuple
        ``(train_tfidf, test_tfidf)``.
    """
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=5)
    # The tuple is evaluated left to right, so the vectorizer is fitted on the
    # training data before the test data is transformed.
    return vectorizer.fit_transform(train), vectorizer.transform(test)

# Vectorize both splits with a vectorizer fitted on the training text.
xtrain_tfidf, xtest_tfidf = tfid_vec(xtrain, xtest)

# Naive Bayes classifier fitted on the TF-IDF training features.
mymodelNB = MultinomialNB()
mymodelNB.fit(xtrain_tfidf, ytrain)

# k-nearest-neighbours classifier fitted on the same features.
mymodelKNN = KNeighborsClassifier()
mymodelKNN.fit(xtrain_tfidf, ytrain)

def evaluate_model(model,xtest,ytest):
    """Print accuracy and a classification report for a fitted model.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    xtest : array-like or sparse matrix
        Test features, in the same representation the model was fitted on.
    ytest : array-like
        True labels for *xtest*.

    Returns
    -------
    tuple
        ``(score, model_name)``: the accuracy and the estimator's class name.
    """
    # Get class predictions. This assignment was missing, so every call
    # failed with NameError on ypreds.
    ypreds = model.predict(xtest)
    # set model name
    model_name = type(model).__name__
    # accuracy score
    score = metrics.accuracy_score(ytest, ypreds, normalize=True, sample_weight=None)
    print('\n\nAccuracy score of ', score)
    # NOTE(review): the original comments mention calling a plot_matrix
    # function here; no such helper is visible in this file.
    # classification report
    print(metrics.classification_report(ytest, ypreds,target_names=['Spam','Ham']))
    return score,model_name


