How do I save a trained NLP model so that I can test it on arbitrary data?


I am building an NLP spam/ham classification application, and I have a trained model with high accuracy.

I'm not sure how to save this model for later use so that I can run some arbitrary text against it.

I'm new to NLP/ML, so any help is much appreciated. P.S. You may need to spell things out for me.

I know you're supposed to use pickle, but beyond that I don't know what to do.
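From what I have read, the basic pickle pattern looks something like this (just my rough understanding, with a made-up file name, not yet tied to my code):

import pickle

# 'fitted_object' is a placeholder for whatever trained estimator is being saved
# save a fitted object to disk
with open('saved_model.pkl', 'wb') as f:
    pickle.dump(fitted_object, f)

# load it back later, e.g. in another script
with open('saved_model.pkl', 'rb') as f:
    fitted_object = pickle.load(f)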

Apologies in advance for the long code listing, but I didn't want to leave anything out, and it may help others understand my problem better.


import os
import pandas as pd
import re
import nltk
from string import punctuation
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.utils import resample
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score  
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from numpy.random import RandomState
from sklearn import preprocessing


nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')


path = r"C:\Users\Username\Desktop\Project"

HamData = []
SpamData = []

for root, folders, files in os.walk(path):
    if 'ham' in str(files):
        for file in files:
            with open(root + '\\' + file, encoding="iso8859-1") as hamfile:
                HamData.append(" ".join(hamfile.readlines()))
                
    elif 'spam' in str(files):
        for file in files:
            with open(root + '\\' + file, encoding="iso8859-1") as spamfile:
                SpamData.append(" ".join(spamfile.readlines()))

# AllEmails = HamData + SpamData
# labels = ["ham"]*len(HamData) + ["spam"]*len(SpamData)

# raw_df = pd.DataFrame(
#         {"email": AllEmails,
#         "label": labels}
#         )


def create_dataframe(HamData,SpamData):
    ham_dataframe=pd.DataFrame(HamData,columns =['email'])
    # add target column for ham
    ham_dataframe['target']=0
    
    # do same process for spam
    spam_dataframe=pd.DataFrame(SpamData,columns =['email'])
    # add target column for spam
    spam_dataframe['target']=1
    
    raw_df = pd.concat([ham_dataframe,spam_dataframe])
    #all_emails = all_emails.sample(frac=1).reset_index(drop=True)
    return raw_df

def clean_regex(string):

    # cleaning function that runs several regex search-and-replace steps on each email
    string = re.sub(r'([A-Za-z0-9]+[._-])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Za-z]{2,})',' ',str(string)) # email addresses
    string = re.sub(r'(http|https)://[a-zA-Z0-9\\./]+', ' ', str(string)) # urls
    string = re.sub(r'\d+', '', str(string)) # strip numbers
    string = re.sub(r'<[^<]+?>', '', str(string)) # html <tags>; +? is a lazy match
    string = re.sub(r'[^a-zA-Z]', ' ', string) # replace anything that is not a letter with a space
    string = string.replace('nbsp', '') # drop literal 'nbsp' left over from html entities
    string = string.translate(str.maketrans('', '', punctuation)) # remove punctuation
    string = string.lower() # lower case

    return string

raw_df = create_dataframe(HamData,SpamData)

raw_df['RegEx'] = raw_df['email'].apply(clean_regex)
print(raw_df['RegEx'])

raw_df['Sec_Clean']=raw_df['RegEx'].apply(lambda x: ' '.join([item for item in x.split() if 3 <= len(item) <= 12]))
print(raw_df['Sec_Clean'])

raw_df['Tokenized'] = raw_df.apply(lambda row: nltk.word_tokenize(row['Sec_Clean']), axis=1)
print(raw_df['Tokenized'])


lem = nltk.WordNetLemmatizer()

def lemmatizer(t_text):
    text = [lem.lemmatize(word) for word in t_text]
    return text

raw_df['Lemmatize'] = raw_df['Tokenized'].apply(lambda x: lemmatizer(x))
print(raw_df['Lemmatize'])

print(raw_df)

def removeStopwords(token):
    StopWordsToRemove = set(stopwords.words('english')) # use a set for faster membership checks
    text = [word for word in token if word not in StopWordsToRemove]

    return text


raw_df['StopWordRemoval']=raw_df['Lemmatize'].apply(lambda x: removeStopwords(x))
print(raw_df['StopWordRemoval'])

print(raw_df)


def downsamp_maj(df,target):

    lenclass_1 = len(df[df[target]==1])
    lenclass_0 = len(df[df[target]==0])

    # if class 1 is larger than class 0
    if lenclass_1 > lenclass_0:

        # set class 1 as the majority and class 0 as minority
        df_majority = df[df[target]==1]
        df_minority = df[df[target]==0]
        
        # downsample majority to the number of classes in minority
        df_majority_downsampled = resample(df_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=lenclass_0,     # to match minority class
                                 random_state=123) # reproducible results 

        # join downsampled majortiy and minority into dataframe
        df = pd.concat([df_majority_downsampled, df_minority],ignore_index=True)
        
    # otherwise downsample class 0
    else:

        df_majority = df[df[target]==0]
        df_minority = df[df[target]==1]

        df_majority_downsampled = resample(df_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=lenclass_1,     # to match minority class
                                 random_state=123) # reproducible results 

        # Combine minority class with downsampled majority class
        df = pd.concat([df_majority_downsampled, df_minority],ignore_index=True)
    
    return df

balanced_emails = downsamp_maj(raw_df,'target')

s = balanced_emails.target.value_counts()
print('There are', s[1], 'spam emails and', s[0], 'ham emails in the dataset') # look up counts by target value: 1 = spam, 0 = ham





def tt_split(df):
    
    xtrain, xtest, ytrain, ytest = train_test_split(df['email'], df['target'], test_size=0.3,random_state=5)
    
    return xtrain,xtest,ytrain,ytest

xtrain,xtest,ytrain,ytest = tt_split(balanced_emails)


def tfid_vec(train,test):
    
    # initialize vectorizer - we will check ngrams up to length 2
    transformer = TfidfVectorizer(ngram_range=(1,2),min_df=5)
    
    # use the vectorizer to transform the training and test data to numerical values;
    # the fitted transformer is returned as well, because the same vocabulary is
    # needed later to transform any new/arbitrary text before predicting on it
    train_tfidf = transformer.fit_transform(train)
    test_tfidf = transformer.transform(test)
    
    return transformer,train_tfidf,test_tfidf

transformer,xtrain_tfidf,xtest_tfidf = tfid_vec(xtrain,xtest)

mymodelNB = MultinomialNB().fit(xtrain_tfidf, ytrain)
mymodelKNN = KNeighborsClassifier().fit(xtrain_tfidf,ytrain)
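
# as far as I understand, these fitted models (together with the fitted
# `transformer` returned by tfid_vec above) are the objects I would need
# to save with pickle so that arbitrary text can be classified later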


def evaluate_model(model,xtest,ytest):
    
    # get class predictions
    ypreds=model.predict(xtest)
    
    # set model name
    model_name = type(model).__name__
    
    print('\n---------',model_name,'---------\n')
    
    # accuracy score
    score = metrics.accuracy_score(ytest, ypreds, normalize=True, sample_weight=None)
    print('\n\nAccuracy score of ', score)
    
    # call plot_matrix function (currently commented out)
    classes=['ham','spam'] # order follows the target encoding: 0 = ham, 1 = spam
    #plt.figure()
    #plot_matrix(ytest,ypreds,classes,model_name)
    
    # classification report

    print(metrics.classification_report(ytest, ypreds,target_names=['Ham','Spam'])) # names must follow the label order: 0 = ham, 1 = spam
    
    return score,model_name

score,model_name=evaluate_model(mymodelNB,xtest_tfidf,ytest)
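
This is roughly where I get stuck. My rough idea of the save/load step is below (a minimal, untested sketch: the file name is made up, and it assumes the fitted `transformer` returned by tfid_vec is saved alongside the Naive Bayes model, since any new text has to be vectorized with the same vocabulary before calling predict):

import pickle

# save the fitted vectorizer and the fitted model together
with open('spam_classifier.pkl', 'wb') as f:
    pickle.dump((transformer, mymodelNB), f)

# later, possibly in a different script: load them back
with open('spam_classifier.pkl', 'rb') as f:
    loaded_vec, loaded_model = pickle.load(f)

# run some arbitrary text against the loaded model
sample = ["Congratulations, you have won a free prize, click the link to claim it"]
print(loaded_model.predict(loaded_vec.transform(sample)))  # 0 = ham, 1 = spam

Is that roughly the right approach?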

python pandas machine-learning nlp pickle