产品分类问题,模型应该显示产品类别,但显示数值

问题描述 投票:0回答:1

火车.py

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import LabelEncoder
import joblib
import numpy as np
nltk.download('stopwords')

stop = stopwords.words('english')
porter = PorterStemmer()


def preprocess_data(text):
    ''' The function to remove punctuation, stopwords, and apply stemming'''
    words = re.sub("[^a-zA-Z]", " ", text)
    words = [word.lower() for word in words.split()
             if word.lower() not in stop]
    words = [porter.stem(word) for word in words]
    return " ".join(words)


# Read the data
df = pd.read_csv('ifound_cat.csv')
df['Description'] = df['Description'].apply(preprocess_data)

# Flattening and Encoding the Labels
df['MainCategory'] = df['Category'].apply(lambda x: x.split(' > ')[0])
label_encoder = LabelEncoder()
df['EncodedCategory'] = label_encoder.fit_transform(df['MainCategory'])

# Split the data
X = df['Description']
y = df['EncodedCategory']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)

# Vectorize the data
vectorizer = TfidfVectorizer(min_df=5, ngram_range=(1, 2))
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Apply MiniBatchKMeans for faster clustering
kmeans = MiniBatchKMeans(n_clusters=100, random_state=42)
auto_generated_labels = kmeans.fit_predict(X_train)

# Create a new DataFrame for the auto-generated labels
auto_generated_df = pd.DataFrame(
    {'auto_generated_category': auto_generated_labels})
# code is created by *********JUNAID AHMED MIRANI******
# Concatenate it with the original DataFrame
df = pd.concat([df, auto_generated_df], axis=1)

# Fit the label encoder on all possible labels
all_possible_labels = np.arange(kmeans.n_clusters)
label_encoder.fit(all_possible_labels)
# Train a classification model (e.g., Logistic Regression)
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)

# Save the model
joblib.dump(classifier, 'testing/category_classifier.joblib')
joblib.dump(vectorizer, 'testing/tfidf_vectorizer.joblib')
# Save the KMeans model and label encoder
joblib.dump(kmeans, 'testing/kmeans_model.joblib')
joblib.dump(label_encoder, 'testing/label_encoder.joblib')
# The line joblib.dump(label_encoder.classes_, 'testing/label_encoder_classes.joblib') should be placed
# in your train.py file, right after you fit the LabelEncoder.
#  This line is saving the classes of the LabelEncoder after it has been fit on the training data.
joblib.dump(label_encoder.classes_, 'testing/label_encoder_classes.joblib')

print(f'Model Accuracy: {score}')

预测.py


import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import joblib
from joblib import load
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
import nltk

nltk.download('punkt')

MODELSPATH = 'testing/category_classifier.joblib'
TFIDF_VECTORIZER_PATH = 'testing/tfidf_vectorizer.joblib'
LABEL_ENCODER_PATH = 'testing/label_encoder.joblib'
DATAFILE = 'ifound_cat.csv'

stop = stopwords.words('english')
porter = PorterStemmer()

# Load the trained TF-IDF vectorizer
vectorizer = load(TFIDF_VECTORIZER_PATH)

# Load the trained label encoder
label_encoder = joblib.load(LABEL_ENCODER_PATH)

# print(label_encoder.classes_)


def load_model():
    '''Loading pretrained model'''
    model = load(MODELSPATH)
    return model


def preprocess_data(text):
    ''' Applying stopwords and stemming on raw data'''
    words = word_tokenize(text.lower())
    words = [word for word in words if word not in stop]
    words = [porter.stem(word) for word in words]
    return " ".join(words)


if __name__ == '__main__':
    data = pd.read_csv(DATAFILE)
    model = load_model()

    # Preprocess the description column
    data['Description'] = data['Description'].apply(preprocess_data)

    # Convert descriptions to TF-IDF features
    description_features = vectorizer.transform(data['Description'])

    # Make predictions using the input features
    predictions = model.predict(description_features).astype(int)

    # Transform numerical labels to category names
    predicted_category_names = label_encoder.inverse_transform(predictions)

    # Add predicted category names to the DataFrame
    data['Predicted_Category'] = predicted_category_names

    # Save the data with predicted category names
    data.to_csv('result.csv', index=False)

你能告诉我这个模型有什么问题吗?因为我在获得描述作为输入后运行它,它应该给我产品类别,但它却给我编码的数值

你能告诉我这个模型有什么问题吗?因为我在获得描述作为输入后运行它,它应该给我产品类别,但它却给我编码的数值

python machine-learning nlp text-classification
1个回答
0
投票

您提供的 Predict.py 脚本部分完整,似乎是为加载预先训练的模型和相关组件来进行预测而设计的。

prediction.py 中缺少的组件和注意事项:

预处理功能:脚本缺少在进行预测之前处理新文本数据所需的预处理功能。由于模型是根据预处理数据进行训练的,因此在进行预测之前对任何新数据应用完全相同的预处理步骤至关重要。这应包括小写、删除标点符号、删除停用词和词干提取。

模型加载函数:load_model 函数已定义,但从未在提供的代码中调用。该函数应该用于加载经过训练的分类模型,该模型似乎是基于 train.py 脚本的逻辑回归模型。

预测函数:该脚本不包含实际进行预测的函数。典型的预测函数将采用原始文本输入,使用与训练数据相同的步骤对其进行预处理,使用加载的 TF-IDF 矢量化器对其进行矢量化,然后使用加载的模型进行预测。

标签解码:进行预测后,脚本应使用加载的标签编码器将数值预测转换回原始标签。这对于最终用户的可解释性非常重要。

错误处理:与train.py脚本类似,此代码片段中不存在错误处理。包含 try- except 块或其他错误检查机制来处理加载模型或进行预测时可能出现的问题将是有益的。

输入数据:脚本引用数据文件 DATAFILE = 'ifound_cat.csv' 但不在提供的代码片段中使用它。在完整的预测脚本中,您可以从此文件或其他来源加载新数据,对其进行预处理,然后进行预测。

最新问题
© www.soinside.com 2019 - 2024. All rights reserved.