Switching a binary classification Python scikit-learn model to a multiclass model

Problem description

I'm currently having trouble adapting the following code to a multiclass target variable (3 levels).

# data import
from ucimlrepo import fetch_ucirepo
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import LabelBinarizer

# fetch breast cancer dataset
bc = fetch_ucirepo(id=17)

# data (as pandas dataframes)
bc_X = bc.data.features
bc_y = bc.data.targets

# fetch heart disease dataset
hd = fetch_ucirepo(id=45)
# data (as pandas dataframes)
hd_X = hd.data.features
hd_y = hd.data.targets


# fetch iris dataset
ir = fetch_ucirepo(id=53)
# data (as pandas dataframes)
ir_X = ir.data.features
ir_y = ir.data.targets


# fetch wine quality dataset
wq = fetch_ucirepo(id=186)
# data (as pandas dataframes)
wq_X = wq.data.features
wq_y = wq.data.targets


# Step 2: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(hd_X, hd_y, test_size=0.2, random_state=42)

# cap values function
def cap_values(values):
    # Convert input to a numpy array
    values = np.array(values)
    # Cap values at 1 using numpy's clip function
    capped_values = np.clip(values, None, 1)
    return capped_values

# Find the indices of rows with missing values in X_train
missing_indices = X_train.isnull().any(axis=1)
missing_indices2 = X_test.isnull().any(axis=1)

# Remove rows with missing values from X_train and y_train
X_train = X_train[~missing_indices]
y_train = y_train[~missing_indices]
X_test = X_test[~missing_indices2]
y_test = y_test[~missing_indices2]

# Apply the cap_values function to y_test and y_train
# (this collapses the 0-4 heart disease target into a binary 0/1 label)
y_test = cap_values(y_test)
y_train = cap_values(y_train)

# Step 3: Fit each model to the training data

# Decision Tree
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Naïve Bayes
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

# Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Support Vector Machine
svm_model = SVC(random_state=42)
svm_model.fit(X_train, y_train)

# Step 4: Evaluate the models using the testing data
models = {
    'Decision Tree': dt_model,
    'Naïve Bayes': nb_model,
    'Random Forest': rf_model,
    'Support Vector Machine': svm_model
}

# Initialize a dictionary to store the performance metrics
performance_metrics = {}

# Iterate through each model and calculate the metrics
for name, model in models.items():
    # Get predictions
    y_pred = model.predict(X_test)
    y_pred_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else model.decision_function(X_test)

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    ConfusionMatrixDisplay(cm, display_labels=model.classes_).plot(cmap=plt.cm.Blues)
    plt.title(f'Confusion Matrix: {name}')
    plt.show()

    # Calculate sensitivity (recall) and specificity
    if cm.shape == (2, 2):
      # Binary classification metrics
      tn, fp, fn, tp = cm.ravel()
      sensitivity = tp / (tp + fn)
      specificity = tn / (tn + fp)
    else:
      # Multi-class classification metrics
      sensitivity = recall_score(y_test, y_pred, average='macro')
      specificity = None

    # Calculate balanced accuracy
    balanced_accuracy = (sensitivity + specificity) / 2

    # Calculate precision, recall, and F1-score
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    # Calculate AUC
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
    auc = roc_auc_score(y_test, y_pred_prob)

    # Store the performance metrics
    performance_metrics[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Sensitivity': sensitivity,
        'Specificity': specificity,
        'Balanced Accuracy': balanced_accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'AUC': auc
    }

    # Plot ROC curve
    plt.figure()
    plt.plot(fpr, tpr, label=f'{name} (AUC = {auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')  # Diagonal line
    plt.title(f'ROC Curve: {name}')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.show()

# Display the performance metrics in a DataFrame
metrics_df = pd.DataFrame(performance_metrics).transpose()
print(metrics_df)

I need accuracy, sensitivity, specificity, balanced accuracy, precision, recall, F1 score, and AUC, but most importantly the ROC plot for each model. This works perfectly for binary classification, so I'm confused about why it doesn't work for multiclass. Flattening the variable doesn't work.

Thanks for your help.

I tried flattening the variable, I tried a for loop running 3 separate binary classifications and averaging the results of each, and I tried splitting the data into 3 datasets (each with only 2 classes, so each is essentially a binary classification problem).
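
For context, sklearn's roc_curve only accepts a binary y_true, which is why the ROC step fails once the target has 3 levels. The usual workaround is to binarize the target and compute one curve per class (one-vs-rest). A minimal sketch of that pattern, reusing the names from the code above (assuming y_test still holds the original multiclass labels, i.e. without the cap_values step, and model is any fitted classifier with predict_proba):

# One-vs-rest ROC sketch: binarize the multiclass target, one column per class
lb = LabelBinarizer()
y_test_bin = lb.fit_transform(y_test)   # shape: (n_samples, n_classes)
y_prob = model.predict_proba(X_test)    # shape: (n_samples, n_classes)

for i, cls in enumerate(lb.classes_):   # one ROC curve per class
    fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_prob[:, i])
    plt.plot(fpr, tpr, label=f'class {cls}')
plt.legend()
plt.show()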

python classification svm decision-tree roc
1 Answer

I modified the code to do the following:

  • For each class, get its ROC curve, then average these curves together to obtain a single curve for the model
  • Added a macro specificity score
  • Reworked the code to work with both binary and multiclass labels (defining a new function to help with this)

If you have binary data, it's worth double-checking the results against sklearn; that will make sure everything is consistent. I have checked the overall metrics.
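
As a concrete example of such a check: for binary labels, the specificity of the positive class is just the recall of the negative class, which sklearn exposes directly through pos_label. A small self-contained illustration with made-up labels:

import numpy as np
from sklearn.metrics import recall_score

y_true = np.array([0, 0, 1, 1, 1, 0])
y_pred = np.array([0, 1, 1, 1, 0, 0])

sensitivity = recall_score(y_true, y_pred)               # TP / (TP + FN) = 2/3
specificity = recall_score(y_true, y_pred, pos_label=0)  # TN / (TN + FP) = 2/3
print(sensitivity, specificity)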


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from ucimlrepo import fetch_ucirepo

from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
)
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, roc_auc_score

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

# fetch heart disease dataset
hd = fetch_ucirepo(id=45)
# data (as pandas dataframes)
hd_X_orig = hd.data.features
hd_y_orig = hd.data.targets

#Remove rows with any missing feature
# Find the indices of rows with missing values in X_train
# Remove rows with missing values from X_train and y_train
missing_indices = hd_X_orig.isnull().any(axis=1)
hd_X = hd_X_orig[~missing_indices].copy()
hd_y = hd_y_orig[~missing_indices].copy()

# Step 2: Split the data into training and testing sets
train_frac, val_frac = 0.7, 0.2
test_frac = 1 - (train_frac + val_frac)

train_val_ixs, test_ixs = train_test_split(
    range(len(hd_X)), test_size=test_frac, random_state=0
)

# The second split sees only the train+val subset, so rescale val_frac accordingly
train_ixs, val_ixs = train_test_split(
    train_val_ixs, test_size=val_frac / (1 - test_frac), random_state=0
)

print(
    'Partition sizes are:',
    f'train={train_frac} ({len(train_ixs)} samples) | '
    f'val={val_frac} ({len(val_ixs)} samples) | '
    f'test={test_frac:.1f} ({len(test_ixs)} samples)'
)

X_train, X_val, X_test = [hd_X.iloc[ixs] for ixs in [train_ixs, val_ixs, test_ixs]]
y_train, y_val, y_test = [hd_y.iloc[ixs].values.ravel() for ixs in [train_ixs, val_ixs, test_ixs]]

# Step 3: Fit each model to the training data

# Decision Tree
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Naïve Bayes
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

# Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Support Vector Machine
svm_model = SVC(probability=True, random_state=42)
svm_model.fit(X_train, y_train)

# Step 4: Evaluate the models using the testing data
models = {
    'Decision Tree': dt_model,
    'Naïve Bayes': nb_model,
    'Random Forest': rf_model,
    'Support Vector Machine': svm_model
}

# Initialize a dictionary to store the performance metrics
performance_metrics = {}

#
# Returns macro metrics for any number of classes (can be binary or multiclass)
# Returns tuple: (
#     global TN, global FP, global FN, global TP,
#     macro sensitivity, macro specificity, macro precision, macro f1,
#     macro accuracy, macro balanced accuracy, subset accuracy,
#     supports
# )
#
def macro_metrics(y_true, y_pred):
    classes = np.unique(y_true)
    
    tp_perclass = []
    fp_perclass = []
    tn_perclass = []
    fn_perclass = []
    sensitivity_perclass = []
    specificity_perclass = []
    precision_perclass = []
    f1_perclass = []
    accuracy_perclass = []
    balanced_accuracy_perclass = []
    support_perclass = []
    
    #For each class, get its metrics
    for clas in classes:
        #Convert to binary True/False for this class
        class_true = y_true == clas
        class_pred = y_pred == clas
        
        tp = (class_pred & class_true).sum()
        tn = (~class_pred & ~class_true).sum()
        
        fp = (class_pred & ~class_true).sum()
        fn = (~class_pred & class_true).sum()
        
        eps = 1e-6 #prevent division by 0
        sensitivity = tp / (tp + fn + eps)
        specificity = tn / (tn + fp + eps)
        precision = tp / (tp + fp + eps)
        f1 = 2 * precision * sensitivity / (precision + sensitivity + eps)
        accuracy = (tp + tn) / (tp + tn + fp + fn)
        balanced_accuracy = (sensitivity + specificity) / 2
        
        #Record metrics for each class
        tp_perclass.append(tp)
        tn_perclass.append(tn)
        fp_perclass.append(fp)
        fn_perclass.append(fn)
        
        sensitivity_perclass.append(sensitivity)
        specificity_perclass.append(specificity)
        precision_perclass.append(precision)
        f1_perclass.append(f1)
        accuracy_perclass.append(accuracy)
        balanced_accuracy_perclass.append(balanced_accuracy)
        support_perclass.append(class_true.sum())
    
    # This is equivalent to sklearn's accuracy_score() for multiclass data
    subset_accuracy = (y_pred == y_true).mean()
    
    #Return macro metrics, i.e. a simple average over each class's score
    return (
        #Global TN, FP, FN, TP
        sum(tn_perclass), sum(fp_perclass), sum(fn_perclass), sum(tp_perclass),
        
        #Macro metrics
        np.mean(sensitivity_perclass),
        np.mean(specificity_perclass),
        np.mean(precision_perclass),
        np.mean(f1_perclass),
        np.mean(accuracy_perclass),
        np.mean(balanced_accuracy_perclass),
        
        #like accuracy_score():
        subset_accuracy,
        
        #Supports
        support_perclass
    )
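
# Example (illustrative): on y_true=[0, 1, 2, 2], y_pred=[0, 2, 2, 2], the
# subset accuracy (return value index 10) is 0.75, since 3 of 4 predictions
# match; each macro score is the mean of the per-class one-vs-rest values.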

# Create a shared axes for the ROC curves up front; close the figure so it is
# not rendered by the plt.show() calls inside the loop
roc_ax = plt.figure(figsize=(7, 3)).add_subplot()
plt.close()

# Iterate through each model and calculate the metrics

for name, model in models.items():
    # Get predictions
    y_pred = model.predict(X_val)
    y_pred_probs = model.predict_proba(X_val)

    # Confusion matrix
    cm = confusion_matrix(y_val, y_pred)
    ConfusionMatrixDisplay(cm, display_labels=model.classes_).plot(
        cmap=plt.cm.Blues, colorbar=False
    )
    plt.title(f'Confusion Matrix: {name}', fontsize=10)
    plt.gcf().set_size_inches(2.5, 2.5)
    plt.show()

    # Binary/multiclass classification metrics
    tn, fp, fn, tp, macro_sens, macro_spec, macro_prec, macro_f1, macro_acc,\
        macro_bacc, acc, supports = macro_metrics(y_val, y_pred)

    # Confirm against sklearn's classification_report(); set to True to print
    # the raw macro_metrics() output for manual checking
    if False:
        print('tn:', tn, 'fp:', fp, 'fn:', fn, 'tp:', tp,
            '\nmacro sens:', macro_sens,
            '\nmacro spec:', macro_spec,
            '\nmacro prec:', macro_prec,
            '\nmacro f1:', macro_f1,
            '\nmacro acc:', macro_acc,
            '\nmacro bacc:', macro_bacc,
            '\nacc:', acc,
            '\nsupports:', supports
            )
    
    #sklearn's classification_report() for various class-wise & averaged metrics
    print( classification_report(y_val, y_pred, digits=6, zero_division=0) )

    # Calculate precision, recall, and F1-score
    precision = precision_score(y_val, y_pred, average='macro', zero_division=0)
    recall = recall_score(y_val, y_pred, average='macro')
    f1 = f1_score(y_val, y_pred, average='macro')

    # Calculate AUC
    y_val_onehot = pd.get_dummies(y_val, dtype=int).values
    #Note that "auc" below is still affected by class imbalance. See its doc page for info.
    auc = roc_auc_score(y_val_onehot, y_pred_probs, average='macro', multi_class='ovr')
    
    #ROC averaged over classes
    # Get the ROC for each label, and simply average together on a common axis
    common_roc_fpr = np.linspace(0, 1, 100)
    tprs_perclass = []
    
    for i, clas in enumerate(model.classes_):
        # predict_proba columns follow model.classes_ order, so index by position
        fpr, tpr, thresholds = roc_curve(y_val == clas, y_pred_probs[:, i])
        tprs_perclass.append(
            np.interp(common_roc_fpr, fpr, tpr)
        )
    macro_roc_tpr = np.vstack(tprs_perclass).mean(axis=0)
    
    # Store the performance metrics
    performance_metrics[name] = {
        'Accuracy': accuracy_score(y_val, y_pred),
        # 'Specificity': specificity, #removed - same as recall
        'Balanced Accuracy': macro_bacc,
        'Precision': precision,
        'Recall': recall,
        'Specificity': macro_spec, #added
        'F1 Score': f1,
        'AUC': auc #a macro average
    }
    
    # Add ROC curve
    roc_ax.plot(common_roc_fpr, macro_roc_tpr, label=f'{name} (AUC={auc:.2f})')
    
roc_ax.plot([0, 1], [0, 1], 'k--')  # Diagonal line
roc_ax.set(xlim=(-0.01, 1.01), ylim=(-0.01, 1.01))
roc_ax.set_title('ROC Curves')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.legend(fontsize=9)
# The ROC figure was closed earlier so it would not render inside the loop;
# display it explicitly now (in a notebook)
display(roc_ax.figure)

# Display the performance metrics in a DataFrame
metrics_df = pd.DataFrame(performance_metrics).transpose()
display(metrics_df)
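
Since the question mentions a 3-level target, the same script should run unchanged on a genuinely 3-class dataset. For instance (a hypothetical swap, reusing the iris dataset that the question already fetches):

# Hypothetical swap: point the same variable names at the 3-class iris dataset
ir = fetch_ucirepo(id=53)
hd_X_orig = ir.data.features
hd_y_orig = ir.data.targets
# ...then rerun the rest of the script from the missing-value step onward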