SVM 模型将错误的数据点突出显示为支持向量

问题描述 投票:0回答:1

我正在研究 SVM 模型作为作业。无论我做什么,模型都会选择错误的数据点作为支持向量: scatter svm

这是我的数据: csv 数据

这是我的代码:

def decision_hyperplane(clf, x, y=None, dimension=2):
    """
    Return a decision line (dimension 2, return y based on x) or a
    decision plane (dimension 3, return z based on x and y).

    Decision plane equation is wx + b = 0, so in 2d case:
    w.dot(x) + b = w_x * x + w_y * y + b = 0
    y = (-w_x * x - b) / w_y

    In the 3d case the same equation is solved for z:
    z = (-w_x * x - w_y * y - b) / w_z

    Args:
        clf: Fitted linear model; only ``clf.coef_[0]`` and
            ``clf.intercept_[0]`` are read.
        x: First coordinate(s), a scalar or an array.
        y: Second coordinate(s); required when ``dimension == 3`` and
            ignored when ``dimension == 2``.
        dimension: 2 for a decision line, 3 for a decision plane.

    Returns:
        The remaining coordinate (y in 2d, z in 3d) of the points lying on
        the decision boundary, computed element-wise from the inputs.

    Raises:
        ValueError: If ``dimension`` is neither 2 nor 3, or if
            ``dimension == 3`` and ``y`` is not given.
    """
    # Validate first so a bad call fails loudly instead of returning None
    # (unsupported dimension) or failing on ``float * None`` (missing y).
    if dimension not in (2, 3):
        raise ValueError(f"dimension must be 2 or 3, got {dimension!r}")
    if dimension == 3 and y is None:
        raise ValueError("y is required when dimension == 3")

    weights = clf.coef_[0]
    bias = clf.intercept_[0]

    # NOTE: a zero last weight (boundary parallel to the solved axis) is not
    # handled here; the division below is left to behave as the operand
    # types dictate.
    if dimension == 2:
        return (-bias - weights[0] * x) / weights[1]
    return (-bias - weights[0] * x - weights[1] * y) / weights[2]


# Script: load two feature columns plus the class column from a CSV, fit a
# linear SVC on an 80/20 train/test split, then plot the test-set
# predictions, the decision line and the model's support vectors.
file_path = 'cell_samples.csv'  # Replace with the actual file path
df = pd.read_csv(file_path)
x_column= 'UnifSize'
y_column= 'UnifShape'
class_name = 'Class'
# Two-column feature frame (x_column, y_column) and the label series.
X = df[[x_column, y_column]]
y = df[class_name]

# Hold out 20% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=255)

clf = svm.SVC(kernel='linear', C=100000)
# Fit on the training split only; predictions are made for the test split.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Scatter the test points only, grouped by *predicted* class value (2 or 4).
# No training points are drawn anywhere in this script.
plt.scatter(X_test[x_column][y_pred == 2], X_test[y_column][y_pred == 2], label='Type 1', c='blue', marker='o')
plt.scatter(X_test[x_column][y_pred == 4], X_test[y_column][y_pred == 4], label='Type 2', c='red', marker='s')

# NOTE(review): w, b, a and yy are not used by any later line of this script;
# the line that is actually drawn comes from decision_hyperplane() below.
# `a` is also -(w_y / w_x), not the -(w_x / w_y) slope that
# decision_hyperplane() derives, so yy would not match the plotted line.
w = clf.coef_[0] # weight vector of the hyperplane
b = clf.intercept_ # intercept term of the hyperplane
a = -(w[1] / w[0])
# x positions for the decision line, spanning the range of the first test column.
xx = np.linspace(X_test.iloc[:, 0].min(), X_test.iloc[:, 0].max())
yy = a * xx - (b / w[0]) # unused, see note above

# Plot the decision boundary (hyperplane) as y = f(xx).
hyperplane = decision_hyperplane(clf, xx)
plt.plot(xx, hyperplane, linewidth=2, color='black')

# Reorder the support vectors by |decision_function| (ascending).
# NOTE(review): nothing is sliced off here, so this does not keep "only the
# closest" vectors, and np.unique below returns its rows sorted, which
# discards this ordering. The net effect is every distinct support vector.
vectors = clf.support_vectors_[np.abs(clf.decision_function(clf.support_vectors_)).argsort()]
# Drop duplicate rows (several training samples can share the same coordinates).
vectors = np.unique(vectors, axis=0)

# NOTE(review): the model was fit on X_train, so its support vectors are
# training samples, while only test points are scattered above. A highlighted
# circle can therefore appear where no plotted point exists.
plt.scatter(vectors[:, 0],
            vectors[:, 1],
            s=100, facecolors='none',
            linewidth=1,
            edgecolors='k', alpha=.5,
            marker='o', label='Support Vectors'
            )

# Plot description: title, axis labels and legend.
plt.title('Scatter plot with SVM Decision Boundary')
plt.xlabel(x_column)
plt.ylabel(y_column)
plt.legend(loc='upper right')

# Show the plot
plt.show()

从散点图中可以很容易地看出,支持向量应该只有 4 个。

我尝试将 C 值设为 100000,但程序运行变得非常慢,需要很长时间才能完成。

python scikit-learn svm
1个回答
0
投票

支持向量是从训练数据中学习得到的,但代码中只绘制了测试点,因此被高亮的支持向量与图中显示的数据点对不上。

您可以使用默认设置(C=1)运行 SVC,并获得以下结果,其中包括测试数据和训练数据:

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn import svm
from sklearn.model_selection import train_test_split

def decision_hyperplane(clf, x, y=None, dimension=2):
    """
    Return a decision line (dimension 2, return y based on x) or a
    decision plane (dimension 3, return z based on x and y).

    Decision plane equation is wx + b = 0, so in 2d case:
    w.dot(x) + b = w_x * x + w_y * y + b = 0
    y = (-w_x * x - b) / w_y

    In the 3d case the same equation is solved for z:
    z = (-w_x * x - w_y * y - b) / w_z

    Args:
        clf: Fitted linear model; only ``clf.coef_[0]`` and
            ``clf.intercept_[0]`` are read.
        x: First coordinate(s), a scalar or an array.
        y: Second coordinate(s); required when ``dimension == 3`` and
            ignored when ``dimension == 2``.
        dimension: 2 for a decision line, 3 for a decision plane.

    Returns:
        The remaining coordinate (y in 2d, z in 3d) of the points lying on
        the decision boundary, computed element-wise from the inputs.

    Raises:
        ValueError: If ``dimension`` is neither 2 nor 3, or if
            ``dimension == 3`` and ``y`` is not given.
    """
    # Validate first so a bad call fails loudly instead of returning None
    # (unsupported dimension) or failing on ``float * None`` (missing y).
    if dimension not in (2, 3):
        raise ValueError(f"dimension must be 2 or 3, got {dimension!r}")
    if dimension == 3 and y is None:
        raise ValueError("y is required when dimension == 3")

    weights = clf.coef_[0]
    bias = clf.intercept_[0]

    # NOTE: a zero last weight (boundary parallel to the solved axis) is not
    # handled here; the division below is left to behave as the operand
    # types dictate.
    if dimension == 2:
        return (-bias - weights[0] * x) / weights[1]
    return (-bias - weights[0] * x - weights[1] * y) / weights[2]


# Script: fit a linear SVC on an 80/20 split of two feature columns, then plot
# BOTH the training and the test points (by predicted class), the decision
# line, and the support vectors. Support vectors are training samples, so the
# training points must be drawn for the highlights to land on visible points.
file_path = '../cell_samples.csv'  # Replace with the actual file path
df = pd.read_csv(file_path)
x_column = 'UnifSize'
y_column = 'UnifShape'
class_name = 'Class'

X = df[[x_column, y_column]]
y = df[class_name]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=255)

# Default C (no explicit value) instead of an extreme one.
clf = svm.SVC(kernel='linear')
clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train)
y_pred = clf.predict(X_test)

# One row per scatter layer:
# (features, predicted labels, class value, legend label, colour, marker, size).
# Training points are large and dark, test points small and light, so both
# stay distinguishable when they share the same coordinates.
point_groups = [
    (X_train, y_pred_train, 2, 'train Type 1', 'darkblue', 'o', 50),
    (X_train, y_pred_train, 4, 'train Type 2', 'darkred', 's', 50),
    (X_test, y_pred, 2, 'test Type 1', 'dodgerblue', 'o', 13),
    # Was mislabelled 'train Type 2' in the legend although it shows test points.
    (X_test, y_pred, 4, 'test Type 2', 'tomato', 's', 13),
]
for features, predicted, class_value, label, colour, marker, size in point_groups:
    mask = predicted == class_value
    plt.scatter(features[x_column][mask], features[y_column][mask],
                label=label, c=colour, marker=marker, s=size)

# Plot the decision boundary (hyperplane) across the x-range of all plotted
# points (train and test), not just the test subset.
xx = np.linspace(X[x_column].min(), X[x_column].max())
hyperplane = decision_hyperplane(clf, xx)
plt.plot(xx, hyperplane, linewidth=2, color='black')

# Highlight every distinct support vector. np.unique(axis=0) removes duplicate
# rows and returns them sorted, so any prior reordering would be discarded.
vectors = np.unique(clf.support_vectors_, axis=0)

# Drawn behind the data points (zorder=-1) as large translucent discs.
plt.scatter(vectors[:, 0],
            vectors[:, 1],
            s=220,
            color='black',
            linewidth=3,
            edgecolors='none', alpha=.30,
            marker='o', label='Support Vectors',
            zorder=-1)

# Plot description: title, axis labels, figure-level legend and figure size.
plt.title('Scatter plot with SVM Decision Boundary')
plt.xlabel(x_column)
plt.ylabel(y_column)
plt.gcf().legend(loc='lower left', ncols=3, bbox_to_anchor=(0.12, 0.1), fontsize=9)
plt.gcf().set_size_inches(8, 3.5)

# Show the plot
plt.show()
© www.soinside.com 2019 - 2024. All rights reserved.