我正在研究 SVM 模型作为作业。无论我做什么,模型都会选择错误的数据点作为支持向量:
这是我的数据: csv 数据
这是我的代码:
def decision_hyperplane(clf, x, y=None, dimension=2):
    """Return the last coordinate of a linear decision boundary.

    The boundary satisfies ``w.dot(p) + b = 0``.  In the 2D case this is
    ``w_x * x + w_y * y + b = 0``, hence ``y = (-w_x * x - b) / w_y``; the
    3D case solves the analogous equation for ``z``.

    Args:
        clf: Fitted linear model exposing ``coef_`` (indexed as
            ``coef_[0][i]``) and ``intercept_`` (indexed as ``intercept_[0]``).
        x: First coordinate(s); a scalar or anything supporting arithmetic
            with the coefficients (e.g. a NumPy array).
        y: Second coordinate(s); required only when ``dimension`` is 3.
        dimension: 2 for a decision line, 3 for a decision plane.

    Returns:
        ``y`` as a function of ``x`` (dimension 2) or ``z`` as a function of
        ``x`` and ``y`` (dimension 3).

    Raises:
        TypeError: If ``dimension`` is 3 and ``y`` is not supplied.
        ValueError: If ``dimension`` is neither 2 nor 3.

    Note:
        The result is obtained by dividing by the last coefficient, so that
        coefficient must be non-zero.
    """
    weights = clf.coef_[0]
    bias = clf.intercept_[0]

    if dimension == 2:
        return (-bias - weights[0] * x) / weights[1]

    if dimension == 3:
        if y is None:
            # Fail with a clear message instead of an opaque arithmetic error.
            raise TypeError("y is required when dimension is 3")
        return (-bias - weights[0] * x - weights[1] * y) / weights[2]

    # Previously fell through and returned None, which only surfaced later
    # as a confusing plotting error.
    raise ValueError(f"dimension must be 2 or 3, got {dimension!r}")
# Load the samples and select two feature columns plus the class label.
file_path = 'cell_samples.csv' # Replace with the actual file path
df = pd.read_csv(file_path)
x_column= 'UnifSize'
y_column= 'UnifShape'
class_name = 'Class'
X = df[[x_column, y_column]]
y = df[class_name]
# Hold out 20% of the rows for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=255)
# Linear-kernel SVC with a very large regularization parameter C.
clf = svm.SVC(kernel='linear', C=100000)
# Fit the model on the training split only.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Scatter the TEST points, grouped by predicted label (2 or 4), not by y_test.
# NOTE(review): no training points are drawn anywhere in this script.
plt.scatter(X_test[x_column][y_pred == 2], X_test[y_column][y_pred == 2], label='Type 1', c='blue', marker='o')
plt.scatter(X_test[x_column][y_pred == 4], X_test[y_column][y_pred == 4], label='Type 2', c='red', marker='s')
# NOTE(review): w, b, a and yy below are computed but never used afterwards
# in this snippet; the plotted line comes from decision_hyperplane() instead.
w = clf.coef_[0] # parameters of the hyperplane
b = clf.intercept_ # hyperplane intercept
a = -(w[1] / w[0])
# x-range of the boundary line is taken from the test split's first column.
xx = np.linspace(X_test.iloc[:, 0].min(), X_test.iloc[:, 0].max())
yy = a * xx - (b / w[0]) # getting corresponding y-points
# Plot the decision boundary (hyperplane).
hyperplane = decision_hyperplane(clf, xx)
plt.plot(xx, hyperplane, linewidth=2, color='black')
# Order the support vectors by |decision_function|. This only reorders them;
# no vector is filtered out, so all support vectors are still highlighted.
# NOTE(review): clf was fitted on X_train, so these vectors are training
# points, while the scatter above shows only test points.
vectors = clf.support_vectors_[np.abs(clf.decision_function(clf.support_vectors_)).argsort()]
# Keep unique rows only (no duplicate coordinates).
vectors = np.unique(vectors, axis=0)
# Draw hollow circles around the support-vector coordinates.
plt.scatter(vectors[:, 0],
vectors[:, 1],
s=100, facecolors='none',
linewidth=1,
edgecolors='k', alpha=.5,
marker='o', label='Support Vectors'
)
# Plot description: title, axis labels and legend.
plt.title('Scatter plot with SVM Decision Boundary')
plt.xlabel(x_column)
plt.ylabel(y_column)
plt.legend(loc='upper right')
# Show the plot
plt.show()
从散点图中,你可以很容易地看出支持向量只有4个。
我尝试将 C 设为 100000,但程序运行非常缓慢,需要很长时间才能完成。
支持向量是从训练数据中学得的,但您的代码只绘制了测试点,因此图中被圈出的支持向量与所绘制的数据点对不上,看起来就像模型选错了点。
您可以使用默认设置(C=1)运行 SVC,并获得以下结果,其中包括测试数据和训练数据:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn import svm
from sklearn.model_selection import train_test_split
def decision_hyperplane(clf, x, y=None, dimension=2):
    """Return the last coordinate of a linear decision boundary.

    The boundary satisfies ``w.dot(p) + b = 0``.  In the 2D case this is
    ``w_x * x + w_y * y + b = 0``, hence ``y = (-w_x * x - b) / w_y``; the
    3D case solves the analogous equation for ``z``.

    Args:
        clf: Fitted linear model exposing ``coef_`` (indexed as
            ``coef_[0][i]``) and ``intercept_`` (indexed as ``intercept_[0]``).
        x: First coordinate(s); a scalar or anything supporting arithmetic
            with the coefficients (e.g. a NumPy array).
        y: Second coordinate(s); required only when ``dimension`` is 3.
        dimension: 2 for a decision line, 3 for a decision plane.

    Returns:
        ``y`` as a function of ``x`` (dimension 2) or ``z`` as a function of
        ``x`` and ``y`` (dimension 3).

    Raises:
        TypeError: If ``dimension`` is 3 and ``y`` is not supplied.
        ValueError: If ``dimension`` is neither 2 nor 3.

    Note:
        The result is obtained by dividing by the last coefficient, so that
        coefficient must be non-zero.
    """
    weights = clf.coef_[0]
    bias = clf.intercept_[0]

    if dimension == 2:
        return (-bias - weights[0] * x) / weights[1]

    if dimension == 3:
        if y is None:
            # Fail with a clear message instead of an opaque arithmetic error.
            raise TypeError("y is required when dimension is 3")
        return (-bias - weights[0] * x - weights[1] * y) / weights[2]

    # Previously fell through and returned None, which only surfaced later
    # as a confusing plotting error.
    raise ValueError(f"dimension must be 2 or 3, got {dimension!r}")
# Load the samples and select two feature columns plus the class label.
file_path = '../cell_samples.csv'  # Replace with the actual file path
df = pd.read_csv(file_path)
x_column = 'UnifSize'
y_column = 'UnifShape'
class_name = 'Class'
X = df[[x_column, y_column]]
y = df[class_name]

# Hold out 20% of the rows for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=255)

# Linear-kernel SVC with the default regularization (C=1), fitted on the
# training split only.
clf = svm.SVC(kernel='linear')
clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train)
y_pred = clf.predict(X_test)

# Training points, grouped by predicted label: large, dark markers.
plt.scatter(X_train[x_column][y_pred_train == 2], X_train[y_column][y_pred_train == 2],
            label='train Type 1', c='darkblue', marker='o', s=50)
plt.scatter(X_train[x_column][y_pred_train == 4], X_train[y_column][y_pred_train == 4],
            label='train Type 2', c='darkred', marker='s', s=50)

# Test points, grouped by predicted label: small, light markers drawn on top.
plt.scatter(X_test[x_column][y_pred == 2], X_test[y_column][y_pred == 2],
            label='test Type 1', c='dodgerblue', marker='o', s=13)
# The label here used to read 'train Type 2' (copy-paste slip), which
# produced two identical legend entries for different marker sets.
plt.scatter(X_test[x_column][y_pred == 4], X_test[y_column][y_pred == 4],
            label='test Type 2', c='tomato', marker='s', s=13)

# Decision boundary (hyperplane) over the x-range of the test split.
xx = np.linspace(X_test.iloc[:, 0].min(), X_test.iloc[:, 0].max())
hyperplane = decision_hyperplane(clf, xx)
plt.plot(xx, hyperplane, linewidth=2, color='black')

# Highlight every distinct support-vector coordinate. np.unique returns the
# unique rows in sorted order, so pre-sorting the vectors has no effect.
vectors = np.unique(clf.support_vectors_, axis=0)
plt.scatter(vectors[:, 0],
            vectors[:, 1],
            s=220,
            color='black',
            linewidth=3,
            edgecolors='none', alpha=.30,
            marker='o', label='Support Vectors',
            zorder=-1)  # draw the halos behind the data markers

# Plot description: title, axis labels and a figure-level legend.
plt.title('Scatter plot with SVM Decision Boundary')
plt.xlabel(x_column)
plt.ylabel(y_column)
plt.gcf().legend(loc='lower left', ncols=3, bbox_to_anchor=(0.12, 0.1), fontsize=9)
plt.gcf().set_size_inches(8, 3.5)

# Show the plot
plt.show()