我试图在一个数据集上应用Kernel Perceptron算法。所以我已经写好了代码并运行了它。它工作正常,但当我试图绘制决策边界时,它需要无限的时间来运行。在这里,我附上代码
# All the import statements
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import random
import pylab as pl
#Kernel Perceptron class where I wrote fit and predict functions
def linear_kernel(x1, x2):
    """Return the plain inner product of ``x1`` and ``x2`` via ``np.dot``."""
    inner_product = np.dot(x1, x2)
    return inner_product
def polynomial_kernel(x, y, p=3):
    """Return ``(1 + <x, y>) ** p``, the inhomogeneous polynomial kernel.

    ``p`` is the exponent applied to the shifted inner product.
    """
    shifted = 1 + np.dot(x, y)
    return shifted ** p
class KernelPerceptron(object):
    """Kernel perceptron trained in dual form (one counter per sample).

    Predictions come from ``np.sign`` and are compared directly against the
    labels during training, so labels are expected to be -1 / +1.

    Parameters
    ----------
    kernel : callable
        ``kernel(a, b)`` evaluated on two sample vectors. Defaults to
        ``linear_kernel``.
    T : int
        Maximum number of passes over the training set.
    """

    def __init__(self, kernel=linear_kernel, T=1):
        self.kernel = kernel
        self.T = T

    def fit(self, X, y):
        """Train on ``X`` (2-D array, one sample per row) with labels ``y``.

        Stores the non-zero mistake counters in ``self.alpha`` and the
        matching samples / labels in ``self.sv`` / ``self.sv_y``, then prints
        the number of support vectors and the number of training samples.

        Raises
        ------
        ValueError
            If ``X.shape`` does not unpack into exactly two dimensions.
        """
        n_samples, _n_features = X.shape
        self.alpha = np.zeros(n_samples, dtype=np.float64)

        # Gram matrix: K[i, j] = kernel(X[i], X[j]), evaluated once up front
        # so the training passes only do array arithmetic.
        K = np.zeros((n_samples, n_samples))
        for i in range(n_samples):
            for j in range(n_samples):
                K[i, j] = self.kernel(X[i], X[j])

        for _ in range(self.T):
            updated = False
            for i in range(n_samples):
                if np.sign(np.sum(K[:, i] * self.alpha * y)) != y[i]:
                    self.alpha[i] += 1.0
                    updated = True
            # A pass without any update leaves alpha untouched, so every
            # later pass would repeat it exactly; stopping here is free.
            if not updated:
                break

        # Support vectors: keep only samples that were ever misclassified.
        sv = self.alpha > 1e-5
        self.alpha = self.alpha[sv]
        self.sv = X[sv]
        self.sv_y = y[sv]
        print(len(self.alpha), n_samples)

    def project(self, X):
        """Return the raw decision value for each row of ``X``.

        Each value is the sum of ``alpha * label * kernel(row, sv)`` over the
        stored support vectors; the kernel is called once per
        (row, support vector) pair.
        """
        y_predict = np.zeros(len(X))
        for i, x in enumerate(X):
            y_predict[i] = sum(
                a * sv_y * self.kernel(x, sv)
                for a, sv_y, sv in zip(self.alpha, self.sv_y, self.sv)
            )
        return y_predict

    def predict(self, X):
        """Return ``np.sign`` of the decision value for each sample in ``X``.

        A single 1-D sample is promoted to a one-row 2-D array first.

        Raises
        ------
        ValueError
            If ``X`` has more than two dimensions.
        """
        X = np.atleast_2d(X)
        if X.ndim != 2:
            raise ValueError("X must be at most 2-dimensional")
        return np.sign(self.project(X))
#Testing on the dataset I have
# Load the CSV and keep the first 1000 rows: columns 0 and 1 are the point
# coordinates, column 2 is the label.
data = pd.read_csv("Dataset_1_Team_35.csv").to_numpy()
points = []
labels = []
for i in range(1000):
    row = data[i]
    points.append([row[0], row[1]])
    labels.append(row[2])
X = np.array(points)
y = np.array(labels)
print(X.shape,y.shape)

# Hold out 20% of the rows; only the training part is used below.
xtr, xts, ytr, yts = train_test_split(X, y, test_size=0.2)
print(xtr.shape,ytr.shape)

# Train with the polynomial kernel for up to 2 passes and report the
# accuracy on the training split itself.
clf = KernelPerceptron(polynomial_kernel, 2)
clf.fit(xtr, ytr)
pred = clf.predict(xtr)
val = accuracy_score(pred, ytr)
print(val)
#Code for plotting the decision boundary
def make_meshgrid(x, y, h=.02):
    """Build a 2-D evaluation grid covering ``x`` and ``y`` with a margin of 1.

    ``h`` is the spacing between neighbouring grid points on both axes.
    Returns the ``(xx, yy)`` coordinate matrices from ``np.meshgrid``.
    """
    x_axis = np.arange(x.min() - 1, x.max() + 1, h)
    y_axis = np.arange(y.min() - 1, y.max() + 1, h)
    xx, yy = np.meshgrid(x_axis, y_axis)
    return xx, yy
def plot_contours(ax, clf, xx, yy, **params):
    """Fill ``ax`` with the regions predicted by ``clf`` over a grid.

    Every grid point of ``xx`` / ``yy`` is passed to ``clf.predict`` as one
    (x, y) row; the predictions are reshaped back to the grid and drawn with
    ``ax.contourf``, forwarding ``params``. Returns what ``contourf`` returns.
    """
    grid_points = np.column_stack((xx.ravel(), yy.ravel()))
    predictions = clf.predict(grid_points)
    Z = predictions.reshape(xx.shape)
    return ax.contourf(xx, yy, Z, **params)
# Draw the predicted regions on a fresh figure, then overlay the training
# points coloured by their label.
fig, ax = plt.subplots()
X0, X1 = xtr[:, 0], xtr[:, 1]
# The grid uses make_meshgrid's default step, and clf.predict is evaluated
# on every grid point.
xx, yy = make_meshgrid(X0, X1)
plot_contours(ax, clf, xx, yy, cmap=plt.cm.coolwarm, alpha=1) #line taking infinite time to load
ax.scatter(X0, X1, c=ytr, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
# The name `title` was never defined, so passing it raised NameError; use a
# literal title instead.
ax.set_title('Kernel Perceptron decision boundary')
于是我试着调试,我试着在jupyter笔记本上逐行运行,发现这一行的
plot_contours(ax, clf, xx, yy, cmap=plt.cm.coolwarm, alpha=1)
正在花费无限的时间来运行?当我尝试在另一个数据集上运行该算法时,它的运行时间较少。
谁能帮我解决这个问题?
如果有人需要数据集,这是一个简单的数据集,里面有1000个条目(点和它们对应的标签)。这是数据集的链接。
不是无限的时间,只是在训练时,你在拟合800个数据点,然后在800个数据点上进行预测,但当你创建一个图时,你有28889748个数据点,因此它需要这么多的时间。
为了在绘图时减少数据点的数量,我建议做两件事:1) 使用 StandardScaler 对数据进行标准化处理;2) 在 make_meshgrid 函数中增大创建网格时的步长(比如从 0.02 增大到 0.2)。
这里是修改后的代码。
# All the import statements
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import random
import pylab as pl
#Kernel Perceptron class where I wrote fit and predict functions
def linear_kernel(x1, x2):
    """Return the plain inner product of ``x1`` and ``x2`` via ``np.dot``."""
    inner_product = np.dot(x1, x2)
    return inner_product
def polynomial_kernel(x, y, p=3):
    """Return ``(1 + <x, y>) ** p``, the inhomogeneous polynomial kernel.

    ``p`` is the exponent applied to the shifted inner product.
    """
    shifted = 1 + np.dot(x, y)
    return shifted ** p
class KernelPerceptron(object):
    """Kernel perceptron trained in dual form (one counter per sample).

    Predictions come from ``np.sign`` and are compared directly against the
    labels during training, so labels are expected to be -1 / +1.

    Parameters
    ----------
    kernel : callable
        ``kernel(a, b)`` evaluated on two sample vectors. Defaults to
        ``linear_kernel``.
    T : int
        Maximum number of passes over the training set.
    """

    def __init__(self, kernel=linear_kernel, T=1):
        self.kernel = kernel
        self.T = T

    def fit(self, X, y):
        """Train on ``X`` (2-D array, one sample per row) with labels ``y``.

        Stores the non-zero mistake counters in ``self.alpha`` and the
        matching samples / labels in ``self.sv`` / ``self.sv_y``, then prints
        the number of support vectors and the number of training samples.

        Raises
        ------
        ValueError
            If ``X.shape`` does not unpack into exactly two dimensions.
        """
        n_samples, _n_features = X.shape
        self.alpha = np.zeros(n_samples, dtype=np.float64)

        # Gram matrix: K[i, j] = kernel(X[i], X[j]), evaluated once up front
        # so the training passes only do array arithmetic.
        K = np.zeros((n_samples, n_samples))
        for i in range(n_samples):
            for j in range(n_samples):
                K[i, j] = self.kernel(X[i], X[j])

        for _ in range(self.T):
            updated = False
            for i in range(n_samples):
                if np.sign(np.sum(K[:, i] * self.alpha * y)) != y[i]:
                    self.alpha[i] += 1.0
                    updated = True
            # A pass without any update leaves alpha untouched, so every
            # later pass would repeat it exactly; stopping here is free.
            if not updated:
                break

        # Support vectors: keep only samples that were ever misclassified.
        sv = self.alpha > 1e-5
        self.alpha = self.alpha[sv]
        self.sv = X[sv]
        self.sv_y = y[sv]
        print(len(self.alpha), n_samples)

    def project(self, X):
        """Return the raw decision value for each row of ``X``.

        Prints how many rows are about to be evaluated, then sums
        ``alpha * label * kernel(row, sv)`` over the stored support vectors
        for each row; the kernel is called once per (row, support vector)
        pair.
        """
        y_predict = np.zeros(len(X))
        print(f'data points len: {len(X)}')
        for i, x in enumerate(X):
            y_predict[i] = sum(
                a * sv_y * self.kernel(x, sv)
                for a, sv_y, sv in zip(self.alpha, self.sv_y, self.sv)
            )
        return y_predict

    def predict(self, X):
        """Return ``np.sign`` of the decision value for each sample in ``X``.

        A single 1-D sample is promoted to a one-row 2-D array first.

        Raises
        ------
        ValueError
            If ``X`` has more than two dimensions.
        """
        X = np.atleast_2d(X)
        if X.ndim != 2:
            raise ValueError("X must be at most 2-dimensional")
        return np.sign(self.project(X))
#Testing on the dataset I have
# Load the CSV and keep the first 1000 rows: columns 0 and 1 are the point
# coordinates, column 2 is the label.
data = pd.read_csv("Dataset_1_Team_35.csv").to_numpy()
points = []
labels = []
for i in range(1000):
    row = data[i]
    points.append([row[0], row[1]])
    labels.append(row[2])
y = np.array(labels)

# Standardise the coordinates so the plotting grid built later spans a small
# range.
# NOTE(review): the scaler is fitted on all rows before the train/test
# split; confirm that is intended.
X = np.array(points)
scaler = StandardScaler().fit(X)
X = scaler.transform(X)
print(X.shape,y.shape)

# Hold out 20% of the rows; only the training part is used below.
xtr, xts, ytr, yts = train_test_split(X, y, test_size=0.2)
print(xtr.shape,ytr.shape)

# Train with the polynomial kernel for up to 2 passes and report the
# accuracy on the training split itself.
clf = KernelPerceptron(polynomial_kernel, 2)
clf.fit(xtr, ytr)
print(f'xtr: {xtr}')
pred = clf.predict(xtr)
val = accuracy_score(pred, ytr)
print(val)
#Code for plotting the decision boundary
def make_meshgrid(x, y, h=.02):
    """Build a 2-D evaluation grid covering ``x`` and ``y`` with a margin of 1.

    ``h`` is the spacing between neighbouring grid points on both axes.
    Returns the ``(xx, yy)`` coordinate matrices from ``np.meshgrid``.
    """
    x_axis = np.arange(x.min() - 1, x.max() + 1, h)
    y_axis = np.arange(y.min() - 1, y.max() + 1, h)
    xx, yy = np.meshgrid(x_axis, y_axis)
    return xx, yy
def plot_contours(ax, clf, xx, yy, **params):
    """Fill ``ax`` with the regions predicted by ``clf`` over a grid.

    Every grid point of ``xx`` / ``yy`` is passed to ``clf.predict`` as one
    (x, y) row; the predictions are reshaped back to the grid and drawn with
    ``ax.contourf``, forwarding ``params``. Returns what ``contourf`` returns.
    """
    grid_points = np.column_stack((xx.ravel(), yy.ravel()))
    predictions = clf.predict(grid_points)
    Z = predictions.reshape(xx.shape)
    return ax.contourf(xx, yy, Z, **params)
# Draw the predicted regions on a fresh figure, overlay the training points
# coloured by their label, and show the result.
fig, ax = plt.subplots()
X0 = xtr[:, 0]
X1 = xtr[:, 1]
# A grid step of 0.2 is passed here in place of make_meshgrid's 0.02 default.
xx, yy = make_meshgrid(X0, X1, h=0.2)
plot_contours(ax, clf, xx, yy, cmap=plt.cm.coolwarm, alpha=1)
ax.scatter(X0, X1, c=ytr, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
ax.set_title('title')
plt.show()