我正在尝试在来自 Kaggle 的 Iris.csv 数据集上使用 逻辑回归 实现多类分类。这是我的代码。
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
def standardize(X_tr): # (x-Mean(x))/std(X) Normalizes data
for i in range(X_tr.shape[1]):
X_tr[:, i] = (X_tr[:, i] - np.mean(X_tr[:, i])) / np.std(X_tr[:, i])
return X_tr
def sigmoid(z): #Sigmoid/Logistic function
sig = 1 / (1 + np.exp(-z))
return sig
def cost(theta, X, y):
z = np.dot(X, theta)
cost0 = y.T.dot(np.log(sigmoid(z)))
cost1 = (1 - y).T.dot(np.log(1 - sigmoid(z)))
cost = -((cost1 + cost0)) / len(y)
return cost
def initialize(X): #Initializing X feature matrix and Theta vector
thetas = np.zeros((X.shape[1] + 1, len(np.unique(y))))
X = np.c_[np.ones((X.shape[0], 1)), X] # adding 691 rows of ones as the first column in X
return thetas, X
def fit(X, y, alpha=0.01, iterations=1000): # Gradient Descent
thetas_list = []
X = np.c_[np.ones((X.shape[0], 1)), X]
for i in range(len(np.unique(y))):
y_one_vs_all = np.where(y == np.unique(y)[i], 1, 0)
thetas, _ = initialize(X)
for j in range(iterations):
z = np.dot(X, thetas[:, i])
h = sigmoid(z)
gradient = np.dot(X.T, (h - y_one_vs_all)) / len(y)
thetas[:, i] -= alpha * gradient
thetas_list.append(thetas[:, i])
global gthetas
gthetas = thetas_list
return None
def predict(X):
X = np.c_[np.ones((X.shape[0], 1)), X]
predictions = []
for sample in X:
probs = []
for thetas in gthetas:
z = np.dot(sample, thetas)
probs.append(sigmoid(z))
predictions.append(np.argmax(probs) + 1)
return predictions
# load data
df = pd.read_csv("Iris.csv")
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
# convert class categorical values to numerical values
df['Species'].replace('Iris-setosa', 1, inplace=True)
df['Species'].replace('Iris-versicolor', 2, inplace=True)
df['Species'].replace('Iris-virginica', 3, inplace=True)
# prepare one-vs-all labels for multiclass classification
y1 = pd.DataFrame(np.zeros((len(y), len(np.unique(y)))))
for i in range(len(np.unique(y))):
for j in range(len(y1)):
if y[j] == np.unique(y)[i]:
y1.iloc[j, i] = 1
else:
y1.iloc[j, i] = 0
# split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y1, test_size=0.2, random_state=0)
# standardize features
X_train = standardize(X_train)
X_test = standardize(X_test)
# fit logistic regression model
fit(X_train, y_train, alpha=0.01, iterations=400)
# make predictions on test set
predictions = predict(X_test)
print(predictions)
以下是我遇到的错误。
ValueError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_14368/3997569506.py in <module>
1 # standardize features
2 X_train = standardize(X_train)
----> 3 thetas_list = fit(X_train, y_train)
4 plt.scatter(range(len(cost_list)), cost_list, c="blue")
5 plt.show()
~\AppData\Local\Temp/ipykernel_14368/3827160719.py in fit(X, y, alpha, iter)
6 thetas, _ = initialize(X)
7 for j in range(iter):
----> 8 z = dot(X, thetas[:, i])
9 h = sigmoid(z)
10 gradient = dot(X.T, (h - y_one_vs_all)) / len(y)
<__array_function__ internals> in dot(*args, **kwargs)
ValueError: shapes (120,6) and (7,) not aligned: 6 (dim 1) != 7 (dim 0)
任何修复此错误的帮助将不胜感激。我已经查看了有关堆栈溢出的其他答案,但我仍然无法弄清楚。
我想要完成的事情:以下代码用作为每个类在 y(结果,物种类)中创建列的基础。因此,y1 中的列数(如下所示)将等于类别总数。鸢尾花数据集为 3。例如,对于第一列(类别为 1,即 Iris-setosa),数据集中的任何一行为“Iris-setosa”都将在 y1 的对应行中标记为 1。任何其他类别(第 2 类和第 3 类:Iris-versicolor 和 Iris-virginica)将在 y1 的第一列中标记为 0。
for i in range(len(np.unique(y))):
for j in range(len(y1)):
if y[j] == np.unique(y)[i]:
y1.iloc[j, i] = 1 #one vs. all
else:
y1.iloc[j, i] = 0 #all others will be 0
您应该检查 theta 数组是如何构建的。您正在
initialize
函数中使用 (X.shape[1] + 1, nrows)
的形状对其进行初始化。错误告诉您,您无法计算形状为 X
的数组 (a, b)
和形状为 (b+1, c)
的 theta 之间的点积。您可以尝试删除 theta 定义中的+1
。然后你会发现另一个关于梯度的问题,你试图再次计算两个不兼容数组之间的点积。我希望这对你有帮助,祝你好运!