但是我的图显示只有 1/10 的特征与我的目标线性相关。
请参阅下面用真实系数生成数据、以便用 SGD 测试线性回归的代码。我发现每个特征 vs 目标的散点图看起来并不是线性相关的。
import numpy as np
## Generate synthetic linear-regression data.
np.random.seed(0) # fix the global RNG state for reproducibility
def rand_X_y_LR(nsamples=None, nfeatures=None, plot_XY=False):
    """Generate a random linear-regression dataset y = [1 | X] @ beta + noise.

    Parameters
    ----------
    nsamples : int
        Number of rows (observations) of X.
    nfeatures : int
        Number of feature columns of X.
    plot_XY : bool
        If True, scatter-plot each feature column against the target and
        show histograms of the target and of the true coefficients.

    Returns
    -------
    X : ndarray of shape (nsamples, nfeatures)
    y : ndarray of shape (nsamples,)
    true_coefficients : ndarray of shape (nfeatures + 1,)
        Element 0 is the intercept (pairs with the prepended ones column).
    """
    # Design matrix with entries drawn uniformly from [0, 1).
    X = np.random.rand(nsamples, nfeatures)
    print(f"shape of random X; {X.shape}")
    # Prepend a column of ones so the first coefficient acts as the intercept.
    ones_column = np.ones((len(X), 1))
    print(f"shape ones_column, {ones_column.shape}")
    X_plusOnes = np.hstack([ones_column, X])
    print(f"shape of X_plusOnes_column, {X_plusOnes.shape}")
    # Ground-truth coefficients: intercept + one slope per feature.
    true_coefficients = np.random.normal(loc=0, scale=1, size=nfeatures + 1)
    print(f"shape of true_coefficients; {true_coefficients.shape}")
    # BUG FIX: the original drew ONE scalar noise value, shifting every sample
    # by the same constant. Draw an independent noise term per sample instead.
    noise = np.random.normal(loc=0, scale=1, size=nsamples)
    # Target: linear combination of the augmented design matrix plus noise.
    y = X_plusOnes @ true_coefficients + noise
    print(f"y.shape; {y.shape}")
    if plot_XY:
        # Local import: plotting is optional, keep matplotlib off the hot path.
        import matplotlib.pyplot as plt
        # BUG FIX: original plotted the module-level globals Xdata_in /
        # y_data_in; plot the locals X and y computed above instead.
        fig, axes = plt.subplots(nrows=round(nfeatures / 2), ncols=2, figsize=(9, 7))
        for i, ax in enumerate(axes.flat):
            ax.scatter(X[:, i], y)
            ax.set_xlabel(f"Xdata_in_col {i}")
            ax.set_ylabel('Target')
            ax.set_title(f"Xdata_in_col {i} vrs Target")
        plt.tight_layout()
        plt.show()
        plt.figure(figsize=(4, 5))
        plt.hist(y)
        plt.title('Target distribution')
        plt.show()
        plt.figure(figsize=(4, 5))
        plt.hist(true_coefficients)
        plt.title('true_coefficients')
        plt.show()
    return X, y, true_coefficients
# Generate a random 1000x10 dataset (the old comment said 1024x10, which did
# not match the call below).
Xdata_in, y_data_in, true_beta = rand_X_y_LR(nsamples=1000, nfeatures=10)
我会用以下方式计算X,Y:
代码如下所示:
def rand_X_y_LR(nsamples=None, nfeatures=None, plot_XY=False):
    """Generate X by inverting a per-feature linear relation to a fixed target.

    Unlike the usual y = X @ beta construction, this draws the target y FIRST
    and then builds each feature column i so that
    y ≈ true_coefficients[i] * X[:, i] + b_i (plus noise), which makes every
    feature individually linearly correlated with the target.

    Parameters
    ----------
    nsamples : int
        Number of observations.
    nfeatures : int
        Number of feature columns.
    plot_XY : bool
        If True, scatter-plot each feature against the target and show
        histograms of the target and coefficients.

    Returns
    -------
    X : ndarray of shape (nsamples, nfeatures)
    y : ndarray of shape (nsamples,) -- the drawn target plus unit noise
    true_coefficients : ndarray of shape (nfeatures + 1,)
        Per-feature slopes in positions [0:nfeatures]; the last entry
        accumulates the per-feature offsets b_i.
    """
    y = np.random.uniform(-200, 200, size=nsamples)
    # BUG FIX: there is one coefficient per FEATURE (plus the offset slot),
    # so the array is sized by nfeatures, not nsamples as in the original.
    true_coefficients = np.random.uniform(-20, 20, size=nfeatures + 1)
    true_coefficients[-1] = 0
    X_list = []
    for i in range(nfeatures):
        b = np.random.uniform(-20, 20)
        # Invert y = coef_i * x + b for this feature, then perturb with noise.
        X_temp = (y - b) / true_coefficients[i]
        noise = np.random.normal(loc=0, scale=5, size=nsamples)
        X_temp += noise
        X_list.append(X_temp)
        true_coefficients[-1] += b
    X = np.vstack(X_list).T
    print(f"shape of random X; {X.shape}")
    ones_column = np.ones((len(X), 1))
    print(f"shape ones_column, {ones_column.shape}")
    X_plusOnes = np.hstack([ones_column, X])
    print(f"shape of X_plusOnes_column, {X_plusOnes.shape}")
    print(true_coefficients)
    print(f"shape of true_coefficients; {true_coefficients.shape}")
    # Per-sample observation noise on the target.
    noise = np.random.normal(loc=0, scale=1, size=len(X_plusOnes))
    y += noise
    print(f"y.shape; {y.shape}")
    if plot_XY:
        # Local import: plotting is optional, keep matplotlib off the hot path.
        import matplotlib.pyplot as plt
        # BUG FIX: original referenced the undefined name num_features and the
        # module-level globals Xdata_in / y_data_in; use the locals instead.
        fig, axes = plt.subplots(nrows=round(nfeatures / 2), ncols=2, figsize=(9, 7))
        for i, ax in enumerate(axes.flat):
            ax.scatter(X[:, i], y)
            ax.set_xlabel(f"Xdata_in_col {i}")
            ax.set_ylabel('Target')
            ax.set_title(f"Xdata_in_col {i} vrs Target")
        plt.tight_layout()
        plt.show()
        plt.figure(figsize=(4, 5))
        plt.hist(y)
        plt.title('Target distribution')
        plt.show()
        plt.figure(figsize=(4, 5))
        plt.hist(true_coefficients)
        plt.title('true_coefficients')
        plt.show()
    return X, y, true_coefficients
如果我们查看每个变量两两之间的分布/散点图,会得到如下结果:观察 y 所在的那一行,可以发现它与每个 X_i 都存在明显的相关性。
如果我们查看绝对相关性图表,我们会得到以下结果:
可以看出,Y 与各个特征的绝对相关系数都在 0.75 以上。
也许当你使用 SGD 时你会发现其他系数,这可能是因为在使用该算法之前通常会应用变换。