有人知道我在这里做错了什么吗?我的数据集约有 20k 行,得到的 MSE 约为 11298955095.811989,我不太确定问题出在哪里。我想找出哪个数据集以及哪个 k 值给出最小的 MSE,但得到的数值看起来都不合理:
import random
def split_df(dataframe, train_val=0.7):
    """Randomly split *dataframe* into a train and a test DataFrame.

    Row positions are shuffled with ``random.shuffle`` and the first
    ``int(len(dataframe) * train_val)`` shuffled rows form the training
    set; the remaining rows form the test set.

    Rows are selected by position with ``iloc`` rather than rebuilt from
    ``dataframe.values.tolist()``, so every column keeps its original
    dtype (the list round trip upcasts e.g. int columns to float when the
    frame also holds float columns).

    Args:
        dataframe: The pandas DataFrame to split.
        train_val: Fraction of rows assigned to the training set.
            Defaults to 0.7.

    Returns:
        A ``(train_df, test_df)`` tuple. Both frames have the columns of
        *dataframe* and a fresh 0..n-1 index.

    Raises:
        ValueError: If *train_val* is not within [0, 1].
    """
    if not 0 <= train_val <= 1:
        raise ValueError(f"train_val must be between 0 and 1, got {train_val!r}")

    # Shuffle positions instead of row contents: same permutation for a
    # given `random` seed, without copying every row into a Python list.
    row_positions = list(range(len(dataframe)))
    random.shuffle(row_positions)

    split_index = int(len(row_positions) * train_val)
    train_df = dataframe.iloc[row_positions[:split_index]].reset_index(drop=True)
    test_df = dataframe.iloc[row_positions[split_index:]].reset_index(drop=True)
    return train_df, test_df
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
# Candidate datasets, paired with readable labels so the report identifies
# each one by name instead of dumping the whole DataFrame into the output.
dataframes = [no_nulls, outliers_removed, mean_imputed, median_imputed]
dataframe_names = ['no_nulls', 'outliers_removed', 'mean_imputed', 'median_imputed']
target = 'median_house_value'
k_vals = range(1, 30)

# Lowest test MSE seen so far, stored as (mse, dataset name, k).
best_overall = None

for name, df in zip(dataframe_names, dataframes):
    mse_dictionary = {}

    # Split data into train and test
    train_df, test_df = split_df(df)

    # Separate features and target variable for train and test
    X_train = train_df.drop(columns=[target])
    y_train = train_df[target]
    X_test = test_df.drop(columns=[target])
    y_test = test_df[target]

    # NOTE(review): the features are passed to KNN unscaled; since KNN is
    # distance-based, columns with large numeric ranges may dominate the
    # neighbour search -- consider standardizing and confirm on the data.
    for k in k_vals:
        knn_regressor = KNeighborsRegressor(n_neighbors=k)
        knn_regressor.fit(X_train, y_train)
        predictions = knn_regressor.predict(X_test)
        mse_dictionary[k] = mean_squared_error(y_test, predictions)

    # The k with the lowest test MSE for this dataset.
    best_k = min(mse_dictionary, key=mse_dictionary.get)
    best_mse = mse_dictionary[best_k]
    if best_overall is None or best_mse < best_overall[0]:
        best_overall = (best_mse, name, best_k)

    print(f"MSE Dictionary for dataframe {name}: {mse_dictionary}")
    # Additional information for debugging
    print(f"Max MSE: {max(mse_dictionary.values())}")
    print(f"Min MSE: {best_mse} (k={best_k})")
    print(f"Mean MSE: {sum(mse_dictionary.values()) / len(mse_dictionary)}")
    # MSE is in squared target units; its square root (RMSE) is in the
    # target's own units and is far easier to sanity-check.
    print(f"Min RMSE: {best_mse ** 0.5}")

# NOTE(review): each dataset is scored on its own random test split, so the
# cross-dataset comparison is indicative rather than exact.
if best_overall is not None:
    overall_mse, overall_name, overall_k = best_overall
    print(
        f"Lowest MSE overall: {overall_mse} "
        f"(dataframe {overall_name}, k={overall_k})"
    )
事实证明这些值是正确的:MSE 的单位是目标值单位的平方,约 1.13e10 的 MSE 对应的 RMSE 约为 1.06e5;回归模型对于小数据帧来说不太准确。