我尝试过使用所有变量,也尝试过只选择其中一部分变量,但 MSE 仍然很高。我想知道我的代码是否有错误。我还尝试过添加一些特征工程,但暂时将其注释掉了,因为它使我的 MSE 变得更糟。
#kaggle : House Prices - Advanced Regression Techniques
import pandas as pd
import numpy as np
import sklearn.model_selection
import matplotlib.pyplot as plt
import sklearn as sk
import sklearn.tree
import sklearn.ensemble
# Load the Kaggle training and test sets from their local download location.
df = pd.read_csv('/Users/andrewhashoush/Downloads/house-prices-advanced-regression-techniques/train.csv')
df_test = pd.read_csv('/Users/andrewhashoush/Downloads/house-prices-advanced-regression-techniques/test.csv')

# Preview the first rows of the training data.
print(df.head())

# DataFrame.info() writes its summary directly to stdout and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
df.info()
# Earlier, smaller feature selection (kept for reference, not used):
# selected_features = ['OverallQual', 'YearBuilt', 'TotalBsmtSF', '1stFlrSF', 'GrLivArea',
#                      'GarageCars', 'GarageArea', 'MSZoning', 'Neighborhood',
#                      'KitchenQual', 'CentralAir', 'LotArea', 'MSSubClass', 'LotFrontage',
#                      'Street', 'LandContour', 'Utilities', 'OverallCond', 'RoofStyle',
#                      'RoofMatl', 'BsmtQual', 'SaleCondition', 'SaleType', 'YrSold', 'MoSold',
#                      'PoolArea']

# Columns of the training frame that are used as model inputs.
# The order is preserved because it determines the column order of X.
selected_features = [
    'LotFrontage', 'OverallQual', 'OverallCond', 'MasVnrArea',
    'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'GarageCars',
    'WoodDeckSF', 'OpenPorchSF', 'MoSold', 'YrSold',
    'MSZoning', 'Alley', 'LotShape', 'LandContour',
    'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
    'BldgType', 'HouseStyle', 'RoofStyle', 'Exterior1st',
    'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond',
    'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
    'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC',
    'CentralAir', 'Electrical', 'KitchenQual', 'Functional',
    'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
    'GarageCond', 'PavedDrive', 'PoolQC', 'Fence',
    'MiscFeature', 'SaleType', 'SaleCondition',
]
# Feature engineering (currently disabled).
# df['Quality_Condition'] = df['OverallQual'] * df['OverallCond']
# df['Age_at_Sale'] = df['YrSold'] - df['YearBuilt']
# selected_features += ['Quality_Condition', 'Age_at_Sale']

# Take an explicit copy of the selected columns. Without .copy(), pandas
# treats X as derived from df, and the later in-place `X.loc[:, col] = ...`
# writes raise SettingWithCopyWarning.
X = df[selected_features].copy()
print(X.head())

# Report how many values are missing in each selected column.
for column in X.columns:
    missing_data = X[column].isnull().sum()
    print(f"{column}: {missing_data}")

# Split the column names by dtype: object columns are treated as categorical,
# everything else as numerical.
categorical_vars = X.select_dtypes(include='object').columns.tolist()
numerical_vars = X.select_dtypes(exclude='object').columns.tolist()
print("Categorical Variables:", categorical_vars)
print()
print("Numerical Variables:", numerical_vars)
#%%
# Choose one replacement value per column: the most frequent value for
# categorical columns, the median for numerical columns.
fill_values = {name: X[name].mode()[0] for name in categorical_vars}
fill_values.update({name: X[name].median() for name in numerical_vars})

# Write each imputed column back into X (categorical columns first, then
# numerical ones, matching the order in which fill_values was built).
for name, replacement in fill_values.items():
    X.loc[:, name] = X[name].fillna(replacement)

# Check the result: print the remaining missing-value count per column.
missing_counts = X.isnull().sum()
for column in X.columns:
    print(f"{column}: {missing_counts[column]}")
# Regression target.
y = df['SalePrice']

# One-hot encode the categorical columns; numerical columns pass through.
X = pd.get_dummies(X, columns=categorical_vars)

# Split into training and validation sets: 80% of the rows go to training,
# with a fixed seed so the split is reproducible.
X_train, X_val, y_train, y_val = sklearn.model_selection.train_test_split(
    X,
    y,
    train_size=.8,
    random_state=123,
)
def _validation_mse(predictions, targets):
    """Return the mean of the squared differences between predictions and targets."""
    return np.mean((predictions - targets) ** 2)


# Decision Tree Regressor (depth limited to 5)
dt_model = sklearn.tree.DecisionTreeRegressor(
    max_depth=5,
    random_state=123,
).fit(X_train, y_train)
y_pred = dt_model.predict(X_val)
mse_val = _validation_mse(y_pred, y_val)
print(f"MSE: {mse_val}")

# Random Forest Regressor
rf_model = sklearn.ensemble.RandomForestRegressor(
    n_estimators=100,
    random_state=123,
).fit(X_train, y_train)
y_pred1 = rf_model.predict(X_val)
mse_val1 = _validation_mse(y_pred1, y_val)
print(f"MSE: {mse_val1}")

# Gradient Boosting Regressor
gb_model = sklearn.ensemble.GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=123,
).fit(X_train, y_train)
y_pred2 = gb_model.predict(X_val)
mse_val2 = _validation_mse(y_pred2, y_val)
print(f"MSE: {mse_val2}")
MSE: 1451852149.6361678
MSE: 944388532.5714014
MSE: 755815420.2686024
您的代码中似乎没有任何明显的错误。
不过,从您所用数据的文件路径来看,您似乎是在预测房价。对于这项任务,这些 MSE 值可能是合理的。
MSE 是均方误差(mean squared error),所以对它取平方根就得到均方根误差(RMSE,root mean squared error)。RMSE 或多或少可以理解为预测误差的平均绝对大小,而 MSE 的解释要棘手一些,因为 MSE 的单位是目标变量单位的平方,而 RMSE 的单位与目标变量相同。如果您对所给的 MSE 值求平方根,会得到大约 27,000 到 39,000 之间的值(依次约为 38,100、30,700 和 27,500),这并不离谱——换句话说,这意味着您的模型平均大约偏离了这么多。如果您是以美元为单位预测美国房价,那么对您的模型来说,这是一个相对合理的范围。
关于 RMSE 与 MSE 的更多信息,请参见此处。