内折和外折好像不太对。我不确定我是否正确使用了训练和测试数据集。
欢迎任何帮助:)
...
# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Set the outer cross-validation loop
kf_outer = KFold(n_splits=5, shuffle=True, random_state=42)
# Initialize lists to store the scores
xgb_scores = []
rf_scores = []
ensemble_scores = []
# Loop over the outer folds
for train_index, test_index in kf_outer.split(X_scaled):
X_train, X_test = X_scaled[train_index], X_scaled[test_index]
y_train, y_test = y[train_index], y[test_index]
# Initialize lists to store the scores for the current fold
xgb_fold_scores = []
rf_fold_scores = []
ensemble_fold_scores = []
# Set the inner cross-validation loop for hyperparameter tuning
kf_inner = KFold(n_splits=5, shuffle=True, random_state=42)
# Loop over the inner folds for hyperparameter tuning
for inner_train_index, inner_test_index in kf_inner.split(X_train):
inner_X_train, inner_X_test = X_train[inner_train_index], X_train[inner_test_index]
inner_y_train, inner_y_test = y_train[inner_train_index], y_train[inner_test_index]
# Define the hyperparameter search space
params = {
'max_depth': (3, 10),
'learning_rate': (0.01, 0.5),
'n_estimators': (50, 500)
}
# Define the function to optimize (in this case, negative mean squared error)
def xgb_cv(max_depth, learning_rate, n_estimators):
xgb = XGBRegressor(max_depth=int(max_depth), learning_rate=learning_rate, n_estimators=int(n_estimators))
kf_inner = KFold(n_splits=5, shuffle=True, random_state=42)
mse = -np.mean(cross_val_score(xgb, inner_X_train, inner_y_train, cv=kf_inner, scoring='neg_mean_squared_error'))
return mse
# XGBoost hyperparameter tuning
xgb_bo = BayesianOptimization(f=xgb_cv, pbounds=params, verbose=2, random_state=42)
# Run the optimization for 50 iterations
xgb_bo.maximize(n_iter=50)
# Get the best hyperparameters and the best score
best_xgb_params = xgb_bo.max['params']
best_xgb_score = -xgb_bo.max['target']
# Train the best XGBoost model on the current training set
best_xgb_model = XGBRegressor(max_depth=int(best_xgb_params['max_depth']), learning_rate=best_xgb_params['learning_rate'], n_estimators=int(best_xgb_params['n_estimators']))
best_xgb_model.fit(inner_X_train, inner_y_train)
# Evaluate the XGBoost model
train_pred = best_xgb_model.predict(inner_X_train)
test_pred = best_xgb_model.predict(inner_X_test)
train_score = r2_score(inner_y_train, train_pred)
test_score = r2_score(inner_y_test, test_pred)
train_r2 = r2_score(inner_y_train, train_pred)
test_r2 = r2_score(inner_y_test, test_pred)
train_mse = mean_squared_error(inner_y_train, train_pred)
test_mse = mean_squared_error(inner_y_test, test_pred)
train_rmse = np.sqrt(train_mse)
test_rmse = np.sqrt(test_mse)
# Store the scores for the XGBoost model on the inner test set
xgb_fold_scores.append((train_score, test_score, train_mse, test_mse, train_rmse, test_rmse))
# Define the hyperparameter search space for the Random Forest model
rf_params = {
'max_depth': (3, 10),
'n_estimators': (50, 500)
}
# Define the function to optimize for the Random Forest model
def rf_cv(max_depth, n_estimators):
rf = RandomForestRegressor(max_depth=int(max_depth), n_estimators=int(n_estimators))
kf_inner = KFold(n_splits=5, shuffle=True, random_state=42)
mse = -np.mean(cross_val_score(rf, inner_X_train, inner_y_train, cv=kf_inner, scoring='neg_mean_squared_error'))
return mse
# Create the Bayesian optimization object for the Random Forest model
rf_bo = BayesianOptimization(f=rf_cv, pbounds=rf_params, verbose=2, random_state=42)
# Run the optimization for 50 iterations
rf_bo.maximize(n_iter=50)
# Get the best hyperparameters and the best score for the Random Forest model
best_rf_params = rf_bo.max['params']
best_rf_score = -rf_bo.max['target']
# Train the Random Forest model with the best hyperparameters
rf = RandomForestRegressor(max_depth=int(best_rf_params['max_depth']), n_estimators=int(best_rf_params['n_estimators']))
rf.fit(X_train, y_train)
# Evaluate the Random Forest model on the inner loop test set
train_pred_rf = rf.predict(inner_X_train)
test_pred_rf = rf.predict(inner_X_test)
train_score_rf = r2_score(inner_y_train, train_pred_rf)
test_score_rf = r2_score(inner_y_test, test_pred_rf)
train_r2_rf = r2_score(inner_y_train, train_pred_rf)
test_r2_rf = r2_score(inner_y_test, test_pred_rf)
train_mse_rf = mean_squared_error(inner_y_train, train_pred_rf)
test_mse_rf = mean_squared_error(inner_y_test, test_pred_rf)
train_rmse_rf = np.sqrt(train_mse_rf)
test_rmse_rf = np.sqrt(test_mse_rf)
# Store the scores for the Random Forest model on the inner test set
rf_fold_scores.append((train_score_rf, test_score_rf, train_mse_rf, test_mse_rf, train_rmse_rf, test_rmse_rf))
# Mean the scores for each model over the inner folds
mean_xgb_scores_fold = np.mean(xgb_fold_scores, axis=0)
mean_rf_scores_fold = np.mean(rf_fold_scores, axis=0)
# Store the scores for the best models on the current outer fold
xgb_scores.append(mean_xgb_scores_fold)
rf_scores.append(mean_rf_scores_fold)
# Calculate the mean predictions for XGBoost and Random Forest models over the inner folds
mean_train_pred_xgb = np.mean([x[0] for x in xgb_fold_scores], axis=0)
mean_test_pred_xgb = np.mean([x[1] for x in xgb_fold_scores], axis=0)
mean_train_pred_rf = np.mean([x[0] for x in rf_fold_scores], axis=0)
mean_test_pred_rf = np.mean([x[1] for x in rf_fold_scores], axis=0)
# Evaluate the ensemble model on the outer test set by averaging the predictions of the XGBoost and Random Forest models
ensemble_pred_train = (mean_train_pred_xgb + mean_train_pred_rf) / 2
ensemble_pred_test = (mean_test_pred_xgb + mean_test_pred_rf) / 2
ensemble_train_score = r2_score(inner_y_train, ensemble_pred_train)
ensemble_test_score = r2_score(inner_y_test, ensemble_pred_test)
ensemble_train_mse = mean_squared_error(inner_y_train, ensemble_pred_train)
ensemble_test_mse = mean_squared_error(inner_y_test, ensemble_pred_test)
ensemble_train_rmse = np.sqrt(ensemble_train_mse)
ensemble_test_rmse = np.sqrt(ensemble_test_mse)
# Store the scores for the ensemble model
ensemble_scores.append((ensemble_train_score, ensemble_test_score, ensemble_train_mse, ensemble_test_mse, ensemble_train_rmse, ensemble_test_rmse))
# Calculate the mean scores for each model
mean_xgb_scores = np.mean(xgb_scores, axis=0)
mean_rf_scores = np.mean(rf_scores, axis=0)
mean_ensemble_scores = np.mean(ensemble_scores, axis=0)
# Print the mean scores for each model
print("XGBoost average scores:", mean_xgb_scores)
print("Random Forest average scores:", mean_rf_scores)
print("Ensemble average scores:", mean_ensemble_scores)
# Choose the best model based on the test set scores
best_model = np.argmax([mean_xgb_scores[1], mean_rf_scores[1], mean_ensemble_scores[1]])
# Make predictions on the test set
if best_model == 0:
best_model = test_pred
elif best_model == 1:
best_model = test_pred_rf
else:
best_model = ensemble_pred_test
...
我正在尝试对下面的代码实施嵌套交叉验证。但是,它不起作用。我正在尝试使用循环外部文件夹中的内部文件夹。