Nested cross-validation for XGBoost and Random Forest models


The inner and outer folds don't seem right. I'm not sure whether I'm using the training and test sets correctly.

Any help is welcome :)

...
# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Set the outer cross-validation loop
kf_outer = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize lists to store the scores
xgb_scores = []
rf_scores = []
ensemble_scores = []

# Loop over the outer folds
for train_index, test_index in kf_outer.split(X_scaled):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Initialize lists to store the scores for the current fold
    xgb_fold_scores = []
    rf_fold_scores = []
    ensemble_fold_scores = []

    # Set the inner cross-validation loop for hyperparameter tuning
    kf_inner = KFold(n_splits=5, shuffle=True, random_state=42)

    # Loop over the inner folds for hyperparameter tuning
    for inner_train_index, inner_test_index in kf_inner.split(X_train):
        inner_X_train, inner_X_test = X_train[inner_train_index], X_train[inner_test_index]
        inner_y_train, inner_y_test = y_train[inner_train_index], y_train[inner_test_index]

        # Define the hyperparameter search space
        params = {
            'max_depth': (3, 10),
            'learning_rate': (0.01, 0.5),
            'n_estimators': (50, 500)
        }       

        # Define the function to optimize (negative mean squared error, so that
        # maximizing it minimizes the MSE)
        def xgb_cv(max_depth, learning_rate, n_estimators):
            xgb = XGBRegressor(max_depth=int(max_depth), learning_rate=learning_rate, n_estimators=int(n_estimators))
            kf_tuning = KFold(n_splits=5, shuffle=True, random_state=42)
            neg_mse = np.mean(cross_val_score(xgb, inner_X_train, inner_y_train, cv=kf_tuning, scoring='neg_mean_squared_error'))
            return neg_mse

        # XGBoost hyperparameter tuning
        xgb_bo = BayesianOptimization(f=xgb_cv, pbounds=params, verbose=2, random_state=42)
       
        # Run the optimization for 50 iterations
        xgb_bo.maximize(n_iter=50)

        # Get the best hyperparameters and the best score
        best_xgb_params = xgb_bo.max['params']
        best_xgb_score = -xgb_bo.max['target']

        # Train the best XGBoost model on the current training set
        best_xgb_model = XGBRegressor(max_depth=int(best_xgb_params['max_depth']), learning_rate=best_xgb_params['learning_rate'], n_estimators=int(best_xgb_params['n_estimators']))
        best_xgb_model.fit(inner_X_train, inner_y_train)

        # Evaluate the XGBoost model
        train_pred = best_xgb_model.predict(inner_X_train)
        test_pred = best_xgb_model.predict(inner_X_test)

        train_score = r2_score(inner_y_train, train_pred)
        test_score = r2_score(inner_y_test, test_pred)

        train_mse = mean_squared_error(inner_y_train, train_pred)
        test_mse = mean_squared_error(inner_y_test, test_pred)

        train_rmse = np.sqrt(train_mse)
        test_rmse = np.sqrt(test_mse)

        # Store the scores for the XGBoost model on the inner test set
        xgb_fold_scores.append((train_score, test_score, train_mse, test_mse, train_rmse, test_rmse))

        # Define the hyperparameter search space for the Random Forest model
        rf_params = {
            'max_depth': (3, 10),
            'n_estimators': (50, 500)
        }

        # Define the function to optimize for the Random Forest model
        def rf_cv(max_depth, n_estimators):
            rf = RandomForestRegressor(max_depth=int(max_depth), n_estimators=int(n_estimators))
            kf_tuning = KFold(n_splits=5, shuffle=True, random_state=42)
            # Return negative MSE so that maximizing it minimizes the MSE
            neg_mse = np.mean(cross_val_score(rf, inner_X_train, inner_y_train, cv=kf_tuning, scoring='neg_mean_squared_error'))
            return neg_mse

        # Create the Bayesian optimization object for the Random Forest model
        rf_bo = BayesianOptimization(f=rf_cv, pbounds=rf_params, verbose=2, random_state=42)

        # Run the optimization for 50 iterations
        rf_bo.maximize(n_iter=50)

        # Get the best hyperparameters and the best score for the Random Forest model
        best_rf_params = rf_bo.max['params']
        best_rf_score = -rf_bo.max['target']

        # Train the Random Forest model with the best hyperparameters on the current inner training set
        rf = RandomForestRegressor(max_depth=int(best_rf_params['max_depth']), n_estimators=int(best_rf_params['n_estimators']))
        rf.fit(inner_X_train, inner_y_train)

        # Evaluate the Random Forest model on the inner loop test set
        train_pred_rf = rf.predict(inner_X_train)
        test_pred_rf = rf.predict(inner_X_test)

        train_score_rf = r2_score(inner_y_train, train_pred_rf)
        test_score_rf = r2_score(inner_y_test, test_pred_rf)

        train_mse_rf = mean_squared_error(inner_y_train, train_pred_rf)
        test_mse_rf = mean_squared_error(inner_y_test, test_pred_rf)

        train_rmse_rf = np.sqrt(train_mse_rf)
        test_rmse_rf = np.sqrt(test_mse_rf)

        # Store the scores for the Random Forest model on the inner test set
        rf_fold_scores.append((train_score_rf, test_score_rf, train_mse_rf, test_mse_rf, train_rmse_rf, test_rmse_rf))

    # Mean the scores for each model over the inner folds
    mean_xgb_scores_fold = np.mean(xgb_fold_scores, axis=0)
    mean_rf_scores_fold = np.mean(rf_fold_scores, axis=0)

    # Store the scores for the best models on the current outer fold
    xgb_scores.append(mean_xgb_scores_fold)
    rf_scores.append(mean_rf_scores_fold)

    # Build a simple ensemble by averaging the predictions of the best XGBoost and
    # Random Forest models from the last inner fold, and evaluate it on that fold's split
    ensemble_pred_train = (train_pred + train_pred_rf) / 2
    ensemble_pred_test = (test_pred + test_pred_rf) / 2

    ensemble_train_score = r2_score(inner_y_train, ensemble_pred_train)
    ensemble_test_score = r2_score(inner_y_test, ensemble_pred_test)

    ensemble_train_mse = mean_squared_error(inner_y_train, ensemble_pred_train)
    ensemble_test_mse = mean_squared_error(inner_y_test, ensemble_pred_test)

    ensemble_train_rmse = np.sqrt(ensemble_train_mse)
    ensemble_test_rmse = np.sqrt(ensemble_test_mse)

    # Store the scores for the ensemble model
    ensemble_scores.append((ensemble_train_score, ensemble_test_score, ensemble_train_mse, ensemble_test_mse, ensemble_train_rmse, ensemble_test_rmse))

# Calculate the mean scores for each model
mean_xgb_scores = np.mean(xgb_scores, axis=0)
mean_rf_scores = np.mean(rf_scores, axis=0)
mean_ensemble_scores = np.mean(ensemble_scores, axis=0)

# Print the mean scores for each model
print("XGBoost average scores:", mean_xgb_scores)
print("Random Forest average scores:", mean_rf_scores)
print("Ensemble average scores:", mean_ensemble_scores)

# Choose the best model based on the mean test-set R² scores
best_model_index = np.argmax([mean_xgb_scores[1], mean_rf_scores[1], mean_ensemble_scores[1]])

# Select the corresponding test-set predictions
if best_model_index == 0:
    best_pred = test_pred
elif best_model_index == 1:
    best_pred = test_pred_rf
else:
    best_pred = ensemble_pred_test
...

I am trying to implement nested cross-validation in the code above, but it is not working. The idea is that the inner folds (for hyperparameter tuning) should run inside the loop over the outer folds.
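
For reference, this is the general structure I am aiming for: a minimal sketch of the standard nested cross-validation pattern. It uses a plain GridSearchCV for the inner loop instead of Bayesian optimization and a synthetic dataset, purely for illustration; my real code uses BayesianOptimization and my own data.

import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV, KFold
from xgboost import XGBRegressor

# Synthetic data, for illustration only
X, y = make_regression(n_samples=300, n_features=10, noise=0.1, random_state=42)

outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)
inner_cv = KFold(n_splits=5, shuffle=True, random_state=42)

outer_scores = []

for train_idx, test_idx in outer_cv.split(X):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Inner loop: tune hyperparameters using ONLY the outer training split
    search = GridSearchCV(
        estimator=XGBRegressor(),
        param_grid={'max_depth': [3, 6, 10], 'n_estimators': [100, 300]},
        scoring='neg_mean_squared_error',
        cv=inner_cv,
    )
    search.fit(X_train, y_train)

    # GridSearchCV refits the best model on the full outer training split;
    # evaluate it once on the held-out outer test split
    best_model = search.best_estimator_
    outer_scores.append(r2_score(y_test, best_model.predict(X_test)))

print("Nested CV R² per outer fold:", outer_scores)
print("Mean R²:", np.mean(outer_scores))

The same skeleton should work with a second estimator (e.g. RandomForestRegressor) and an ensemble of the two, as long as every tuning step only ever sees the outer training split.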

python python-3.x python-2.7 cross-validation