我希望绘制 Tidymodels 工作流集(workflow set)中各超参数取值对应的性能指标(RMSE 和 RSQ),但一直拼凑不出正确的语法。
我正在尝试复现下面这张图,取自此处:
如何从竞速调优(racing)的结果中提取超参数?
# Load the Ames housing data, clean the column names, and add the modelling
# outcome: the base-10 logarithm of the sale price.
ames_data <- make_ames() %>%
  janitor::clean_names() %>%
  mutate(sale_price_log = log10(sale_price))

# Split into training (80%) and testing data, stratified by the logged sale
# price. The raw sale price is dropped first so that it cannot end up among
# the predictors of its own logarithm.
# The seed makes the split reproducible: without it every run produces a
# different training set, even though the resampling below is seeded.
set.seed(123)
ames_split <- rsample::initial_split(
  ames_data %>% select(-sale_price),
  prop = 0.8,
  strata = sale_price_log
)

# Materialise the training and testing sets from the split object.
ames_train <- training(ames_split)
ames_test <- testing(ames_split)

# Create 5-fold cross-validation resamples of the training set, stratified by
# the outcome; these are used to tune and compare the models.
set.seed(234)
ames_folds <- vfold_cv(ames_train, strata = sale_price_log, v = 5)
# DEFINE PREPROCESSING RECIPES --------------------------------------------

# Base recipe: minimal preprocessing.
base_rec <-
  recipe(sale_price_log ~ ., data = ames_train) %>%
  # Log-transform (base 10) gr_liv_area to address skewness. The outcome is
  # already on the log10 scale, so it is not transformed here.
  step_log(gr_liv_area, base = 10) %>%
  # One-hot encode every nominal predictor (one indicator column per level).
  step_dummy(all_nominal_predictors(), one_hot = TRUE)

# Normalising recipe: filter and pool factor levels, then transform and scale
# the numeric predictors before creating indicator columns.
normalise_rec <-
  recipe(sale_price_log ~ ., data = ames_train) %>%
  # Drop nominal predictors with near-zero variance.
  step_nzv(all_nominal_predictors()) %>%
  # Pool the levels of every nominal predictor that occur in fewer than 5% of
  # the rows into a single "OTHER" level.
  step_other(all_nominal_predictors(), threshold = 0.05, other = "OTHER") %>%
  # Apply a Yeo-Johnson transformation to all numeric predictors.
  step_YeoJohnson(all_numeric_predictors()) %>%
  # Centre and scale all numeric predictors so they share a common scale.
  step_normalize(all_numeric_predictors()) %>%
  # One-hot encode the remaining nominal predictors.
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  # Drop any predictor that has a single unique value.
  step_zv(all_predictors())

# PCA recipe: indicator columns, scaling, then principal components.
pca_rec <-
  recipe(sale_price_log ~ ., data = ames_train) %>%
  # Assign factor levels not seen during training to a dedicated new level.
  step_novel(all_nominal_predictors()) %>%
  # Convert nominal predictors to indicator columns.
  step_dummy(all_nominal_predictors()) %>%
  # Drop any predictor that has a single unique value.
  step_zv(all_predictors()) %>%
  # Centre and scale all numeric predictors so they share a common scale.
  step_normalize(all_numeric_predictors()) %>%
  # Replace the predictors with the principal components needed to retain
  # 95% of their variance.
  step_pca(all_predictors(), threshold = 0.95)
# # THERE ARE 309 COLUMNS IN THE BASE RECIPE
# base_rec %>%
# prep() %>%
# juice() %>%
# ncol()
#
# z <- normalise_rec %>%
# prep() %>%
# juice() %>%
# ncol()
#
# pca_rec %>%
# prep() %>%
# juice() %>%
# ncol()
#
# # THERE ARE 297 COLUMNS IN THE NORMALISED RECIPE
# pca_rec %>%
# prep() %>%
# juice() %>%
# ncol()
# BUILD MODELS -----------------------------------------------------------

# Bagged decision trees (rpart engine, 25 bootstrap members); depth, minimum
# node size and cost-complexity are tuned.
bagged_spec <-
  bag_tree(
    tree_depth = tune(),
    min_n = tune(),
    cost_complexity = tune()
  ) %>%
  set_mode("regression") %>%
  set_engine("rpart", times = 25L)

# Random forest (ranger engine) with 500 trees; mtry and minimum node size are
# tuned.
rf_spec <-
  rand_forest(
    trees = 500,
    mtry = tune(),
    min_n = tune()
  ) %>%
  set_engine("ranger") %>%
  set_mode("regression")

# Gradient-boosted trees (xgboost engine) with 500 trees; every other main
# argument is tuned.
# NOTE(review): `importance = TRUE` is forwarded to the xgboost engine as-is;
# it looks like an argument meant for another engine - confirm it is needed.
xgb_spec <-
  boost_tree(
    trees = 500,
    tree_depth = tune(),
    min_n = tune(),
    loss_reduction = tune(),
    sample_size = tune(),
    mtry = tune(),
    learn_rate = tune()
  ) %>%
  set_engine("xgboost", importance = TRUE) %>%
  set_mode("regression")

# Gradient-boosted trees (lightgbm engine) with 500 trees; the learning rate,
# the early-stopping iterations and the engine-specific num_leaves are tuned.
bt_spec <-
  boost_tree(
    trees = 500,
    learn_rate = tune(),
    stop_iter = tune()
  ) %>%
  set_engine("lightgbm", num_leaves = tune()) %>%
  set_mode("regression")
# DEFINE A WORKFLOW SET ---------------------------------------------------
# Cross every recipe with every model (3 recipes x 4 models). The workflow ids
# are "<recipe>_<model>", e.g. "base_xgb".
wflw_set <-
  workflow_set(
    preproc = list(base = base_rec, normalise = normalise_rec, pca = pca_rec),
    models = list(
      xgb = xgb_spec,
      bagged = bagged_spec,
      rf = rf_spec,
      bt = bt_spec
    ),
    cross = TRUE
  )

# Build the tuning-parameter set of one workflow with an explicit `mtry` range.
#
# The upper bound of `mtry` depends on how many predictors the recipe
# produces, so it is supplied by the caller instead of being left open.
#
# Arguments:
#   wflow_set   A workflow set that contains `wflow_id`.
#   wflow_id    Id of the workflow whose parameter set is extracted.
#   mtry_upper  Largest `mtry` value to consider.
#
# Returns the workflow's dials parameter set with `mtry` ranging from 1 to
# `mtry_upper`.
mtry_param_set <- function(wflow_set, wflow_id, mtry_upper) {
  wflow_set %>%
    extract_workflow(id = wflow_id) %>%
    hardhat::extract_parameter_set_dials() %>%
    update(mtry = mtry(c(1, mtry_upper)))
}

# UPDATE THE MTRY RANGE OF EVERY WORKFLOW THAT TUNES MTRY (XGB AND RF)
# NOTE(review): the upper bounds are hard-coded; they presumably reflect the
# number of predictors each recipe produces - re-check them whenever a recipe
# changes.
base_xgb_param <- mtry_param_set(wflw_set, "base_xgb", 308)
base_rf_param <- mtry_param_set(wflw_set, "base_rf", 308)
normalise_xgb_param <- mtry_param_set(wflw_set, "normalise_xgb", 284)
normalise_rf_param <- mtry_param_set(wflw_set, "normalise_rf", 284)
pca_xgb_param <- mtry_param_set(wflw_set, "pca_xgb", 5)
pca_rf_param <- mtry_param_set(wflw_set, "pca_rf", 5)

# UPDATE THE WORKFLOW SET WITH THE NEW PARAMETERS
# Attach each finalised parameter set to its workflow as the `param_info`
# option so that it is used when the tuning grid is generated.
wf_set_tune_list_finalize <- wflw_set %>%
  option_add(param_info = base_xgb_param, id = "base_xgb") %>%
  option_add(param_info = base_rf_param, id = "base_rf") %>%
  option_add(param_info = normalise_xgb_param, id = "normalise_xgb") %>%
  option_add(param_info = normalise_rf_param, id = "normalise_rf") %>%
  option_add(param_info = pca_xgb_param, id = "pca_xgb") %>%
  option_add(param_info = pca_rf_param, id = "pca_rf")
# CONFIGURE THE RACING CONTROL OBJECT
# Keep the out-of-sample predictions and the workflow with the results, and
# allow parallel processing over "everything".
race_ctrl <-
  control_race(
    save_pred = TRUE,
    parallel_over = "everything",
    save_workflow = TRUE
  )

# DETECT THE NUMBER OF PHYSICAL CORES
# detectCores() returns NA when the count cannot be determined; fall back to
# a single worker in that case so that cluster creation does not fail.
cores <- parallel::detectCores(logical = FALSE)
if (is.na(cores)) {
  cores <- 1L
}

# CREATE A SET OF COPIES OF R RUNNING IN PARALLEL AND COMMUNICATING VIA SOCKETS
# Namespaced because the parallel package is not attached by this script.
cl <- parallel::makePSOCKcluster(cores)

# REGISTER THE PARALLEL BACKEND
# The cluster object has to be passed as `cl`. Passing it as `cores` (which
# expects a number) would leave the cluster created above unused.
doParallel::registerDoParallel(cl)

# APPLY RACE ANOVA TUNING TO EACH WORKFLOW IN THE WORKFLOW SET
# Each workflow is tuned on the same resamples with a 5-candidate grid; the
# elapsed time is reported by tictoc.
tictoc::tic()
race_results <- wf_set_tune_list_finalize %>%
  workflow_map(
    "tune_race_anova",
    seed = 123,
    resamples = ames_folds,
    grid = 5,
    control = race_ctrl,
    verbose = TRUE
  )
tictoc::toc()

# SHUT DOWN THE WORKER PROCESSES
# Release the worker R sessions once tuning has finished. Register a backend
# again before running any further parallel computation.
parallel::stopCluster(cl)
# EXTRACT THE BEST RESULTS
# Pull the tuning result of the "base_xgb" workflow out of the workflow set
# and keep the parameter combination with the best RMSE.
best_results <- race_results %>%
  extract_workflow_set_result(id = "base_xgb") %>%
  select_best(metric = "rmse")
当您说“提取超参数”时,我假设您的意思是您想要查找与工作流集结果的每个性能指标关联的超参数值。
请注意,在工作流集中,不同的调优结果可能使用不同的调优参数。因此,如果要把所有结果的调优参数收集到一起(每个调优参数占一列),那么对于没有用到某个参数的调优结果,该列的值将为缺失值(NA);这就是为什么工作流集的 `collect_metrics()` 默认不包含这些列。
您的 `best_results` 对象包含了该最佳组合的调优参数值。要查找所有结果的超参数值,可以对每个结果分别调用 `collect_metrics()`,再把输出按行绑定在一起,不匹配的列会以缺失值(NA)填充。下面以内置的 `chi_features_res` 对象为例:
library(tidymodels)
chi_features_res
#> # A workflow set/tibble: 3 × 4
#> wflow_id info option result
#> <chr> <list> <list> <list>
#> 1 date_lm <tibble [1 × 4]> <opts[2]> <rsmp[+]>
#> 2 plus_holidays_lm <tibble [1 × 4]> <opts[2]> <rsmp[+]>
#> 3 plus_pca_lm <tibble [1 × 4]> <opts[3]> <tune[+]>
chi_features_res %>%
  # take the list column holding one tuning/resampling result per workflow
  pull(result) %>%
  # collect the metrics of each result separately
  map(collect_metrics) %>%
  # stack them by row; columns missing from a result are filled with NA
  bind_rows()
#> # A tibble: 40 × 7
#> .metric .estimator mean n std_err .config num_comp
#> <chr> <chr> <dbl> <int> <dbl> <chr> <int>
#> 1 rmse standard 0.733 1 NA Preprocessor1_Model1 NA
#> 2 rsq standard 0.982 1 NA Preprocessor1_Model1 NA
#> 3 rmse standard 0.646 1 NA Preprocessor1_Model1 NA
#> 4 rsq standard 0.986 1 NA Preprocessor1_Model1 NA
#> 5 rmse standard 0.609 1 NA Preprocessor01_Model1 15
#> 6 rsq standard 0.987 1 NA Preprocessor01_Model1 15
#> 7 rmse standard 0.642 1 NA Preprocessor02_Model1 18
#> 8 rsq standard 0.986 1 NA Preprocessor02_Model1 18
#> 9 rmse standard 0.586 1 NA Preprocessor03_Model1 3
#> 10 rsq standard 0.989 1 NA Preprocessor03_Model1 3
#> # ℹ 30 more rows
创建于 2024-01-08,使用 reprex v2.0.2
您可以用 `race_results` 代替 `chi_features_res`。 :)