我已经运行了多次 xgb 调参过程,但结果只以文本形式(更准确地说,是元数据)保存了下来,其中记录了模型参数和性能指标。
它具有以下结构:
str(p)
'data.frame': 130 obs. of 10 variables:
$ mtry : int 922 1046 512 1317 675 1303 518 1029 1345 1180 ...
$ min_n : int 34 36 73 89 91 32 73 52 75 93 ...
$ tree_depth : int 44 33 43 37 34 48 25 19 38 41 ...
$ learn_rate : num 0.0236 0.0257 0.0292 0.0254 0.0271 0.023 0.025 0.0226 0.0281 0.0641 ...
$ loss_reduction: num 0.0268 0.745 0.148 0.171 0.0275 ...
$ sample_size : num 0.967 0.947 0.789 0.825 0.973 0.521 0.798 0.813 0.993 0.959 ...
$ .metric : chr "mn_log_loss" "mn_log_loss" "mn_log_loss" "mn_log_loss" ...
$ .estimator : chr "binary" "binary" "binary" "binary" ...
$ mean : num 0.423 0.424 0.424 0.424 0.424 0.425 0.425 0.426 0.427 0.427 ...
$ std_err : num 0.000382 0.000439 0.000408 0.000344 0.000368 0.000407 0.000386 0.000398 0.000392 0.000441 ...
现在我想使用这些元数据作为 tune_bayes 操作的初始值,但得到了以下错误:
Error in check_initial():
! initial should be a positive integer or the results of [tune_grid()]
Run rlang::last_trace() to see where the error occurred.
如何将元数据带入调整对象而不重新运行计算?
这是一个可重现的示例:
param
代表我拥有的元数据,我想将其输入tune_bayes而无需再次调整,因为这太耗时了。
library(tidyverse)
library(tidymodels)
# Load the `cells` data, drop the `case` column, and split into
# training and testing sets.
data("cells")
set.seed(123)
df_split <- cells %>%
  select(-case) %>%
  initial_split()
df_train <- training(df_split)
df_test <- testing(df_split)

# Five-fold cross-validation resamples built from the training set only.
set.seed(123)
df_train_folds <- vfold_cv(df_train, v = 5)
# /////////////////////////////////////////////////////////////////////////////
# Preprocessing recipe: model `class` on every other column, remove
# zero-variance predictors, then normalize the numeric predictors.
recipe_df <- recipe(class ~ ., data = df_train) %>%
  step_zv(all_predictors()) %>%
  step_normalize(all_numeric_predictors())

# Train the recipe a single time and print the trained object for inspection.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
xgb_prep <- prep(recipe_df, verbose = TRUE)
xgb_prep
# Model specification: boosted trees with a fixed number of trees (100);
# every other listed hyperparameter is marked for tuning.
xgb_spec <-
  boost_tree(
    trees = 100,
    mtry = tune(),
    min_n = tune(),
    tree_depth = tune(),
    learn_rate = tune(),
    loss_reduction = tune(),
    sample_size = tune()
  ) %>%
  set_engine("xgboost") %>%
  set_mode("classification")
# Merge model and preprocessing into a workflow.
# The untrained recipe (`recipe_df`) is used on purpose: the workflow
# estimates the recipe itself on each resample, and add_recipe() does not
# accept a recipe that has already been prepped.
xgb_wf <- workflow() %>%
  add_model(xgb_spec) %>%
  add_recipe(recipe_df)
# Set ranges for the tuning parameters.
# extract_parameter_set_dials() replaces the deprecated parameters() method
# for workflows.
xgb_params <-
  extract_parameter_set_dials(xgb_wf) %>%
  update(
    # Learning rate, on the natural scale (no transformation).
    learn_rate = learn_rate(c(0.01, 0.3), trans = NULL),
    # Tree depth.
    tree_depth = tree_depth(c(3, 100)),
    # Minimum number of observations in a node.
    min_n = min_n(c(1, 10)),
    # Loss reduction, on the natural scale (no transformation).
    loss_reduction = loss_reduction(c(1, 5), trans = NULL),
    # Proportion of rows sampled.
    sample_size = sample_prop(c(0.5, 1)),
    # The upper bound of mtry comes from the number of predictor columns, so
    # the outcome column `class` is excluded before finalizing.
    mtry = finalize(mtry(), df_train %>% select(-class))
  )
# Stand-in for the saved tuning metadata: one row per evaluated candidate.
# Column names and types follow the structure of the saved results
# (integer mtry/min_n/tree_depth; `.metric` and `.estimator` with a leading
# dot). `learn_rate` is a two-element vector like every other per-row column.
param <- tibble(
  mtry = c(922L, 1300L),
  min_n = c(34L, 54L),
  tree_depth = c(44L, 28L),
  learn_rate = c(0.0236, 0.013),
  loss_reduction = c(0.26800, 0.26800),
  sample_size = c(1, 0.967),
  .metric = "mn_log_loss",
  .estimator = "binary",
  mean = c(0.423, 0.424),
  std_err = c(0.00042, 0.000382)
)
options(tidymodels.dark = TRUE)

# Search settings, assembled up front and passed to tune_bayes() below.
bayes_ctrl <- control_bayes(
  no_improve = 10,
  parallel_over = "everything",
  verbose = TRUE
)
log_loss_only <- metric_set(mn_log_loss)

# NOTE(review): `initial` receives the plain metadata tibble `param`. This is
# the call the question is about; the reported check_initial() error says
# `initial` must be a positive integer or the results of tune_grid().
set.seed(123)
xgb_res <- tune_bayes(
  xgb_wf,
  resamples = df_train_folds,
  iter = 5,
  initial = param,
  param_info = xgb_params,
  metrics = log_loss_only,
  control = bayes_ctrl
)
这里是 tune_grid 的文档:https://github.com/tidymodels/tune/blob/main/R/tune_grid.R ,但它并没有让我走得太远。
谢谢!
很难给你一个准确的答案,因为没有代码,但我怀疑问题是
mtry
参数不知道上限是多少(因为它基于数据中预测变量的数量)。网格搜索可以解决这个问题,但贝叶斯优化需要您设置它。
您可以从模型规范中获取参数信息,设置
mtry
的范围,然后使用调整对象(不是数据框)作为 tune_bayes()
的输入以及参数信息。
这是一个例子:
library(tidymodels)
# Simulated regression data and the resamples built from it.
set.seed(1)
sim_tr <- sim_regression(250)
sim_rs <- vfold_cv(sim_tr)

# Boosted trees with 20 trees; `mtry` and `min_n` are left for tuning.
xgb_spec <- set_mode(
  boost_tree(trees = 20, mtry = tune(), min_n = tune()),
  "regression"
)
# Initial grid search; its result is what tune_bayes() later receives as
# `initial`.
set.seed(2)
initial_res <- tune_grid(
  xgb_spec,
  outcome ~ .,
  resamples = sim_rs,
  grid = 10
)
#> i Creating pre-processing data to finalize unknown parameter: mtry
# The tune object `initial_res` is the input for the Bayesian search.
# Before that, pull the parameter set from the model spec so that the
# range of mtry can be set.
xgb_param <- extract_parameter_set_dials(xgb_spec)

# Printing shows 'Model parameters needing finalization:' (see below).
xgb_param
#> Collection of 2 parameters for tuning
#>
#> identifier type object
#> mtry mtry nparam[?]
#> min_n min_n nparam[+]
#>
#> Model parameters needing finalization:
#> # Randomly Selected Predictors ('mtry')
#>
#> See `?dials::finalize` or `?dials::update.parameters` for more information.

# Give mtry an explicit range so no parameter is left unfinalized.
xgb_param <- update(xgb_param, mtry = mtry(c(1, 20)))
# Bayesian optimization seeded with the grid-search results.
set.seed(3)
bayes_res <- tune_bayes(
  xgb_spec,
  outcome ~ .,
  resamples = sim_rs,
  # The tune object from tune_grid(), not a data frame.
  initial = initial_res,
  iter = 4,
  # Parameter information with the mtry range filled in.
  param_info = xgb_param
)
# Show the best candidates by RMSE. Naming the metric explicitly selects the
# same metric that would otherwise be picked by default, without the
# "No value of `metric` was given" warning.
show_best(bayes_res, metric = "rmse")
#> # A tibble: 5 × 9
#> mtry min_n .metric .estimator mean n std_err .config .iter
#> <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr> <int>
#> 1 18 9 rmse standard 16.3 10 1.01 Preprocessor1_Model04 0
#> 2 13 5 rmse standard 16.8 10 0.935 Preprocessor1_Model09 0
#> 3 16 10 rmse standard 16.9 10 0.940 Iter2 2
#> 4 20 14 rmse standard 17.0 10 1.26 Iter1 1
#> 5 12 16 rmse standard 17.3 10 1.13 Preprocessor1_Model02 0
创建于 2024-04-23,使用 reprex v2.1.0