Manually recreating a tune object #tidymodels

Problem description

The problem is that I have run several xgb tuning processes but only saved the results in text form, or more precisely as metadata: a table holding the model parameters and performance.

It has the following structure:

str(p)
'data.frame': 130 obs. of 10 variables:
$ mtry : int 922 1046 512 1317 675 1303 518 1029 1345 1180 ...
$ min_n : int 34 36 73 89 91 32 73 52 75 93 ...
$ tree_depth : int 44 33 43 37 34 48 25 19 38 41 ...
$ learn_rate : num 0.0236 0.0257 0.0292 0.0254 0.0271 0.023 0.025 0.0226 0.0281 0.0641 ...
$ loss_reduction: num 0.0268 0.745 0.148 0.171 0.0275 ...
$ sample_size : num 0.967 0.947 0.789 0.825 0.973 0.521 0.798 0.813 0.993 0.959 ...
$ .metric : chr "mn_log_loss" "mn_log_loss" "mn_log_loss" "mn_log_loss" ...
$ .estimator : chr "binary" "binary" "binary" "binary" ...
$ mean : num 0.423 0.424 0.424 0.424 0.424 0.425 0.425 0.426 0.427 0.427 ...
$ std_err : num 0.000382 0.000439 0.000408 0.000344 0.000368 0.000407 0.000386 0.000398 0.000392 0.000441 ...
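
A table with essentially these columns is what tune's collect_metrics() returns for a finished tuning run, so presumably the metadata was saved along these lines (a hypothetical sketch; xgb_res_old and the file name are placeholders, not from the original runs):

# Hypothetical: how metadata of this shape is typically produced and saved
p <- collect_metrics(xgb_res_old)   # parameters plus .metric, .estimator, mean, std_err
readr::write_csv(p, "xgb_tune_metadata.csv")
# ...later, reloaded as a plain data frame:
p <- readr::read_csv("xgb_tune_metadata.csv")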

Now I want to use this metadata as the initial values for a tune_bayes() run, but it fails with:

Error in check_initial():
! initial should be a positive integer or the results of [tune_grid()]
Run rlang::last_trace() to see where the error occurred.

How can I turn the metadata into a tune object without re-running the computation?

Here is a reproducible example:

param represents the metadata I have; I want to feed it into tune_bayes() without tuning again, because that is too time-consuming.

library(tidyverse)
library(tidymodels) 

data(cells)
set.seed(123)
df_split <- initial_split(cells %>% select(-case))
df_train <- training(df_split)
df_test  <- testing(df_split)

set.seed(123)
df_train_folds <- vfold_cv(df_train, v = 5)


# /////////////////////////////////////////////////////////////////////////////

recipe_df <- recipe(class ~ ., data=df_train) %>% 
  step_zv(all_predictors()) %>% 
  step_normalize(all_numeric_predictors())

# Optional sanity check; note the workflow below needs the *untrained* recipe
prep(recipe_df, verbose = TRUE)


# Create model

xgb_spec <- boost_tree(
  trees = 100,
  tree_depth = tune(),
  min_n = tune(),
  mtry = tune(),
  loss_reduction = tune(),
  sample_size = tune(),
  learn_rate = tune()
) %>%
  set_engine('xgboost') %>%
  set_mode('classification')


# Merge into workflow
xgb_wf <- workflow() %>% 
  add_model(xgb_spec) %>% 
  add_recipe(recipe_df)

# set ranges for parameters
xgb_params <-
  parameters(xgb_wf) %>%
  update(learn_rate = learn_rate(c(0.01, 0.3), trans=NULL),  # range for the learning rate
         tree_depth = tree_depth(c(3, 100)),  # range for the tree depth
         min_n = min_n(c(1, 10)),  # range for the minimum number of observations
         loss_reduction = loss_reduction(c(1, 5),trans=NULL),  # range for the loss reduction
         sample_size= sample_prop(c(0.5,1)),
         mtry = finalize(mtry(), df_train))

param <- tibble(
  mtry           = c(922, 1300),
  min_n          = c(34, 54),
  tree_depth     = c(44, 28),
  learn_rate     = c(0.0236, 0.013),
  loss_reduction = c(0.26800, 0.26800),
  sample_size    = c(1, 0.967),
  .metric        = "mn_log_loss",
  .estimator     = "binary",
  mean           = c(0.423, 0.424),
  std_err        = c(0.00042, 0.000382)
)


options(tidymodels.dark = TRUE)
set.seed(123)

xgb_res <-
  tune_bayes(
    xgb_wf,
    resamples = df_train_folds,
    param_info = xgb_params,
    initial = param,
    iter =5,
    metrics = metric_set(mn_log_loss),
    control = control_bayes(verbose = TRUE,
                            parallel_over = "everything",
                            no_improve = 10)
  )  

Here is the source for tune_grid: https://github.com/tidymodels/tune/blob/main/R/tune_grid.R but it didn't get me very far.

Thanks!

Tags: r, tidymodels, fine-tuning, r-parsnip
1 Answer

It's hard to give you an exact answer since there was originally no code, but I suspect the problem is that the mtry parameter doesn't know what its upper bound should be (because that depends on the number of predictors in the data). Grid search can resolve this on its own, but Bayesian optimization needs you to set it, as the short illustration below shows.
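
For illustration, here is roughly what the unfinalized bound looks like and how finalize() fills it in from data (mtcars is just a stand-in dataset):

library(dials)
mtry()
#> # Randomly Selected Predictors (quantitative)
#> Range: [1, ?]

# finalize() sets the upper bound to the number of predictor columns supplied
finalize(mtry(), x = mtcars[, -1])
#> # Randomly Selected Predictors (quantitative)
#> Range: [1, 10]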

You can get the parameter information from the model specification, set the range for mtry, and then use the tune object (not a data frame) as the input to tune_bayes(), along with that parameter information.

Here is an example:

library(tidymodels)

set.seed(1)
sim_tr <- sim_regression(250)
sim_rs <- vfold_cv(sim_tr)

xgb_spec <- 
  boost_tree(mtry = tune(), min_n = tune(), trees = 20) %>% 
  set_mode("regression")

set.seed(2)
initial_res <- 
  xgb_spec %>% 
  tune_grid(
    outcome ~ .,
    resamples = sim_rs,
    grid = 10
  )
#> i Creating pre-processing data to finalize unknown parameter: mtry

# Use the tune object `initial_res` as the input

# Set parameter range for mtry: 

xgb_param <- 
  xgb_spec %>% 
  extract_parameter_set_dials()

# See 'Model parameters needing finalization:' below
xgb_param
#> Collection of 2 parameters for tuning
#> 
#>  identifier  type    object
#>        mtry  mtry nparam[?]
#>       min_n min_n nparam[+]
#> 
#> Model parameters needing finalization:
#>    # Randomly Selected Predictors ('mtry')
#> 
#> See `?dials::finalize` or `?dials::update.parameters` for more information.

xgb_param <- 
  xgb_param %>% 
  update(mtry = mtry(c(1, 20)))

set.seed(3)
bayes_res <- 
  xgb_spec %>% 
  tune_bayes(
    outcome ~ .,
    resamples = sim_rs,
    initial = initial_res,    # <- tune object
    iter = 4,
    # Provide parameter information: 
    param_info = xgb_param
  )

show_best(bayes_res)
#> Warning: No value of `metric` was given; metric 'rmse' will be used.
#> # A tibble: 5 × 9
#>    mtry min_n .metric .estimator  mean     n std_err .config               .iter
#>   <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                 <int>
#> 1    18     9 rmse    standard    16.3    10   1.01  Preprocessor1_Model04     0
#> 2    13     5 rmse    standard    16.8    10   0.935 Preprocessor1_Model09     0
#> 3    16    10 rmse    standard    16.9    10   0.940 Iter2                     2
#> 4    20    14 rmse    standard    17.0    10   1.26  Iter1                     1
#> 5    12    16 rmse    standard    17.3    10   1.13  Preprocessor1_Model02     0

Created on 2024-04-23 with reprex v2.1.0
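
As for seeding tune_bayes() directly from the saved metadata: there is no supported way to coerce a plain data frame into tune results, but tune_grid() does accept a data frame of parameter combinations as its grid. So one workaround is to rebuild a genuine tune object by re-evaluating only the saved candidates (or just the best few), then pass that as initial. A sketch using the objects from the question; note that it does re-fit those candidates once:

# Rebuild a tune object from the saved metadata `p` by re-evaluating
# only its best parameter combinations, then seed tune_bayes() with it.
seed_grid <- p %>%
  slice_min(mean, n = 10) %>%   # keep the 10 best rows (lowest log loss)
  select(mtry, min_n, tree_depth, learn_rate, loss_reduction, sample_size)

set.seed(123)
initial_res <- tune_grid(
  xgb_wf,
  resamples = df_train_folds,
  grid = seed_grid,             # only these combinations are fit
  metrics = metric_set(mn_log_loss)
)

xgb_res <- tune_bayes(
  xgb_wf,
  resamples = df_train_folds,
  param_info = xgb_params,
  initial = initial_res,        # a real tune object now
  iter = 5,
  metrics = metric_set(mn_log_loss)
)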
