我在 tidymodels 环境中使用 LightGBM,但在训练模型时遇到了问题。通过网格搜索选出最终参数后,我想将该模型应用于训练数据集。我的数据集包含 150000 个观测值。当观测值少于 36000 个时,模型的计算时间只有几分钟;而一旦使用完整的数据集,计算就似乎永远无法结束。
下面是我的代码。有谁知道为什么会这样吗?
# pick the hyperparameter combination that is best according to MAE
ijl_best_params <- select_best(ijl_tune, metric = "mae")
knitr::kable(ijl_best_params)
# plug the selected values into the tunable model specification
ijl_model_final <- finalize_model(ijl_model, ijl_best_params)
# print the finalized specification
ijl_model_final
# evaluate performance of training set
# apply the recipe to the training data
train_processed_ijl <- bake(model_recipe_ijl, new_data = dt_ijl_train)
# fit the finalized model on all the training data, then predict
# error_stat_forecast for those same rows and keep the result as a list
l_train_prediction_ijl <- as.list(
  predict(
    fit(
      ijl_model_final,
      formula = error_stat_forecast ~ .,
      data = train_processed_ijl
    ),
    new_data = train_processed_ijl
  )
)
# append the prediction column(s) to the original training columns
l_train_prediction_ijl_merged <- c(l_ijl_train, l_train_prediction_ijl)
# rebuild a data frame from the merged list and sort it by product
train_prediction_ijl <- arrange(
  data.frame(l_train_prediction_ijl_merged),
  product_id
)
提前谢谢您!
# split the data into a training and a test part along the date column
ts_splits <- time_series_split(dt_ijl, date, initial = 30, assess = 6)
dt_ijl_train <- ts_splits %>% training()
dt_ijl_test <- ts_splits %>% testing()
# list copies of both parts; the predictions are appended to these later
l_ijl_train <- dt_ijl_train %>% as.list()
l_ijl_test <- dt_ijl_test %>% as.list()
# build 3 cross-validation folds from the training data
dt_ijl_cv <- dt_ijl_train %>%
  vfold_cv(v = 3)
# preprocessing recipe, estimated (prepped) on the training data
model_recipe_ijl <-
  recipe(error_stat_forecast ~ ., dt_ijl_train) %>%
  # mark product_id and date as identifiers instead of predictors
  update_role(product_id, date, new_role = "id") %>%
  # derive calendar features from the date column
  step_timeseries_signature(date) %>%
  # drop every column whose name matches the pattern below
  step_rm(
    matches("(.xts$)|(.iso$)|(hour)|(minute)|(second)|(day)|(week)|(am.pm)")
  ) %>%
  # the raw date column is no longer needed
  step_rm(date) %>%
  # drop near-zero-variance predictors
  step_nzv(all_predictors()) %>%
  # drop predictors that hold a single constant value
  step_zv(all_predictors()) %>%
  # one-hot encode all nominal predictors
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  # estimate the steps on the training data
  prep()
# boosted-tree regression specification; every main argument is left
# open for tuning (the trailing comments give the lightgbm names)
ijl_model <- set_engine(
  boost_tree(
    mode = "regression",
    trees = tune(),          # num_iterations
    tree_depth = tune(),     # max_depth
    min_n = tune(),          # min_data_in_leaf
    loss_reduction = tune(), # min_gain_to_split
    learn_rate = tune()      # learning rate
  ),
  "lightgbm",
  lambda_l1 = 0.6
)
# defining grid search space
ijl_grid_search <-
  parameters(
    learn_rate(range = c(-2, -1)), # log10 scale, i.e. [0.01, 0.1]
    tree_depth(range = c(5L, 10L)),
    min_n(range = c(5L, 20L)),
    loss_reduction(),
    # the lower bound must be at least 1: a candidate with 0 trees asks the
    # engine for zero boosting rounds, which cannot produce a usable model
    trees(range = c(1L, 100L))
  )
# creating 30 hyperparameter combinations with a latin hypercube design
ijl_grid <-
  grid_latin_hypercube(
    ijl_grid_search,
    size = 30
  )
knitr::kable(head(ijl_grid))
# bundle the model formula and the tunable model into one workflow
# NOTE(review): the workflow uses a plain formula, so the tuning step sees
# the raw resampled data rather than the output of model_recipe_ijl, while
# the final fit further below is run on baked data - confirm this is intended
ijl_wf <-
  workflow() %>%
  add_formula(error_stat_forecast ~ .) %>%
  add_model(ijl_model)
# score every grid candidate on the cross-validation folds using MAE
ijl_tune <- tune_grid(
  ijl_wf,
  resamples = dt_ijl_cv,
  grid = ijl_grid,
  metrics = metric_set(mae),
  control = control_grid(verbose = TRUE)
)
# keep the 5 best hyperparameter combinations (by MAE) for inspection
ijl_parameter <- ijl_tune %>%
  show_best(metric = "mae", n = 5)
# pick the single best combination according to MAE
ijl_best_params <- select_best(ijl_tune, metric = "mae")
knitr::kable(ijl_best_params)
# plug the selected values into the tunable model specification
ijl_model_final <- finalize_model(ijl_model, ijl_best_params)
# print the finalized specification
ijl_model_final
# evaluate performance of training and test set
# apply the prepped recipe to both parts of the split
train_processed_ijl <- bake(model_recipe_ijl, new_data = dt_ijl_train)
test_processed_ijl <- bake(model_recipe_ijl, new_data = dt_ijl_test)
# fit the finalized model ONCE on all the training data and reuse that fit
# for both prediction steps; fitting a second time for the test set repeats
# the most expensive step of the script on exactly the same data
# NOTE(review): the formula `~ .` uses every column of the baked data; the
# recipe only gives product_id the "id" role, so it presumably still enters
# the model as a predictor here - confirm this is intended
ijl_fit_final <- ijl_model_final %>%
  fit(
    formula = error_stat_forecast ~ .,
    data = train_processed_ijl
  )
# predict error_stat_forecast for the training data
l_train_prediction_ijl <- ijl_fit_final %>%
  predict(new_data = train_processed_ijl) %>%
  as.list()
# append the predictions to the original training columns
l_train_prediction_ijl_merged <- c(l_ijl_train, l_train_prediction_ijl)
train_prediction_ijl <- data.frame(l_train_prediction_ijl_merged) %>%
  arrange(product_id)
# measure the accuracy of training set
ijl_score_train <-
  train_prediction_ijl %>%
  yardstick::metrics(error_stat_forecast, .pred) %>%
  # .estimate becomes formatted text here; it is meant for display only
  mutate(
    .estimate = format(round(.estimate, 2), big.mark = ","),
    set = "train"
  )
knitr::kable(ijl_score_train)
# use the same training fit to predict the test data
l_test_prediction_ijl <- ijl_fit_final %>%
  predict(new_data = test_processed_ijl) %>%
  as.list()
# append the predictions to the original test columns
l_test_prediction_ijl_merged <- c(l_ijl_test, l_test_prediction_ijl)
test_prediction_ijl <- data.frame(l_test_prediction_ijl_merged) %>%
  arrange(product_id)
# measure the accuracy of test set
ijl_score_test <-
  test_prediction_ijl %>%
  yardstick::metrics(error_stat_forecast, .pred) %>%
  # .estimate becomes formatted text here; it is meant for display only
  mutate(
    .estimate = format(round(.estimate, 2), big.mark = ","),
    set = "test"
  )
knitr::kable(ijl_score_test)
# compare train/test set regarding over-/underfitting:
# one row per set, one column per metric
# pivot_wider() replaces the superseded spread(); names_sort = TRUE keeps the
# metric columns in alphabetical order, rows stay in the order they were
# bound (train, then test)
train_test_ijl <- bind_rows(ijl_score_train, ijl_score_test) %>%
  pivot_wider(
    names_from = .metric,
    values_from = .estimate,
    names_sort = TRUE
  )
# residuals of the test predictions, expressed relative to the prediction
test_prediction_ijl_residuals <- mutate(
  arrange(test_prediction_ijl, product_id),
  residuals = (error_stat_forecast - .pred) / .pred
)
# scatter plot of relative residuals against the predicted values
ggplot(
  data = test_prediction_ijl_residuals,
  mapping = aes(x = .pred, y = residuals)
) +
  geom_point() +
  labs(
    x = "Predicted Statistical Error",
    y = "Residual (%)"
  ) +
  scale_y_continuous(labels = scales::percent)
# keep only the join keys and the prediction
# (this overwrites test_prediction_ijl with the reduced table)
test_prediction_ijl <- select(test_prediction_ijl, product_id, date, .pred)
# attach the demand history, add the predicted error to the statistical
# forecast and compare the result with the actual demand
dt_ijl_forecast_test_set <-
  inner_join(
    test_prediction_ijl,
    dt_demand_history_filter,
    by = c("date", "product_id")
  ) %>%
  mutate(
    ai_forecast = statistical_forecast + .pred,
    residuals = ai_forecast - demand_quantity
  ) %>%
  arrange(product_id)