当数据集超过 36000 个数据点时，lightGBM 训练/测试的计算时间会变得无限长

Question

我在 tidymodels 环境中使用 lightgbm，但在训练模型时遇到了问题。通过网格搜索选出最终参数后，我想用该模型在训练数据集上进行拟合。我的数据集包含 150000 个观测值。当我用少于 36000 个观测值运行模型时，计算只需几分钟；而一旦使用完整的数据集，计算就似乎永远无法结束。

下面是我的代码。有谁知道为什么会这样？

# pick the hyperparameter combination that ranks best on MAE in the
# tuning results
ijl_best_params <- select_best(ijl_tune, metric = "mae")

knitr::kable(ijl_best_params)

# plug the selected values into the model specification so that it no longer
# contains open tuning parameters
ijl_model_final <- finalize_model(ijl_model, ijl_best_params)

ijl_model_final

# evaluate performance of training set: apply the prepared recipe to the
# training data first
train_processed_ijl <- bake(model_recipe_ijl, new_data = dt_ijl_train)

# fit the finalized model on all the (baked) training data, then predict
# error_stat_forecast for that same data and keep the result as a list
l_train_prediction_ijl <- as.list(
    predict(
        fit(
            ijl_model_final,
            formula = error_stat_forecast ~ .,
            data    = train_processed_ijl
        ),
        new_data = train_processed_ijl
    )
)

# append the prediction column(s) to the columns of the raw training data
l_train_prediction_ijl_merged <- c(l_ijl_train, l_train_prediction_ijl)

# back to a data frame, ordered by product
train_prediction_ijl <- arrange(
    data.frame(l_train_prediction_ijl_merged),
    product_id
)

提前谢谢您！

Answer 1

# split dt_ijl along its date column into an initial (training) window and an
# assessment (test) window
# NOTE(review): initial = 30 / assess = 6 are plain numbers; confirm which unit
# (rows or time periods) they are interpreted in for this data
ts_splits <- time_series_split(dt_ijl, date, initial = 30, assess = 6)

# materialise both sides of the split as data frames
dt_ijl_test <- testing(ts_splits)
dt_ijl_train <- training(ts_splits)

# column-wise list versions of both sets; they are later concatenated with the
# prediction lists
l_ijl_test <- as.list(dt_ijl_test)
l_ijl_train <- as.list(dt_ijl_train)

# build 3-fold cross-validation resamples from the training data
# NOTE(review): v-fold CV does not respect the time ordering that was used for
# the train/test split above - confirm this is intended
dt_ijl_cv <- dt_ijl_train %>%
    vfold_cv(v = 3)

# preprocess data: the recipe is assembled one step at a time and trained on
# dt_ijl_train at the end
model_recipe_ijl <- recipe(error_stat_forecast ~ ., dt_ijl_train)

# give product_id and date the "id" role instead of the predictor role
# NOTE(review): product_id is never removed by this recipe, so it presumably
# remains a column of the baked data - confirm whether it should reach the
# model formula (error_stat_forecast ~ .) used for the final fit
model_recipe_ijl <- update_role(
    model_recipe_ijl,
    product_id, date,
    new_role = "id"
)

# create calendar features for modelling from the date column
model_recipe_ijl <- step_timeseries_signature(model_recipe_ijl, date)

# remove every column whose name matches one of these patterns
model_recipe_ijl <- step_rm(
    model_recipe_ijl,
    matches("(.xts$)|(.iso$)|(hour)|(minute)|(second)|(day)|(week)|(am.pm)")
)

# remove the original date column itself
model_recipe_ijl <- step_rm(model_recipe_ijl, date)

# remove predictors with (near-)zero variance
model_recipe_ijl <- step_nzv(model_recipe_ijl, all_predictors())

# remove predictors that hold a single constant value
model_recipe_ijl <- step_zv(model_recipe_ijl, all_predictors())

# one-hot encode all nominal predictors
model_recipe_ijl <- step_dummy(
    model_recipe_ijl,
    all_nominal_predictors(),
    one_hot = TRUE
)

# train the recipe so it can be applied with bake() afterwards
model_recipe_ijl <- prep(model_recipe_ijl)

# boosted-tree regression specification; every main argument is left open for
# tuning, the trailing comments give the corresponding lightgbm parameter names
ijl_model <- set_engine(
    boost_tree(
        mode = "regression",
        trees = tune(),           # = num_iterations
        tree_depth = tune(),      # = max_depth
        min_n = tune(),           # = min_data_in_leaf
        learn_rate = tune(),      # = learning rate
        loss_reduction = tune()   # = min_gain_to_split
    ),
    "lightgbm",
    lambda_l1 = .6                # engine-specific argument, fixed (not tuned)
)

# defining grid search space for the five tuned parameters
ijl_grid_search <-
    parameters(learn_rate(range = c(-2, -1)),        # log10 scale: [0.01, 0.1]
               tree_depth(range = c(5, 10)),
               min_n(range = c(5, 20)),
               loss_reduction(),
               # lower bound is 1, not 0: a candidate with zero boosting
               # iterations is a degenerate model and only wastes a grid slot
               trees(range = c(1, 100))
    )

# creating 30 hyperparameter combinations according to latin_hypercube
ijl_grid <-
    grid_latin_hypercube(ijl_grid_search,
                         size = 30)

knitr::kable(head(ijl_grid))

# create workflow: the tunable model spec combined with a plain formula
# NOTE(review): the formula is applied to the raw resamples in dt_ijl_cv, so
# model_recipe_ijl does not take part in tuning, while the final fit further
# down uses baked data - confirm that this difference is intended
ijl_wf <- add_formula(
    add_model(workflow(), ijl_model),
    error_stat_forecast ~ .
)

# evaluate every candidate of ijl_grid on the cross-validation resamples,
# scoring with MAE and printing progress
ijl_tune <- tune_grid(
    ijl_wf,
    resamples = dt_ijl_cv,
    grid      = ijl_grid,
    metrics   = metric_set(mae),
    control   = control_grid(verbose = TRUE)
)

# keep the 5 best-ranked hyperparameter combinations (by MAE) for inspection
ijl_parameter <- ijl_tune %>%
    show_best(metric = "mae", n = 5)

# pick the single best-ranked combination
ijl_best_params <- select_best(ijl_tune, metric = "mae")

knitr::kable(ijl_best_params)

# plug the selected values into the lgbm model specification so that it no
# longer contains open tuning parameters
ijl_model_final <- finalize_model(ijl_model, ijl_best_params)

ijl_model_final

# evaluate performance of training set: apply the prepared recipe to the
# training data
train_processed_ijl <- bake(model_recipe_ijl, new_data = dt_ijl_train)

# fit the finalized model ONCE on all the (baked) training data. The fitted
# object is reused for the train and the test predictions below, so the
# expensive training step is not repeated for the test evaluation.
ijl_fit_final <- ijl_model_final %>%
    fit(
        formula = error_stat_forecast ~ .,
        data    = train_processed_ijl
    )

# predict error_stat_forecast for the training data (in-sample predictions)
l_train_prediction_ijl <- ijl_fit_final %>%
    predict(new_data = train_processed_ijl) %>%
    as.list()

# append the prediction column(s) to the columns of the raw training data
l_train_prediction_ijl_merged <- c(l_ijl_train, l_train_prediction_ijl)

train_prediction_ijl <- data.frame(l_train_prediction_ijl_merged) %>%
    arrange(product_id)

# measure the accuracy of training set
ijl_score_train <-
    train_prediction_ijl %>%
    yardstick::metrics(error_stat_forecast, .pred) %>%
    # round to 2 decimals and add thousands separators; this turns .estimate
    # into a character column
    mutate(.estimate = format(round(.estimate, 2), big.mark = ","),
           set = "train")

knitr::kable(ijl_score_train)

# evaluate performance of test set: apply the same prepared recipe to the
# test data
test_processed_ijl <- bake(model_recipe_ijl, new_data = dt_ijl_test)

# use the model fitted on the training data to predict the test data
l_test_prediction_ijl <- ijl_fit_final %>%
    predict(new_data = test_processed_ijl) %>%
    as.list()

# append the prediction column(s) to the columns of the raw test data
l_test_prediction_ijl_merged <- c(l_ijl_test, l_test_prediction_ijl)

test_prediction_ijl <- data.frame(l_test_prediction_ijl_merged) %>%
    arrange(product_id)

# score the test-set predictions against the observed error_stat_forecast;
# .estimate is rounded to 2 decimals and formatted with thousands separators
# (which makes it a character column), and the rows are tagged as "test"
ijl_score_test <- mutate(
    yardstick::metrics(test_prediction_ijl, error_stat_forecast, .pred),
    .estimate = format(round(.estimate, 2), big.mark = ","),
    set = "test"
)

knitr::kable(ijl_score_test)

# compare train/test set regarding over-/underfitting: one row per set, one
# column per metric. pivot_wider() replaces the superseded spread();
# names_sort = TRUE keeps the metric columns in alphabetical order, rows
# follow the input order (train, then test).
train_test_ijl <- bind_rows(ijl_score_train, ijl_score_test) %>%
    pivot_wider(names_from = .metric,
                values_from = .estimate,
                names_sort = TRUE)

# check residuals: the residual is expressed relative to the prediction
# (observed minus predicted, divided by predicted), not relative to the
# observed value
test_prediction_ijl_residuals <- mutate(
    arrange(test_prediction_ijl, product_id),
    residuals = (error_stat_forecast - .pred) / .pred
)

# scatter plot of the relative residuals against the predictions, with the
# y axis labelled in percent
ggplot(
    data    = test_prediction_ijl_residuals,
    mapping = aes(x = .pred, y = residuals)
) +
    geom_point() +
    labs(x = "Predicted Statistical Error") +
    labs(y = "Residual (%)") +
    scale_y_continuous(labels = scales::percent)

# reduce the test predictions to the join keys and the prediction itself
# (this overwrites test_prediction_ijl)
test_prediction_ijl <- test_prediction_ijl %>%
    select(product_id, date, .pred)

# merge with the demand history on date and product_id. Since only the keys
# and .pred were kept above, statistical_forecast and demand_quantity come
# from dt_demand_history_filter.
dt_ijl_forecast_test_set <- test_prediction_ijl %>%
    inner_join(dt_demand_history_filter, by = c("date", "product_id")) %>%
    # corrected forecast = statistical forecast plus the predicted error
    mutate(ai_forecast = statistical_forecast + .pred) %>%
    # remaining deviation of the corrected forecast from the actual demand
    mutate(residuals = ai_forecast - demand_quantity) %>%
    arrange(product_id)

如果数据集包含超过 36000 个数据点，lightGBM 训练/测试的计算时间将达到无穷大

问题描述投票：0回答：1

1个回答

最新问题

如果数据集包含超过 36000 个数据点，lightGBM 训练/测试的计算时间将达到无穷大

问题描述 投票：0回答：1

1个回答

最新问题

问题描述投票：0回答：1