使用 h2o 和 R 调整带有偏移列的 GBM 模型

问题描述 投票:0回答:1

我正在尝试训练两个 GBM 模型,第一个模型将频率作为响应变量,第二个将索赔数量作为响应,曝光量作为偏移列,但是,我没有看到两个最佳模型之间有任何差异当我进行超参数调整时。 我得到相同的 RMSE。


DF=data[-extreme_ind, ] 
DF[,c(4:60)]<- lapply(DF[,c(4:60)], factor)


df=as.h2o(DF)
splits <- h2o.splitFrame(df, 0.8, seed=1234)  
train <- h2o.assign(splits[[1]], "train.hex")  
valid <- h2o.assign(splits[[2]], "valid.hex") 

MOD_1_v2 <- h2o.gbm(x=c(4:56, 58:60),y = 61, training_frame = train, validation_frame =valid, ntrees=200) #100
summary(MOD_1_v2)

plot(MOD_1_v2,timestep="number_of_trees",metric="RMSE") 





gbm1_parameters <- list(learn_rate = c(0.01,0.05, 0.1),
                        max_depth = c(3, 5, 6),
                        sample_rate = c(0.7, 0.75, 0.8),  
                        col_sample_rate = c(0.2, 0.5, 1.0))



gbm1_grid <- h2o.grid("gbm", x = c(4:56, 58:60), y = 61,
                      grid_id = "gbm_grid",
                      training_frame = train,
                      validation_frame = valid,  
                      ntrees=20, #30
                      seed = 1,
                      hyper_params = gbm1_parameters)



gbm1_gridp<- h2o.getGrid(grid_id = "gbm_grid",
                         sort_by = "rmse",
                         decreasing  = FALSE)
print(gbm1_gridp)


best_MOD_1=h2o.getModel(gbm1_gridp@model_ids[[1]])

summary(best_MOD_1)




best_gbm_perf1 <- h2o.performance(model = best_MOD_1,newdata = valid)
best_gbm_perf1



plot(best_MOD_1,timestep="number_of_trees",metric="rmse")
h2o.varimp_plot(best_MOD_1)



MOD_2_v2 <- h2o.gbm(x=c(4:56, 58:60),y = 2,offset_column="APVI", training_frame = train, validation_frame = valid,ntrees=55) 

summary(MOD_2_v2) #apres supp outliers 

plot(MOD_2_v2,timestep="number_of_trees",metric="RMSE")


gbm2_parameters <- list(learn_rate = c(0.01,0.05, 0.1),
                        max_depth = c(3, 5),
                        sample_rate = c(0.7, 0.75, 0.8),  
                        col_sample_rate = c(0.2, 0.5, 1.0))




gbm2_grid <- h2o.grid("gbm", x = c(4:56, 58:60), y = 2,
                      grid_id = "gbm_grid",
                      training_frame = train,
                      validation_frame = valid, 
                      ntrees=55, #10
                      seed = 123,
                      hyper_params = gbm2_parameters)


gbm2_gridp<- h2o.getGrid(grid_id = "gbm_grid",
                         sort_by = "rmse",
                         decreasing  = FALSE)
print(gbm2_gridp)



best_MOD_2=h2o.getModel(gbm2_gridp@model_ids[[1]])
summary(best_MOD_2)


best_gbm_perf2 <- h2o.performance(model = best_MOD_2,newdata = valid)
best_gbm_perf2

如何解决这个问题?

r offset h2o hyperparameters gbm
1个回答
0
投票

您也可以分享一下打印输出吗?

我的第一个想法是你使用相同的 grid_id = "gbm_grid";请尝试将第二个更改为不同的。

我还将用我的通用数据尝试这个建议,看看这是否是问题所在。

谢谢!

© www.soinside.com 2019 - 2024. All rights reserved.