我正在尝试训练两个 GBM 模型,第一个模型将频率作为响应变量,第二个将索赔数量作为响应,曝光量作为偏移列,但是,我没有看到两个最佳模型之间有任何差异当我进行超参数调整时。 我得到相同的 RMSE。
DF=data[-extreme_ind, ]
DF[,c(4:60)]<- lapply(DF[,c(4:60)], factor)
df=as.h2o(DF)
splits <- h2o.splitFrame(df, 0.8, seed=1234)
train <- h2o.assign(splits[[1]], "train.hex")
valid <- h2o.assign(splits[[2]], "valid.hex")
MOD_1_v2 <- h2o.gbm(x=c(4:56, 58:60),y = 61, training_frame = train, validation_frame =valid, ntrees=200) #100
summary(MOD_1_v2)
plot(MOD_1_v2,timestep="number_of_trees",metric="RMSE")
gbm1_parameters <- list(learn_rate = c(0.01,0.05, 0.1),
max_depth = c(3, 5, 6),
sample_rate = c(0.7, 0.75, 0.8),
col_sample_rate = c(0.2, 0.5, 1.0))
gbm1_grid <- h2o.grid("gbm", x = c(4:56, 58:60), y = 61,
grid_id = "gbm_grid",
training_frame = train,
validation_frame = valid,
ntrees=20, #30
seed = 1,
hyper_params = gbm1_parameters)
gbm1_gridp<- h2o.getGrid(grid_id = "gbm_grid",
sort_by = "rmse",
decreasing = FALSE)
print(gbm1_gridp)
best_MOD_1=h2o.getModel(gbm1_gridp@model_ids[[1]])
summary(best_MOD_1)
best_gbm_perf1 <- h2o.performance(model = best_MOD_1,newdata = valid)
best_gbm_perf1
plot(best_MOD_1,timestep="number_of_trees",metric="rmse")
h2o.varimp_plot(best_MOD_1)
MOD_2_v2 <- h2o.gbm(x=c(4:56, 58:60),y = 2,offset_column="APVI", training_frame = train, validation_frame = valid,ntrees=55)
summary(MOD_2_v2) #apres supp outliers
plot(MOD_2_v2,timestep="number_of_trees",metric="RMSE")
gbm2_parameters <- list(learn_rate = c(0.01,0.05, 0.1),
max_depth = c(3, 5),
sample_rate = c(0.7, 0.75, 0.8),
col_sample_rate = c(0.2, 0.5, 1.0))
gbm2_grid <- h2o.grid("gbm", x = c(4:56, 58:60), y = 2,
grid_id = "gbm_grid",
training_frame = train,
validation_frame = valid,
ntrees=55, #10
seed = 123,
hyper_params = gbm2_parameters)
gbm2_gridp<- h2o.getGrid(grid_id = "gbm_grid",
sort_by = "rmse",
decreasing = FALSE)
print(gbm2_gridp)
best_MOD_2=h2o.getModel(gbm2_gridp@model_ids[[1]])
summary(best_MOD_2)
best_gbm_perf2 <- h2o.performance(model = best_MOD_2,newdata = valid)
best_gbm_perf2
如何解决这个问题?
您也可以分享一下打印输出吗?
我的第一个想法是你使用相同的 grid_id = "gbm_grid";请尝试将第二个更改为不同的。
我还将用我的通用数据尝试这个建议,看看这是否是问题所在。
谢谢!