我在 Stack Overflow 中发现了多个关于此的讨论。然而,他们都没有提供解决方案。
使用
caret
包,我想使用 XGBoost 算法作为基础学习器并使用随机森林作为元学习器来训练堆叠集成模型。我也尝试了 mlr
包,但它导致了很多其他问题。
查过资料,没有
NA
值,也没有inf
。我尝试像在其他一些讨论中看到的那样设置classProbs = FALSE
,并将目标变量设置为as.factor
。它没有用。
可能是什么问题?
这是我的代码:
super_train <- function(data_list, n_models = 10, cpus = 32) {
# Set up parallel processing
parallelStartSocket(cpus = cpus)
# Prepare data
train <- data_list[[1]]
test <- data_list[[2]]
train[,1] <- make.names(train[,1])
test[,1] <- make.names(test[,1])
train[,1] <- as.factor(train[,1])
test[,1] <- as.factor(test[,1])
train_control <- trainControl(method = "cv", number = 10, classProbs = TRUE , savePredictions = TRUE, summaryFunction = twoClassSummary, verboseIter = FALSE )
# Create base learners
base_learners <- list()
for (i in 1:n_models) {
print(i)
xgb_grid <- expand.grid(
booster = c("gbtree", "gblinear"),
eta = c(0.01, 0.05, 0.1, 0.3),
max_depth = c(3, 6, 9),
alpha = c(0, 1, 3, 6),
lambda = c(0, 1 , 3 , 6),
gamma = c(0, 1 , 3 , 6),
min_child_weight = c(1, 3, 5),
subsample = c(0.5, 0.7, 1),
colsample_bytree = c(0.5, 0.7, 1),
scale_pos_weight = c(0.5, 1, 3, 5)
)
xgb_train <- caret::train(CR ~ .,
data = train,
method = "xgbTree",
params = xgb_grid,
trControl = train_control,
metric = "ROC")
base_learners[[i]] <- xgb_train$finalModel
}
# Create meta-learner
meta_lrn <- caret::train(
CR ~ .,
data = train,
method = "rf",
trControl = train_control,
metric = "ROC",
importance = TRUE
)
# Create stacked ensemble
stack_control <- trainControl(method = "none")
stack_learner <- caretStack(
models = base_learners,
method = "glm",
trControl = stack_control
)
# Train stacked ensemble
stack_model <- caret::train(
CR ~ .,
data = train,
method = stack_learner,
trControl = train_control,
metric = "ROC"
)
# Evaluate on test set
test_pred <- predict(stack_model, newdata = test)
# Stop parallel processing
parallelStop()
return(stack_model)
}
这就是错误,我们的宿敌:
Something is wrong; all the ROC metric values are missing:
ROC Sens Spec
Min. : NA Min. : NA Min. : NA
1st Qu.: NA 1st Qu.: NA 1st Qu.: NA
Median : NA Median : NA Median : NA
Mean :NaN Mean :NaN Mean :NaN
3rd Qu.: NA 3rd Qu.: NA 3rd Qu.: NA
Max. : NA Max. : NA Max. : NA
NA's :108 NA's :108 NA's :108
这里是训练集的一个子集:
structure(list(CR = structure(c(1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L,
1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L,
1L), levels = c("0", "1"), class = "factor"), Gender_male = c(1L,
0L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 1L,
1L, 1L, 1L, 0L, 0L, 0L, 1L, 0L), Feat1 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), anti1 = c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), Feature2 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Feat3 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Feature4 = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), Feature5 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Feature6 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), row.names = c("Pt1", "Pt2",
"Pt3", "Pt4", "Pt17", "Pt18", "Pt2", "Pt26", "Pt28", "Pt29",
"Pt30", "Pt34", "Pt37", "Pt38", "Pt39", "Pt4", "Pt44", "Pt46",
"Pt47", "Pt48", "Pt5", "Pt52", "Pt59", "Pt62", "Pt65"), class = "data.frame")