我使用 R 中的 tidymodels 包使用以下代码对名为“muvr_model_1”的数据运行了三种机器学习算法(逻辑回归、xg-boost、随机森林);
library(dplyr)
library(magrittr)
library(ggplot2)
library(tidyverse)
library(tidymodels)
####
#Logistic regression
set.seed(9011) #For reproducibility. Shares the same way every time. Less chance in evaluation
library(tidymodels)
muvr_model_1_split <- initial_split (muvr_model_1, strata = Label_numeric_response, prop = 0.7) #strata to insure approximately equal percentage of outcomes in both training data and test data. 75% goes to training data
muvr_model_1_train <- training(muvr_model_1_split) #70% of data
muvr_model_1_test <- testing(muvr_model_1_split) #30% of data
##Heuristic – logistic regression
log_model <- logistic_reg() %>% #We want a logistic regression model
set_engine('glm') %>% #To be done according to glm, there are other ways to create logistic regression models
set_mode('classification') #The model should classify
log_fit <- log_model %>% #Takes and fits our model to training data
fit(Label_numeric_response ~ ., data = muvr_model_1_train) #Modeling response against all other factors
tidy(log_fit) # Summarizes the model aligned with the training data
log_class_pred <- predict(log_fit, new_data = muvr_model_1_test, type = 'class') #What outcome does it predict?
print(log_class_pred, n = 68)
log_prob_pred <- predict(log_fit, new_data = muvr_model_1_test, type = 'prob') # What probability for each class?
#Heuristic result
log_results <- muvr_model_1_test %>% select(Label_numeric_response) %>% #Getting the true class
bind_cols(log_class_pred, log_prob_pred) #Adds predicted class and probabilities
#Confusion matrix
conf_mat(log_results, truth = Label_numeric_response, estimate = .pred_class)
conf_mat(log_results, truth = Label_numeric_response, estimate = .pred_class) %>%
summary()
log_results %>% roc_curve(truth = Label_numeric_response, .pred_No) %>% autoplot()
roc_auc(log_results, truth = Label_numeric_response, .pred_No)
#Streamlines workflow in tidymodels
log_last_fit <- log_model %>%
last_fit(Label_numeric_response ~ ., split = muvr_model_1_split)
log_last_fit %>% collect_metrics()
#Includes Cross validation part
set.seed(9011)
muvr_model_1_folds_5 <- vfold_cv(muvr_model_1_train, v = 5, repeats = 5)
log_recipe <- recipe(Label_numeric_response ~ ., data = muvr_model_1_train, family = binomial) %>% step_normalize(all_numeric()) %>% step_dummy(all_nominal(), -all_outcomes())
log_recipe %>% prep() #Provides us with the following information
#After resampling, we have validated, i.e. evaluated on parts of the training data
log_reg_model <- logistic_reg() %>%
set_mode('classification') %>%
set_engine('glm')
logistic_wf <- workflow() %>%
add_recipe(log_recipe) %>%
add_model(log_reg_model)
log_reg_resamp <- fit_resamples(logistic_wf, muvr_model_1_folds_5) ## TRAIN with Cross validation
collect_metrics(log_reg_resamp)
#With last_fit, we predict the test data
logistic_wf %>% last_fit(muvr_model_1_split) %>% collect_metrics() ## TEST
######
#####
#XG-BOOST
library(xgboost)
xgb_recipe <- recipe(Label_numeric_response ~ ., data = muvr_model_1_train) %>% update_role(Label_numeric_response, new_role = "outcome") %>% step_normalize(all_numeric_predictors()) %>% step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
step_zv(all_predictors())
hf_xgb_model <- boost_tree( mtry = tune(),
trees = tune(), min_n = tune(),
tree_depth = tune(),
learn_rate = tune()) %>%
set_mode('classification') %>%
set_engine('xgboost')
hf_xgb_wflow <- workflow() %>% add_recipe(xgb_recipe) %>% add_model(hf_xgb_model)
set.seed(9192)
hf_xgb_grid <- grid_latin_hypercube(
finalize(mtry(), muvr_model_1_train),
trees(),
min_n(),
tree_depth(),
learn_rate(),
size = 100)
library(finetune)
auc_metric <- metric_set(yardstick::roc_auc) #Wants to measure the predictive ability of the model with ROC AUC
xgb_ctrl <- control_race(
save_pred = TRUE,
save_workflow = TRUE,
verbose_elim = TRUE)
set.seed(9011)
hf_xgb_result <- hf_xgb_wflow %>%
tune_race_anova( #tune_race_anova for faster driving. Here, instead, run the function tune_race_anova instead of specifying it as an argument for workflow function
resamples = muvr_model_1_folds_5, # resampling
grid = hf_xgb_grid, # Which grid should it be evaluated on?
metrics = auc_metric, #vilket measures should the models be evaluated according to?
control = xgb_ctrl,
verbose = F) #Write out in the console while I am running
collect_metrics(hf_xgb_result) %>%
print(n = 18)
show_best(hf_xgb_result, "roc_auc") ##TRAIN
#Visualize hyperparameter tuning
hf_xgb_result %>% collect_metrics() %>%
filter(.metric == "roc_auc") %>% select(mean, mtry:learn_rate) %>% pivot_longer(mtry:learn_rate,
names_to = "parameter", values_to = "value") %>%
ggplot(aes(value, mean, color = parameter)) + geom_point(show.legend = FALSE) + facet_wrap(~parameter, scales = "free_x") + labs(x = NULL, y = "AUC")
#Choose the best model and predict in test data
best_xgb <- select_best(hf_xgb_result, "roc_auc")
final_xgb <- finalize_workflow(hf_xgb_wflow, best_xgb)
final_xgb_res <- last_fit(final_xgb, muvr_model_1_split, metrics = auc_metric)
collect_metrics(final_xgb_res) ##TEST
show_best(final_xgb_res, "roc_auc")
final_xgb_res%>%
print(n =6)
#Feature importance
library(vip)
final_xgb %>%
fit(data = muvr_model_1_train) %>%
pull_workflow_fit() %>%
vip(geom = "col")
######
#####
#Random forest
set.seed(9011) #For reproducibility. Shares the same way every time. Less chance in evaluation
CM_metric <- metric_set(yardstick::roc_auc)
CM_recipe <- recipe(Label_numeric_response ~ ., data = muvr_model_1_train) %>% step_normalize(all_numeric()) %>%
step_dummy(all_nominal(), -all_outcomes())
rand_model <- rand_forest(
mtry = tune(),
trees = tune(),
min_n = tune()) %>%
set_mode('classification') %>%
set_engine('ranger')
rand_wflow <- workflow() %>%
add_recipe(CM_recipe) %>%
add_model(rand_model)
set.seed(9293)
rand_grid <- grid_latin_hypercube(
finalize(mtry(), muvr_model_1_train),
trees(),
min_n(),
size = 100 )
rand_ctrl <- control_race(
save_pred = TRUE,
save_workflow = TRUE,
verbose_elim = TRUE)
set.seed(9394)
rand_result <- rand_wflow %>% tune_race_anova(
resamples = muvr_model_1_folds_5,
grid = rand_grid,
metrics = CM_metric,
control = rand_ctrl,
verbose = TRUE )
collect_metrics(rand_result)
show_best(rand_result, "roc_auc") ##TRAIN
best_rand <- select_best(rand_result, "roc_auc")
best_rand
final_rand <- finalize_workflow(
rand_wflow, best_rand )
final_rand_res <- last_fit(final_rand,
muvr_model_1_split, metrics = CM_metric)
collect_metrics(final_rand_res) ###TEST
######
问题:我想从所有三个算法中提取信息来创建一条 ROC 曲线,分别用于 TRINING 和 TEST 数据中的模型性能,并希望在一个图中覆盖所有三个训练 ROC 曲线,在第二个图中覆盖所有三个测试 ROC 曲线阴谋。如下;
在 R 中执行此操作的最佳方法是什么?这是我的数据“muvr_model_1”片段;
structure(list(Label_numeric_response = structure(c(1L, 2L, 1L,
1L, 1L, 1L), levels = c("No", "Yes"), class = "factor"), Acet = c(-0.128692417994666,
-0.482249202992499, 2.26777375399065, -0.470214149992573, -0.480794796992508,
0.102343519003918), Val = c(0.0385410601921631, 0.407464020291187,
-0.75256524002018, -1.36200022018376, -0.790162200030271, -1.01311664009012
), DA = c(-0.379477365588001, -0.230197042960625, 1.60781692938895,
-0.127566821154304, 0.394914308041514, 0.712134993624688), Mal = c(0.257350020059903,
0.357840680030046, 0.0171883801312568, -0.412952569740945, 1.25095511976469,
1.4077690597181), X3.Hydr = c(1.66045651392454,
-0.38828213999918, -0.830745459982707, -0.904767569979951, -0.0552838890115781,
0.760685376958043), BAM = c(0.0601078484051758, -0.296157370351331,
0.814551841066014, 0.206805291422561, -0.463811590942629, 0.500200177457331
)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"
))
一个小可重现的例子总是有帮助的:-)
yardstick 度量函数具有分组感知功能,因此您可以先进行分组,然后使用
autoplot()
(请参见下面的示例)。要添加图例,您可以使用 roc_auc()
执行相同操作,然后使用 geom_label()
添加标签。
library(tidymodels)
tidymodels_prefer()
set.seed(1)
split <- initial_split(two_class_dat)
lr_res <-
last_fit(
logistic_reg(),
Class ~ .,
split = split)
cart_res <-
last_fit(
decision_tree(cost_complexity = 0) %>% set_mode("classification"),
Class ~ .,
split = split)
bind_rows(
collect_predictions(lr_res) %>% mutate(model = "logistic reg"),
collect_predictions(cart_res) %>% mutate(model = "CART")
) %>%
group_by(model) %>%
roc_curve(Class, .pred_Class1) %>%
autoplot()
创建于 2024-02-20,使用 reprex v2.0.2