How do I plot multiple ROC curves in a single plot in R from three machine learning algorithms run with tidymodels?


I ran three machine learning algorithms (logistic regression, XGBoost, random forest) on a data set called "muvr_model_1" using the tidymodels package in R, with the following code:

library(dplyr)
library(magrittr)
library(ggplot2)
library(tidyverse)
library(tidymodels)


####
#Logistic regression

set.seed(9011) #For reproducibility: the data are split the same way every time, so there is less randomness in the evaluation


muvr_model_1_split <- initial_split(muvr_model_1, strata = Label_numeric_response, prop = 0.7) #strata ensures approximately equal proportions of the outcome in training and test data; 70% goes to training data

muvr_model_1_train <- training(muvr_model_1_split) #70% of data
muvr_model_1_test <- testing(muvr_model_1_split) #30% of data

##Heuristic – logistic regression

log_model <- logistic_reg() %>% #We want a logistic regression model
  
  set_engine('glm') %>% #Fit with glm; other engines can also fit logistic regression models
  
  set_mode('classification') #The model should classify

log_fit <- log_model %>% #Fit our model to the training data
  
  fit(Label_numeric_response ~ ., data = muvr_model_1_train) #Model the response against all other predictors

tidy(log_fit) #Summarizes the model fitted to the training data

log_class_pred <- predict(log_fit, new_data = muvr_model_1_test, type = 'class') #What outcome does it predict?

print(log_class_pred, n = 68)

log_prob_pred <- predict(log_fit, new_data = muvr_model_1_test, type = 'prob') # What probability for each class?

#Heuristic result
log_results <- muvr_model_1_test %>% select(Label_numeric_response) %>% #Getting the true class
  bind_cols(log_class_pred, log_prob_pred) #Adds predicted class and probabilities

#Confusion matrix
conf_mat(log_results, truth = Label_numeric_response, estimate = .pred_class)
conf_mat(log_results, truth = Label_numeric_response, estimate = .pred_class) %>%
  summary()

log_results %>% roc_curve(truth = Label_numeric_response, .pred_No) %>% autoplot()
roc_auc(log_results, truth = Label_numeric_response, .pred_No)


#Streamlines workflow in tidymodels

log_last_fit <- log_model %>%
  last_fit(Label_numeric_response ~ ., split = muvr_model_1_split)

log_last_fit %>% collect_metrics()

#Cross-validation part

set.seed(9011)

muvr_model_1_folds_5 <- vfold_cv(muvr_model_1_train, v = 5, repeats = 5)

log_recipe <- recipe(Label_numeric_response ~ ., data = muvr_model_1_train) %>% #recipe() takes no family argument; the glm engine handles that
  step_normalize(all_numeric()) %>%
  step_dummy(all_nominal(), -all_outcomes())

log_recipe %>% prep() #Prints a summary of the recipe steps

#After resampling we have validated the model, i.e. evaluated it on held-out parts of the training data

log_reg_model <- logistic_reg() %>%
  set_mode('classification') %>%
  set_engine('glm')

logistic_wf <- workflow() %>%
  add_recipe(log_recipe) %>% 
  add_model(log_reg_model)

log_reg_resamp <- fit_resamples(logistic_wf, muvr_model_1_folds_5) ## TRAIN with Cross validation

collect_metrics(log_reg_resamp)

#With last_fit, we predict the test data

logistic_wf %>% last_fit(muvr_model_1_split) %>% collect_metrics() ## TEST
######

#####
#XG-BOOST

library(xgboost)

xgb_recipe <- recipe(Label_numeric_response ~ ., data = muvr_model_1_train) %>%
  update_role(Label_numeric_response, new_role = "outcome") %>% #Redundant (the formula already sets this role) but harmless
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  step_zv(all_predictors())

hf_xgb_model <- boost_tree( mtry = tune(),
                            trees = tune(), min_n = tune(),
                            tree_depth = tune(),
                            learn_rate = tune()) %>%
  set_mode('classification') %>%
  set_engine('xgboost')

hf_xgb_wflow <- workflow() %>% add_recipe(xgb_recipe) %>% add_model(hf_xgb_model)


set.seed(9192)

hf_xgb_grid <- grid_latin_hypercube(
  finalize(mtry(), muvr_model_1_train), #Sets mtry's upper bound from the data; strictly this should exclude the outcome column
  trees(),
  min_n(),
  tree_depth(),
  learn_rate(),
  size = 100)

library(finetune)

auc_metric <- metric_set(yardstick::roc_auc) #Measure the models' predictive ability with ROC AUC

xgb_ctrl <- control_race(
  save_pred = TRUE,
  save_workflow = TRUE,
  verbose_elim = TRUE)

set.seed(9011)

hf_xgb_result <- hf_xgb_wflow %>%
  tune_race_anova( #tune_race_anova races the candidates, which is faster than a full grid search
    resamples = muvr_model_1_folds_5, #Resampling scheme
    grid = hf_xgb_grid, #Which grid should be evaluated?
    metrics = auc_metric, #Which metrics should the models be evaluated with?
    control = xgb_ctrl) #Verbosity is set via control_race(), not a verbose argument here

collect_metrics(hf_xgb_result) %>%
  print(n = 18)

show_best(hf_xgb_result, metric = "roc_auc") ##TRAIN

#Visualize hyperparameter tuning

hf_xgb_result %>%
  collect_metrics() %>%
  filter(.metric == "roc_auc") %>%
  select(mean, mtry:learn_rate) %>%
  pivot_longer(mtry:learn_rate, names_to = "parameter", values_to = "value") %>%
  ggplot(aes(value, mean, color = parameter)) +
  geom_point(show.legend = FALSE) +
  facet_wrap(~parameter, scales = "free_x") +
  labs(x = NULL, y = "AUC")


#Choose the best model and predict in test data

best_xgb <- select_best(hf_xgb_result, metric = "roc_auc")
final_xgb <- finalize_workflow(hf_xgb_wflow, best_xgb)
final_xgb_res <- last_fit(final_xgb, muvr_model_1_split, metrics = auc_metric)
collect_metrics(final_xgb_res)    ##TEST

#show_best() applies to tuning results, not last_fit() results; use collect_metrics() as above

final_xgb_res %>%
  print(n = 6)

#Feature importance

library(vip)
final_xgb %>%
  fit(data = muvr_model_1_train) %>%
  extract_fit_parsnip() %>% #pull_workflow_fit() is deprecated in favor of extract_fit_parsnip()
  vip(geom = "col")

######


#####
#Random forest

set.seed(9011) #For reproducibility

CM_metric <- metric_set(yardstick::roc_auc)
CM_recipe <- recipe(Label_numeric_response ~ ., data = muvr_model_1_train) %>% step_normalize(all_numeric()) %>%
  step_dummy(all_nominal(), -all_outcomes())

rand_model <- rand_forest(
  mtry = tune(),
  trees = tune(),
  min_n = tune()) %>%
  set_mode('classification') %>%
  set_engine('ranger')

rand_wflow <- workflow() %>%
  add_recipe(CM_recipe) %>%
  add_model(rand_model)

set.seed(9293)

rand_grid <- grid_latin_hypercube(
  finalize(mtry(), muvr_model_1_train), #Sets mtry's upper bound from the data; strictly this should exclude the outcome column
  trees(),
  min_n(),
  size = 100 )

rand_ctrl <- control_race(
  save_pred = TRUE,
  save_workflow = TRUE,
  verbose_elim = TRUE)

set.seed(9394)

rand_result <- rand_wflow %>% tune_race_anova(
  resamples = muvr_model_1_folds_5,
  grid = rand_grid,
  metrics = CM_metric,
  control = rand_ctrl) #Verbosity is set via control_race(), not a verbose argument here

collect_metrics(rand_result)

show_best(rand_result, metric = "roc_auc") ##TRAIN

best_rand <- select_best(rand_result, metric = "roc_auc")

best_rand

final_rand <- finalize_workflow(
  rand_wflow, best_rand )

final_rand_res <- last_fit(final_rand,
                           muvr_model_1_split, metrics = CM_metric)

collect_metrics(final_rand_res) ###TEST

######

Question: I want to extract the information from all three algorithms to create ROC curves showing model performance on the TRAINING data and on the TEST data, overlaying all three training ROC curves in one plot and all three test ROC curves in a second plot.

What is the best way to do this in R? Here is a snippet of my data "muvr_model_1":

structure(list(Label_numeric_response = structure(c(1L, 2L, 1L, 
1L, 1L, 1L), levels = c("No", "Yes"), class = "factor"), Acet = c(-0.128692417994666, 
-0.482249202992499, 2.26777375399065, -0.470214149992573, -0.480794796992508, 
0.102343519003918), Val = c(0.0385410601921631, 0.407464020291187, 
-0.75256524002018, -1.36200022018376, -0.790162200030271, -1.01311664009012
), DA = c(-0.379477365588001, -0.230197042960625, 1.60781692938895, 
-0.127566821154304, 0.394914308041514, 0.712134993624688), Mal = c(0.257350020059903, 
0.357840680030046, 0.0171883801312568, -0.412952569740945, 1.25095511976469, 
1.4077690597181), X3.Hydr = c(1.66045651392454, 
-0.38828213999918, -0.830745459982707, -0.904767569979951, -0.0552838890115781, 
0.760685376958043), BAM = c(0.0601078484051758, -0.296157370351331, 
0.814551841066014, 0.206805291422561, -0.463811590942629, 0.500200177457331
)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"
))
1 Answer

A reproducible example always helps :-)

The yardstick metric functions are group-aware, so you can group first and then use autoplot() (see the example below). To add a legend, you can do the same with roc_auc() and then use geom_label() to add the labels (a sketch of this follows the example).

library(tidymodels)
tidymodels_prefer()

set.seed(1)
split <- initial_split(two_class_dat)

lr_res <- 
  last_fit(
    logistic_reg(),
    Class ~ ., 
    split = split)

cart_res <- 
  last_fit(
    decision_tree(cost_complexity = 0) %>% set_mode("classification"),
    Class ~ ., 
    split = split)

bind_rows(
    collect_predictions(lr_res) %>% mutate(model = "logistic reg"), 
    collect_predictions(cart_res) %>% mutate(model = "CART")
    ) %>% 
  group_by(model) %>% 
  roc_curve(Class, .pred_Class1) %>% 
  autoplot()
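
A minimal sketch of the geom_label() legend idea, continuing the example above; the label coordinates (x = 0.75, y = 0.25/0.15) are arbitrary placeholders you would adjust for your own plot:

#Compute the AUC per model, then place the values on the curve plot with geom_label()
auc_labels <- bind_rows(
    collect_predictions(lr_res) %>% mutate(model = "logistic reg"),
    collect_predictions(cart_res) %>% mutate(model = "CART")
    ) %>%
  group_by(model) %>%
  roc_auc(Class, .pred_Class1) %>%
  ungroup() %>%
  mutate(x = 0.75, y = c(0.25, 0.15)) #Arbitrary label positions

bind_rows(
    collect_predictions(lr_res) %>% mutate(model = "logistic reg"),
    collect_predictions(cart_res) %>% mutate(model = "CART")
    ) %>%
  group_by(model) %>%
  roc_curve(Class, .pred_Class1) %>%
  autoplot() +
  geom_label(data = auc_labels,
             aes(x = x, y = y, colour = model,
                 label = paste0(model, ", AUC = ", round(.estimate, 3))),
             inherit.aes = FALSE, show.legend = FALSE)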

Created on 2024-02-20 with reprex v2.0.2
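
Translated to the three models in your question, a rough sketch along the same lines. Two assumptions to flag: collect_predictions() only works on resampling results when the predictions were saved (your race controls set save_pred = TRUE, but the logistic fit_resamples() call would need control_resamples(save_pred = TRUE), so it is refit below), and log_final_res is a hypothetical name, since the question only pipes last_fit() without keeping the result:

#TEST plot: stack the test-set predictions from the three final fits
log_final_res <- last_fit(logistic_wf, muvr_model_1_split) #Hypothetical object: keep the result instead of piping it away

bind_rows(
    collect_predictions(log_final_res) %>% mutate(model = "logistic reg"),
    collect_predictions(final_xgb_res) %>% mutate(model = "xgboost"),
    collect_predictions(final_rand_res) %>% mutate(model = "random forest")
    ) %>%
  group_by(model) %>%
  roc_curve(Label_numeric_response, .pred_No) %>%
  autoplot()

#TRAIN plot: stack the cross-validation predictions; with repeated CV each row
#is predicted once per repeat, and roc_curve() simply pools those predictions
log_reg_resamp <- fit_resamples(logistic_wf, muvr_model_1_folds_5,
                                control = control_resamples(save_pred = TRUE)) #Refit so predictions are saved

bind_rows(
    collect_predictions(log_reg_resamp) %>% mutate(model = "logistic reg"),
    collect_predictions(hf_xgb_result, parameters = best_xgb) %>% mutate(model = "xgboost"), #Only the selected candidate's predictions
    collect_predictions(rand_result, parameters = best_rand) %>% mutate(model = "random forest")
    ) %>%
  group_by(model) %>%
  roc_curve(Label_numeric_response, .pred_No) %>%
  autoplot()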
