在 R 中使用 caret 运行 confusionMatrix 时报错:Error in table(data, reference, dnn = dnn, ...) : all arguments must have the same length(所有参数必须具有相同的长度)

问题描述 投票:0回答:1

我在运行混淆矩阵时遇到了一个问题。

我是这样做的。

# Fit a random forest for `tested` on all remaining columns, selecting the
# tuning value by ROC under the resampling scheme held in ctrlInside.
# na.exclude is handed to train() as the missing-value handler.
rf <- caret::train(
  tested ~ .,
  data = training_data,
  method = "rf",
  metric = "ROC",
  trControl = ctrlInside,
  na.action = na.exclude
)

rf

在我得到我的模型后,这是我的下一步。

# Class probabilities for the test rows. Per the head() output shown further
# down this page, the result has one column per class (negative, positive),
# not a single vector.
evalResult.rf <- predict(rf, testing_data, type = "prob")
# NOTE(review): the comparison runs over every cell of both columns, so the
# resulting factor is longer than testing_data$tested. Mapping "< 0.5" to
# "positive" also looks inverted - confirm intent.
predict_rf <- as.factor(ifelse(evalResult.rf <0.5, "positive", "negative"))

然后我运行我的混淆矩阵。

# Cross-tabulate the predicted labels against the observed `tested` column.
# The third positional argument presumably names the positive class
# (confusionMatrix's `positive` parameter) - confirm against the caret docs.
cm_rf_forest <- confusionMatrix(predict_rf, testing_data$tested, "positive") 

而错误是在我应用混淆矩阵之后出现的。

Error in table(data, reference, dnn = dnn, ...) : 
  all arguments must have the same length

尽管如此,我还是给你一些我的数据:

训练数据。

structure(list(tested = structure(c(1L, 1L, 1L, 1L, 1L, 
1L), .Label = c("negative", "positive"), class = "factor"), Gender = structure(c(2L, 
2L, 1L, 1L, 2L, 2L), .Label = c("Female", "Male", "Other"), class = "factor"), 
    Age = c(63, 23, 28, 40, 31, 60), number_days_symptoms = c(1, 
    1, 16, 1, 14, 1), care_home_worker = structure(c(1L, 2L, 
    1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    health_care_worker = structure(c(1L, 1L, 1L, 1L, 2L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), how_unwell = c(1, 1, 6, 4, 2, 
    1), self_diagnosis = structure(c(1L, 1L, 2L, 1L, 2L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), chills = structure(c(1L, 1L, 2L, 
    1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    cough = structure(c(1L, 1L, 2L, 2L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), diarrhoea = structure(c(1L, 1L, 
    1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    fatigue = structure(c(1L, 2L, 2L, 2L, 2L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), headache = structure(c(2L, 2L, 
    3L, 2L, 2L, 2L), .Label = c("Headcahe", "No", "Yes"), class = "factor"), 
    loss_smell_taste = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), muscle_ache = structure(c(1L, 
    1L, 2L, 2L, 2L, 2L), .Label = c("No", "Yes"), class = "factor"), 
    nasal_congestion = structure(c(1L, 1L, 1L, 2L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), nausea_vomiting = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    shortness_breath = structure(c(1L, 1L, 1L, 1L, 2L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), sore_throat = structure(c(1L, 
    1L, 1L, 2L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    sputum = structure(c(1L, 1L, 2L, 2L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), temperature = structure(c(4L, 
    4L, 4L, 4L, 1L, 4L), .Label = c("37.5-38", "38.1-39", "39.1-41", 
    "No"), class = "factor"), asthma = structure(c(2L, 1L, 1L, 
    1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    diabetes_type_one = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), diabetes_type_two = structure(c(2L, 
    1L, 1L, 1L, 1L, 2L), .Label = c("No", "Yes"), class = "factor"), 
    obesity = structure(c(1L, 2L, 2L, 1L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), hypertension = structure(c(1L, 
    1L, 2L, 1L, 1L, 2L), .Label = c("No", "Yes"), class = "factor"), 
    heart_disease = structure(c(1L, 1L, 1L, 1L, 1L, 2L), .Label = c("No", 
    "Yes"), class = "factor"), lung_condition = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    liver_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), kidney_disease = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor")), row.names = c(1L, 
3L, 4L, 5L, 6L, 7L), class = "data.frame")

这是我的测试数据(testing_data):

structure(list(tested = structure(c(1L, 1L, 1L, 1L, 1L, 
1L), .Label = c("negative", "positive"), class = "factor"), Gender = structure(c(1L, 
2L, 1L, 1L, 1L, 2L), .Label = c("Female", "Male", "Other"), class = "factor"), 
    Age = c(19, 26, 30, 45, 40, 43), number_days_symptoms = c(20, 
    1, 1, 20, 14, 1), care_home_worker = structure(c(1L, 1L, 
    1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    health_care_worker = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), how_unwell = c(7, 6, 6, 6, 6, 
    2), self_diagnosis = structure(c(2L, 1L, 1L, 2L, 2L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), chills = structure(c(2L, 1L, 1L, 
    1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    cough = structure(c(2L, 1L, 1L, 2L, 2L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), diarrhoea = structure(c(2L, 1L, 
    1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    fatigue = structure(c(2L, 1L, 1L, 2L, 2L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), headache = structure(c(2L, 2L, 
    2L, 3L, 2L, 3L), .Label = c("Headcahe", "No", "Yes"), class = "factor"), 
    loss_smell_taste = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), muscle_ache = structure(c(2L, 
    1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    nasal_congestion = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), nausea_vomiting = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    shortness_breath = structure(c(2L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), sore_throat = structure(c(1L, 
    1L, 1L, 2L, 1L, 2L), .Label = c("No", "Yes"), class = "factor"), 
    sputum = structure(c(2L, 1L, 1L, 2L, 1L, 2L), .Label = c("No", 
    "Yes"), class = "factor"), temperature = structure(c(4L, 
    4L, 4L, 1L, 1L, 4L), .Label = c("37.5-38", "38.1-39", "39.1-41", 
    "No"), class = "factor"), asthma = structure(c(1L, 1L, 1L, 
    1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    diabetes_type_one = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), diabetes_type_two = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    obesity = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), hypertension = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    heart_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), lung_condition = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), 
    liver_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
    "Yes"), class = "factor"), kidney_disease = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor")), row.names = c(2L, 
8L, 11L, 14L, 20L, 27L), class = "data.frame")

此外,我还在 ctrlInside 中对重采样的子样本使用了 SMOTE 来平衡类别。

这是我的smote函数。

# Custom subsampling specification for caret::trainControl(sampling = ...).
#   name  - label for this sampling scheme.
#   func  - function(x, y): `x` holds the predictors, `y` the outcome; returns
#           list(x = <resampled predictors>, y = <resampled outcome>).
#   first - TRUE; per the caret subsampling docs this requests that sampling
#           happen before any pre-processing.
# NOTE(review): the label says "more neighbors" but k = 3 is used - confirm
# that this is the intended neighbourhood size.
smotest <- list(
  name = "SMOTE with more neighbors!",
  func = function(x, y) {
    library(DMwR)
    dat <- if (is.data.frame(x)) x else as.data.frame(x)
    # Attach the outcome under a temporary name so SMOTE() can take a formula.
    dat$.y <- y
    dat <- SMOTE(.y ~ ., data = dat, k = 3, perc.over = 100, perc.under = 200)
    # Drop only the temporary outcome column (exact name match, so predictors
    # whose names merely contain ".y" survive) and keep a data frame even
    # when a single predictor remains.
    list(
      x = dat[, colnames(dat) != ".y", drop = FALSE],
      y = dat$.y
    )
  },
  first = TRUE
)

ctrlInside是这样的

# Resampling configuration used by the train() calls: repeated cross-validation
# (number = 10, repeats = 5), scored with twoClassSummary on class
# probabilities, with the custom `smotest` sampling step applied.
ctrlInside <- trainControl(
  method = "repeatedcv",
  repeats = 5,
  number = 10,
  search = "grid",
  sampling = smotest,
  classProbs = TRUE,
  savePredictions = TRUE,
  summaryFunction = twoClassSummary
)

给出这些函数只是为了让你了解我完整的处理流程。出现这个错误的原因可能是什么?

r machine-learning r-caret confusion-matrix
1个回答
0
投票

你可以使用 complete.cases,只对不含 NA 的行进行预测;此外还必须对预测得到的概率矩阵进行处理,我将在下面展示。这里使用一个示例数据集:我把其中一列的 10 个值设为 NA,然后训练模型。

# Build a small two-class example from iris that contains missing values.
set.seed(42)  # fix the random draws so the example is repeatable
idx <- sample(nrow(iris), 100)  # rows used for training; the rest are tested
data <- iris
# Blank out 10 randomly chosen Petal.Length values.
data$Petal.Length[sample(nrow(data), 10)] <- NA
# Outcome: versicolor is "positive", every other species is "negative".
data$tested <- factor(
  ifelse(data$Species == "versicolor", "positive", "negative")
)
# Remove the original label by name rather than by column position.
data$Species <- NULL
training_data <- data[idx, ]
testing_data <- data[-idx, ]

# Train the same random-forest specification as in the question on the
# example training set; ROC is the selection metric and na.exclude is the
# missing-value handler passed to train().
rf <- caret::train(
  form = tested ~ .,
  data = training_data,
  method = "rf",
  na.action = na.exclude,
  metric = "ROC",
  trControl = ctrlInside
)

按同样的方式计算评估结果,可以看到我得到了同样的错误。

# Same three steps as in the question, kept unchanged to reproduce the error.
evalResult.rf <- predict(rf, testing_data, type = "prob")
# Thresholding the two-column probability table yields one label per cell,
# not one label per test row.
predict_rf <- as.factor(ifelse(evalResult.rf <0.5, "positive", "negative"))
# Fails: predict_rf and testing_data$tested differ in length.
cm_rf_forest <- confusionMatrix(predict_rf, testing_data$tested, "positive") 

Error in table(data, reference, dnn = dnn, ...) : 
  all arguments must have the same length

所以,错误有两个来源:第一,你的数据中有 NA,这些行无法被预测;第二,evalResult.rf 返回的是一个概率矩阵,第一列是属于负类(negative)的概率,第二列是属于正类(positive)的概率。

head(evalResult.rf)
   negative positive
3     1.000    0.000
6     1.000    0.000
9     0.948    0.052
12    1.000    0.000
13    0.976    0.024
19    0.998    0.002

为了得到类,你要做的是,得到每行最大值的那一列,然后返回相应的列名,也就是类。

# For each row, find the column holding the largest probability and map it to
# that column's name, i.e. the predicted class label.
colnames(evalResult.rf)[max.col(evalResult.rf)]

我们现在就这样做。

# Keep only test rows without missing values so that every row receives a
# prediction and the predictions line up with testing_data$tested.
testing_data <- testing_data[complete.cases(testing_data), ]
evalResult.rf <- predict(rf, testing_data, type = "prob")
# Predicted class = name of the most probable column. Ties go to the first
# column so repeated runs give the same labels, and the factor carries the
# reference's levels even if one class is never predicted.
predict_rf <- factor(
  colnames(evalResult.rf)[max.col(evalResult.rf, ties.method = "first")],
  levels = levels(testing_data$tested)
)
cm_rf_forest <- confusionMatrix(predict_rf, testing_data$tested,
                                positive = "positive")

Confusion Matrix and Statistics

          Reference
Prediction negative positive
  negative       33        1
  positive        0       11

               Accuracy : 0.9778          
                 95% CI : (0.8823, 0.9994)
    No Information Rate : 0.7333          
    P-Value [Acc > NIR] : 1.507e-05       

                  Kappa : 0.9416     
© www.soinside.com 2019 - 2024. All rights reserved.