我已经看到许多有关此特定错误的答案。对于我的特定问题,我尚未找到任何答案。因此,我的问题
这是我的工作:
shortness_breath_data <- data_categ_nosev %>%
dplyr::select(shortness_breath, obesity, asthma, diabetes_type_one, diabetes_type_two, obesity, hypertension, heart_disease, lung_condition, liver_disease, kidney_disease, Covid_tested, Gender)
这是put(head(shortness_breath_data))
:
structure(list(shortness_breath = structure(c(1L, 2L, 1L, 1L,
1L, 2L), .Label = c("No", "Yes"), class = "factor"), obesity = structure(c(1L,
1L, 2L, 2L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
asthma = structure(c(2L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), diabetes_type_one = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
diabetes_type_two = structure(c(2L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), hypertension = structure(c(1L,
1L, 1L, 2L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
heart_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), lung_condition = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
liver_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), kidney_disease = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
Covid_tested = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("negative",
"positive"), class = "factor"), Gender = structure(c(2L,
1L, 2L, 1L, 1L, 2L), .Label = c("Female", "Male", "Other"
), class = "factor")), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"), problems = structure(list(row = c(2910L,
35958L), col = c("how_unwell", "how_unwell"), expected = c("a double",
"a double"), actual = c("How Unwell", "How Unwell"), file = c("'/Users/gabrielburcea/Rprojects/data/data_lev_categorical_no_sev.csv'",
"'/Users/gabrielburcea/Rprojects/data/data_lev_categorical_no_sev.csv'"
)), row.names = c(NA, -2L), class = c("tbl_df", "tbl", "data.frame"
)))
然后将其分为训练和测试数据集。
shortness_breath_data$shortness_breath <- as.factor(shortness_breath_data$shortness_breath)
n <- nrow(shortness_breath_data)
set.seed(22)
trainingdx <- sample(1:n, 0.7 * n)
train <- shortness_breath_data[trainingdx,]
validate <- shortness_breath_data[-trainingdx,]
train %>% distinct(shortness_breath)
validate %>% distinct(shortness_breath)
[为了避免麻烦,我也提供了dput(head(train))
和dput(head(validate))
]进行同样的操作>
火车数据集:
structure(list(shortness_breath = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), obesity = structure(c(2L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), asthma = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), diabetes_type_one = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), diabetes_type_two = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), hypertension = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), heart_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), lung_condition = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), liver_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), kidney_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), Covid_tested = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("negative", "positive"), class = "factor"), Gender = structure(c(1L, 1L, 1L, 2L, 1L, 2L), .Label = c("Female", "Male", "Other" ), class = "factor")), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"), problems = structure(list(row = c(2910L, 35958L), col = c("how_unwell", "how_unwell"), expected = c("a double", "a double"), actual = c("How Unwell", "How Unwell"), file = c("'/Users/gabrielburcea/Rprojects/data/data_lev_categorical_no_sev.csv'", "'/Users/gabrielburcea/Rprojects/data/data_lev_categorical_no_sev.csv'" )), row.names = c(NA, -2L), class = c("tbl_df", "tbl", "data.frame" )))
验证数据集:
structure(list(shortness_breath = structure(c(1L, 2L, 2L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), obesity = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), asthma = structure(c(2L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), diabetes_type_one = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), diabetes_type_two = structure(c(2L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), hypertension = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), heart_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), lung_condition = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), liver_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), kidney_disease = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), Covid_tested = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("negative", "positive"), class = "factor"), Gender = structure(c(2L, 1L, 2L, 2L, 1L, 1L), .Label = c("Female", "Male", "Other" ), class = "factor")), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"), problems = structure(list(row = c(2910L, 35958L), col = c("how_unwell", "how_unwell"), expected = c("a double", "a double"), actual = c("How Unwell", "How Unwell"), file = c("'/Users/gabrielburcea/Rprojects/data/data_lev_categorical_no_sev.csv'", "'/Users/gabrielburcea/Rprojects/data/data_lev_categorical_no_sev.csv'" )), row.names = c(NA, -2L), class = c("tbl_df", "tbl", "data.frame" )))
然后,我使用逐步,向前的方法建立逻辑回归模型。
null_model <- glm(shortness_breath ~ 1, data = train, family = "binomial") fm_shortness_breath <- glm(shortness_breath ~., data = train, family = "binomial") stepmodel <- step(null_model, scope = list(lower = null_model, upper = fm_shortness_breath), direction = "forward")
然后,我得到摘要模型,并将预测结果存储在源数据框中。
summary(stepmodel) validate$pred <- predict(stepmodel, validate, type = "response") validate$real <- validate$shortness_breath train$pred <- predict(stepmodel, train, type = "response") train$real <- train$shortness_breath
然后我可以毫无问题地绘制我的ROC曲线:
plot.roc(validate$real, validate$pred, col = "red", main = "ROC Validation Set", percent = TRUE, print.auc = TRUE)
但是,当我尝试获取混淆矩阵时,这就是我得到错误的地方。但这是我的代码:
cm_stepmodel <- confusionMatrix(stepmodel, validate)
然后出现错误:
Error: `data` and `reference` should be factors with the same levels.
带有显示回溯:
3. stop("`data` and `reference` should be factors with the same levels.", call. = FALSE) 2. confusionMatrix.default(stepmodel, validate) 1. confusionMatrix(stepmodel, validate)
我根本看不出问题所在。并尝试了其他几种选择,但没有用。我已逐步复制了我所采取的确切方法。而且我没有得到答案。另外,我也用RMarkdown,脱字符号和R标记了此问题,以防万一。
此外,使用的库为:
library(tidyverse)
library(conflicted)
library(tidymodels)
library(ggrepel)
library(corrplot)
library(dplyr)
library(corrr)
library(themis)
library(rsample)
library(caret)
library(forcats)
library(rcompanion)
library(MASS)
library(pROC)
library(ROCR)
library(data.table)
我已经看到许多有关此特定错误的答案。对于我的特定问题,我尚未找到任何答案。因此,我的问题是我该怎么做:...
[尝试将您的预测概率转换为标签,然后在其上运行confusionMatrix: