在R中使用留一法交叉验证(leave-one-out cross-validation)得到的准确率非常低(寻求反馈和意见)

问题描述 投票:-1回答:1

我正在尝试通过20次迭代来计算种子数据集(Link to the seeds dataset)上决策树的准确性,但是,总体准确性却很低(30%-35%)。这是我到目前为止所做的:

# Repeated leave-one-out evaluation of an rpart classification tree on the
# seeds data: 20 passes, each holding out every row once.
library(rpart)
# Tab-separated file without a header row; column names are assigned below.
seed = read.csv("seeds_dataset.txt",header= F, sep="\t")
colnames(seed)<- c("area", "per.", "comp.", "l.kernel", "w.kernel","asy_coeff", "lenkernel","type")

sampleSize <- nrow(seed)
# One row per held-out observation, one column per pass.
mat = matrix(nrow=sampleSize, ncol=20) 
for (t in 1:20) {
  # sample() without `replace` draws every row index exactly once, so `data`
  # is the full data set in shuffled order.
  testSampleIdx <- sample(nrow(seed), size=sampleSize)
  data <- seed[testSampleIdx,]

  for (i in 1:nrow(data)){
    # Hold out row i, fit the tree on all remaining rows.
    training = data[-i, ]
    test = data[i, ] 
    classification = rpart(type ~ ., data=training, method="class") 
    prediction = predict(classification, newdata=test, type="class")
    # NOTE(review): `test` is a single row, so this table has only one row
    # (the true label) while `prediction` is a factor - presumably with one
    # level per class seen in training; confirm with levels(prediction).
    cm = table(test$type, prediction)
    # NOTE(review): on a one-row table diag() yields just cm[1, 1], so this is
    # 1 only when the prediction falls in the first column, regardless of the
    # true label. This looks like the source of the ~33% figure.
    accuracy <- sum(diag(cm))/sum(cm)
    mat[i,t] = accuracy 
  }
}
# Report the mean of each column (one pass), then the mean over all cells.
for (i in 1:ncol(mat)){
  print(paste("accuracy for ",i," iteration ", round((mean(mat[, i]))*100,1), "%", sep=""))
}
print(paste("overall accuracy ", round((mean(mat))*100,1), "%", sep=""))

有人能帮我分析一下导致准确率这么低的原因,并给出一些意见和反馈吗?谢谢。

r decision-tree cross-validation
1个回答
0
投票

以下是编辑后的代码(去掉了 type 为 NA 的行,并在抽样时使用了 replace=TRUE):

# Leave-one-out evaluation of an rpart classification tree on the seeds data,
# repeated over 20 bootstrap resamples of the rows.
library(rpart)

# Tab-separated file without a header row; column names are assigned below.
seed.all <- read.csv("~/Downloads/seeds_dataset.txt", header = FALSE,
                     sep = "\t")
colnames(seed.all) <- c("area", "per.", "comp.", "l.kernel", "w.kernel",
                        "asy_coeff", "lenkernel", "type")

# Keep only the rows that carry a class label.
seed <- seed.all[!is.na(seed.all$type), ]

sampleSize <- nrow(seed)
# One row per held-out observation, one column per resample; each cell is
# 1 when the held-out row was classified correctly and 0 otherwise.
mat <- matrix(NA_real_, nrow = sampleSize, ncol = 20)
for (t in seq_len(20)) {
  # Bootstrap resample: the same original row may be drawn several times.
  testSampleIdx <- sample(nrow(seed), size = sampleSize, replace = TRUE)
  data <- seed[testSampleIdx, ]

  for (i in seq_len(nrow(data))) {
    # Remove every copy of the held-out observation from the training set.
    # Dropping only position i would leave its duplicates behind, so the tree
    # would be trained on the very row it is tested on.
    is_held_out <- testSampleIdx == testSampleIdx[i]
    training <- data[!is_held_out, ]
    test <- data[i, ]
    classification <- rpart(type ~ ., data = training, method = "class")
    prediction <- predict(classification, newdata = test, type = "class")
    # Compare the labels directly. A confusion table built from one test row
    # has a single row, and diag() of such a table returns only its first
    # cell, which counted a prediction as correct only when it was the first
    # class and pinned the reported accuracy near 1/3.
    is_correct <- as.character(prediction) == as.character(test$type)
    mat[i, t] <- as.numeric(is_correct)
  }
}
for (i in seq_len(ncol(mat))) {
  print(paste0("accuracy for ", i, " iteration ",
               round(mean(mat[, i]) * 100, 1), "%"))
}
print(paste0("overall accuracy ", round(mean(mat) * 100, 1), "%"))
# NOTE: the figures published with the earlier version of this script
# (per-iteration accuracy between 21.8% and 39.8%, overall 33.3%) came from
# the faulty diag()-based calculation and do not describe this code; rerun
# the script to obtain current values.
© www.soinside.com 2019 - 2024. All rights reserved.