我正在尝试通过20次迭代来计算种子数据集(Link to the seeds dataset)上决策树的准确性,但是,总体准确性却很低(30%-35%)。这是我到目前为止所做的:
library(rpart)
seed = read.csv("seeds_dataset.txt",header= F, sep="\t")
colnames(seed)<- c("area", "per.", "comp.", "l.kernel", "w.kernel","asy_coeff", "lenkernel","type")
sampleSize <- nrow(seed)
mat = matrix(nrow=sampleSize, ncol=20)
for (t in 1:20) {
testSampleIdx <- sample(nrow(seed), size=sampleSize)
data <- seed[testSampleIdx,]
for (i in 1:nrow(data)){
training = data[-i, ]
test = data[i, ]
classification = rpart(type ~ ., data=training, method="class")
prediction = predict(classification, newdata=test, type="class")
cm = table(test$type, prediction)
accuracy <- sum(diag(cm))/sum(cm)
mat[i,t] = accuracy
}
}
for (i in 1:ncol(mat)){
print(paste("accuracy for ",i," iteration ", round((mean(mat[, i]))*100,1), "%", sep=""))
}
print(paste("overall accuracy ", round((mean(mat))*100,1), "%", sep=""))
任何人都可以向我提供引起这种低准确性的原因的评论和反馈吗?谢谢。
这里是编辑后的代码:
library(rpart)
seed.all = read.csv("~/Downloads/seeds_dataset.txt",header= F, sep="\t")
colnames(seed.all)<- c("area", "per.", "comp.", "l.kernel", "w.kernel","asy_coeff", "lenkernel","type")
seed = seed.all[!is.na(seed.all$type),]
sampleSize <- nrow(seed)
mat = matrix(nrow=sampleSize, ncol=20)
for (t in 1:20) {
testSampleIdx <- sample(nrow(seed), size=sampleSize, replace=TRUE)
data <- seed[testSampleIdx,]
for (i in 1:nrow(data)){
training = data[-i, ]
test = data[i, ]
classification = rpart(type ~ ., data=training, method="class")
prediction = predict(classification, newdata=test, type="class")
cm = table(test$type, prediction)
accuracy <- sum(diag(cm))/sum(cm)
mat[i,t] = accuracy
}
}
for (i in 1:ncol(mat)){
print(paste("accuracy for ",i," iteration ", round((mean(mat[, i]))*100,1), "%", sep=""))
}
## [1] "accuracy for 1 iteration 30.1%"
## [1] "accuracy for 2 iteration 34%"
## [1] "accuracy for 3 iteration 28.6%"
## [1] "accuracy for 4 iteration 34.5%"
## [1] "accuracy for 5 iteration 38.3%"
## [1] "accuracy for 6 iteration 33.5%"
## [1] "accuracy for 7 iteration 33.5%"
## [1] "accuracy for 8 iteration 36.9%"
## [1] "accuracy for 9 iteration 25.7%"
## [1] "accuracy for 10 iteration 31.6%"
## [1] "accuracy for 11 iteration 35.4%"
## [1] "accuracy for 12 iteration 39.8%"
## [1] "accuracy for 13 iteration 38.8%"
## [1] "accuracy for 14 iteration 21.8%"
## [1] "accuracy for 15 iteration 32.5%"
## [1] "accuracy for 16 iteration 34.5%"
## [1] "accuracy for 17 iteration 33%"
## [1] "accuracy for 18 iteration 39.3%"
## [1] "accuracy for 19 iteration 31.1%"
## [1] "accuracy for 20 iteration 33.5%"
print(paste("overall accuracy ", round((mean(mat))*100,1), "%", sep=""))
## [1] "overall accuracy 33.3%"