我尝试了针对this question的答案,但错误没有改变。我试图以相同的方式预处理训练集和测试集。它们来自两个不同的文件,我不确定我的老师是否会把我混合在一起,所以在拆分它们之前进行预处理并不是一个好选择。为什么predict
线第一次使用训练集而不是测试集起作用?除了行中的各个值和行的总数之外,两个数据框的结构应相同。
##### Load libraries #####
library(readr)
library(caret)
###### Read in data ######
training = read_csv("~/Machine Learning 2/M1/buad5132-m1-training-data.csv")
test = read_csv("~/Machine Learning 2/M1/buad5132-m1-test-data.csv")
##### Preprocessing #####
### Change column classes
#Training
training$INDEX = as.factor(training$INDEX)
training$TARGET_FLAG = as.factor(training$TARGET_FLAG)
training$PARENT1 = as.factor(training$PARENT1)
training$MSTATUS = as.factor(training$MSTATUS)
training$SEX = as.factor(training$SEX)
training$EDUCATION = as.factor(ifelse(training$EDUCATION == "<High School", "Less than High School", training$EDUCATION))
training$JOB = as.factor(training$JOB)
training$CAR_USE = as.factor(training$CAR_USE)
training$CAR_TYPE = as.factor(training$CAR_TYPE)
training$RED_CAR = as.factor(training$RED_CAR)
training$REVOKED = as.factor(training$REVOKED)
training$INCOME = suppressWarnings(as.numeric(gsub("[^0-9.]", "", training$INCOME)))
training$HOME_VAL = suppressWarnings(as.numeric(gsub("[^0-9.]", "", training$HOME_VAL)))
training$OLDCLAIM = suppressWarnings(as.numeric(gsub("[^0-9.]", "", training$HOME_VAL)))
training$BLUEBOOK = suppressWarnings(as.numeric(gsub("[^0-9.]", "", training$BLUEBOOK)))
training$URBANICITY = ifelse(grepl("Urban", training$URBANICITY), "Urban", "Rural")
training$URBANICITY = as.factor(training$URBANICITY)
#Test
test$INDEX = as.factor(test$INDEX)
test$TARGET_FLAG = as.factor(test$TARGET_FLAG)
test$PARENT1 = as.factor(test$PARENT1)
test$MSTATUS = as.factor(test$MSTATUS)
test$SEX = as.factor(test$SEX)
test$EDUCATION = as.factor(ifelse(test$EDUCATION == "<High School", "Less than High School", test$EDUCATION))
test$JOB = as.factor(test$JOB)
test$CAR_USE = as.factor(test$CAR_USE)
test$CAR_TYPE = as.factor(test$CAR_TYPE)
test$RED_CAR = as.factor(test$RED_CAR)
test$REVOKED = as.factor(test$REVOKED)
test$INCOME = suppressWarnings(as.numeric(gsub("[^0-9.]", "", test$INCOME)))
test$HOME_VAL = suppressWarnings(as.numeric(gsub("[^0-9.]", "", test$HOME_VAL)))
test$OLDCLAIM = suppressWarnings(as.numeric(gsub("[^0-9.]", "", test$HOME_VAL)))
test$BLUEBOOK = suppressWarnings(as.numeric(gsub("[^0-9.]", "", test$BLUEBOOK)))
test$URBANICITY = ifelse(grepl("Urban", test$URBANICITY), "Urban", "Rural")
test$URBANICITY = as.factor(test$URBANICITY)
### Dummy variables
#Training
trainDmyParams = dummyVars(~., training[,-c(1,2)])
training.dmy = as.data.frame(predict(trainDmyParams, training[,-c(1,2)]))
training.dmy$TARGET_FLAG = training$TARGET_FLAG
names(training.dmy) = make.names(names(training.dmy))
#Test
testDmyParams = dummyVars(~., test[,-c(1,2)])
test.dmy = as.data.frame(predict(testDmyParams, test[,-c(1,2)]))
test.dmy$TARGET_FLAG = test$TARGET_FLAG
names(test.dmy) = make.names(names(test.dmy))
### Standardization and imputation
#Training
preProcessTrain = preProcess(training.dmy, method = c("center", "scale", "bagImpute"))
training.prepped = predict(preProcessTrain, training.dmy)
#Test
preProcessTest = preProcess(test.dmy, method = c("center", "scale", "bagImpute"))
test.prepped = predict(preProcessTest, test.dmy) # <--- error occurs on this line
Error in UseMethod("predict") : no applicable method for 'predict' applied to an object of class "NULL"
@@ duckmayr,我无法在评论中全部说明。 str(preProcessTest)
的完整输出太长了,无法在此处发布。我实际上是在几分钟后停止运行的,所以我不确定运行需要多长时间。波纹管我有前几行输出,但是我不知道该怎么做。此后,它将继续输出更多相同内容,以显示bagImpute
的功能。我应该注意,str(preProcessTrain)
看起来与str(preProcessTest)
非常相似,并且当进入predict
时,它可以工作。
List of 21
$ dim : int [1:2] 0 47
$ bc : NULL
$ yj : NULL
$ et : NULL
$ invHyperbolicSine: NULL
$ mean : Named num [1:46] 1.63e-01 4.50e+01 7.17e-01 1.04e+01 6.03e+04 ...
..- attr(*, "names")= chr [1:46] "KIDSDRIV" "AGE" "HOMEKIDS" "YOJ" ...
$ std : Named num [1:46] 4.87e-01 8.53 1.12 4.17 4.70e+04 ...
..- attr(*, "names")= chr [1:46] "KIDSDRIV" "AGE" "HOMEKIDS" "YOJ" ...
$ ranges : NULL
$ rotation : NULL
$ method :List of 4
..$ center : chr [1:46] "KIDSDRIV" "AGE" "HOMEKIDS" "YOJ" ...
..$ scale : chr [1:46] "KIDSDRIV" "AGE" "HOMEKIDS" "YOJ" ...
..$ bagImpute: chr [1:46] "KIDSDRIV" "AGE" "HOMEKIDS" "YOJ" ...
..$ ignore : chr "TARGET_FLAG"
$ thresh : num 0.95
$ pcaComp : NULL
$ numComp : NULL
$ ica : NULL
$ wildcards :List of 2
..$ PCA: chr(0)
..$ ICA: chr(0)
$ k : num 5
$ knnSummary :function (x, ...)
$ bagImp :List of 46
..$ KIDSDRIV :List of 2
.. ..$ var : chr "KIDSDRIV"
.. ..$ model:List of 6
.. .. ..$ y : num(0)
.. .. ..$ X : NULL
.. .. ..$ mtrees:List of 10
.. .. .. ..$ :List of 1
.. .. .. .. ..$ btree:List of 10
.. .. .. .. .. ..$ frame :'data.frame': 1 obs. of 8 variables:
.. .. .. .. .. .. ..$ var : Factor w/ 1 level "<leaf>": 1
.. .. .. .. .. .. ..$ n : int 0
.. .. .. .. .. .. ..$ wt : num 0
.. .. .. .. .. .. ..$ dev : num 0
.. .. .. .. .. .. ..$ yval : num NaN
.. .. .. .. .. .. ..$ complexity: num NaN
.. .. .. .. .. .. ..$ ncompete : int 0
.. .. .. .. .. .. ..$ nsurrogate: int 0
.. .. .. .. .. ..$ call :List of 1
.. .. .. .. .. .. ..$ na.action: NULL
.. .. .. .. .. ..$ terms :Classes 'terms', 'formula' language y ~ AGE + HOMEKIDS + YOJ + INCOME + PARENT1.No + PARENT1.Yes + HOME_VAL + MSTATUS.Yes + MSTATUS.z_No + SEX.M + SE| __truncated__ ...
.. .. .. .. .. .. .. ..- attr(*, "variables")= language list(y, AGE, HOMEKIDS, YOJ, INCOME, PARENT1.No, PARENT1.Yes, HOME_VAL, MSTATUS.Yes, MSTATUS.z_No, SEX.M, SEX.z_F,| __truncated__ ...
.. .. .. .. .. .. .. ..- attr(*, "factors")= int [1:47, 1:46] 0 1 0 0 0 0 0 0 0 0 ...
.. .. .. .. .. .. .. .. ..- attr(*, "dimnames")=List of 2
.. .. .. .. .. .. .. .. .. ..$ : chr [1:47] "y" "AGE" "HOMEKIDS" "YOJ" ...
.. .. .. .. .. .. .. .. .. ..$ : chr [1:46] "AGE" "HOMEKIDS" "YOJ" "INCOME" ...
.. .. .. .. .. .. .. ..- attr(*, "term.labels")= chr [1:46] "AGE" "HOMEKIDS" "YOJ" "INCOME" ...
.. .. .. .. .. .. .. ..- attr(*, "order")= int [1:46] 1 1 1 1 1 1 1 1 1 1 ...
.. .. .. .. .. .. .. ..- attr(*, "intercept")= int 1
.. .. .. .. .. .. .. ..- attr(*, "response")= int 1
.. .. .. .. .. .. .. ..- attr(*, ".Environment")=<environment: 0x00000000187b2ec8>
.. .. .. .. .. .. .. ..- attr(*, "predvars")= language list(y, AGE, HOMEKIDS, YOJ, INCOME, PARENT1.No, PARENT1.Yes, HOME_VAL, MSTATUS.Yes, MSTATUS.z_No, SEX.M, SEX.z_F,| __truncated__ ...
.. .. .. .. .. .. .. ..- attr(*, "dataClasses")= Named chr [1:47] "numeric" "numeric" "numeric" "numeric" ...
.. .. .. .. .. .. .. .. ..- attr(*, "names")= chr [1:47] "y" "AGE" "HOMEKIDS" "YOJ" ...
.. .. .. .. .. ..$ cptable : num [1, 1:3] NaN 0 NaN
.. .. .. .. .. .. ..- attr(*, "dimnames")=List of 2
.. .. .. .. .. .. .. ..$ : chr "1"
.. .. .. .. .. .. .. ..$ : chr [1:3] "CP" "nsplit" "rel error"
.. .. .. .. .. ..$ method : chr "anova"
.. .. .. .. .. ..$ parms : NULL
.. .. .. .. .. ..$ control :List of 9
.. .. .. .. .. .. ..$ minsplit : int 20
.. .. .. .. .. .. ..$ minbucket : num 7
.. .. .. .. .. .. ..$ cp : num 0.01
.. .. .. .. .. .. ..$ maxcompete : int 4
.. .. .. .. .. .. ..$ maxsurrogate : int 5
.. .. .. .. .. .. ..$ usesurrogate : int 2
.. .. .. .. .. .. ..$ surrogatestyle: int 0
.. .. .. .. .. .. ..$ maxdepth : int 30
.. .. .. .. .. .. ..$ xval : num 0
.. .. .. .. .. ..$ functions:List of 2