早安
我运行了带有调整的“随机森林”,并将预测添加到“火车”数据中,该数据运行得很好且没有问题。但是,当我尝试在Test数据集上运行随机森林模型时,出现上述错误。关于什么可能导致此的任何想法是我的代码。感谢任何帮助。训练数据集确实有3500行,而测试数据集有1500行,因为数据集由5000行组成。
R代码:
####Clearing the global environmnent rm(list = ls()) ##Setting the working directory setwd("D:/Great Learning/Module 3 -Machine Learning/Project") ##Packages required to be loaded install.packages("DataExplorer") install.packages("xlsx") ##install.packages("magrittr") install.packages("dplyr") install.packages("tidyverse") install.packages("mice") install.packages("NbClust") ##Reading in the dataset library(xlsx) LoanModelRaw = read.xlsx("Thera Bank_Personal_Loan_Modelling-dataset- 1.xlsx",sheetName = "Bank_Personal_Loan_Modelling",header = T) ##LoanModelRaw = read.csv("Thera Bank_Personal_Loan_Modelling-dataset-1.csv", sep = ";",header = T) ##Viewing the dataset in R View(LoanModelRaw) dim(LoanModelRaw) colnames(LoanModelRaw) str(LoanModelRaw) summary(LoanModelRaw) nrow(LoanModelRaw) attach(LoanModelRaw) #Correcting column names names(LoanModelRaw)[2] = "AgeInYears" names(LoanModelRaw)[3] = "ExperienceInYears" names(LoanModelRaw)[4] = "IncomeInKMonth" names(LoanModelRaw)[5] = "ZIPCode" names(LoanModelRaw)[6] = "FamilyMembers" names(LoanModelRaw)[10] = "PersonalLoan" names(LoanModelRaw)[11] = "SecuritiesAccount" names(LoanModelRaw)[12] = "CDAccount" colnames(LoanModelRaw) #############################################################1 EDA of the data####################################################### library(DataExplorer) ##introduce(LoanModelRaw) plot_intro(LoanModelRaw) plot_missing(LoanModelRaw) ##plot_bar(LoanModelRaw) plot_histogram(LoanModelRaw) create_report(LoanModelRaw) ?plot_boxplot #Missing Value Treatment library(mice) sum(is.na(LoanModelRaw)) md.pattern(LoanModelRaw) LoanModelRawImpute = mice(LoanModelRaw, m =5, method = 'pmm', seed = 1000) LoanModelRawNoNa = complete(LoanModelRawImpute, 3) md.pattern(LoanModelRawNoNa) #Correcting negative experience LoanModel = abs(LoanModelRawNoNa[2:14]) attach(LoanModel) #View(LoanModel) #summary(LoanModel) #nrow(LoanModel)
#LoanModel$Split = sample.split(LoanModel$PersonalLoan, SplitRatio = 0.7) View(LoanModel) LoanModelTrainRaw = subset(LoanModel,LoanModel$Split == TRUE) LoanModelTestRaw = subset(LoanModel,LoanModel$Split == FALSE) #Installing the packages for the running random forest install.packages("randomForest") install.packages("dplyr") library(randomForest) library(dplyr) attach(LoanModelTrain) str(LoanModelTrain) #Need to exclude the split and move columns LoanModelTrain = LoanModelTrainRaw[1:13] LoanModelTest = LoanModelTestRaw[1:13] LoanModelTrain = LoanModelTrain %>% select(IncomeInKMonth,Mortgage,ZIPCode,CCAvg,everything()) LoanModelTest = LoanModelTest %>% select(IncomeInKMonth,Mortgage,ZIPCode,CCAvg, everything()) head(LoanModelTrain) head(LoanModelTest) ###Converting the data set to a factor variable in order to be read #Train fcol = c(5:13) LoanModelTrain[,fcol] = lapply(LoanModelTrain[,fcol], factor) str(LoanModelTrain) nrow(LoanModelTrain) #Test fcol = c(5:13) LoanModelTest[,fcol] = lapply(LoanModelTest[,fcol], factor) str(LoanModelTest) ##Running the random forest seed = 1000 set.seed(seed) LoanModelTrainRF = randomForest(PersonalLoan ~ ., data = LoanModelTrain, ntree = 501, mtry = 10, nodesize = 10, importance = TRUE, do.trace = TRUE) print(LoanModelTrainRF) plot(LoanModelTrainRF) importance(LoanModelTrainRF) ?randomForest ###Tuning the random Forest set.seed(seed) LoanModelTrain = LoanModelTrain %>% select(PersonalLoan,everything()) str(LoanModelTrain) LoanModelTrainRFTuned = tuneRF(x = LoanModelTrain[,-c(1)], y = PersonalLoan, mtryStart = 10, stepFactor = 1.5, improve = 0.001, trace = TRUE, plot = TRUE, doBest = TRUE, importance = TRUE) ###Running refined random forest LoanModelTrainRefinedRF = randomForest(PersonalLoan ~ ., data = LoanModelTrain, ntree = 95, mtry = 10, nodesize = 10, importance = TRUE, do.trace = TRUE) print(LoanModelTrainRefinedRF) plot(LoanModelTrainRefinedRF) ###Adding the prediction columns and probability columns LoanModelTrain$Predict = predict(LoanModelTrainRefinedRF,data= LoanModelTrain, type = "class") LoanModelTrain$Score = predict(LoanModelTrainRefinedRF,data= LoanModelTrain, type = "prob") head(LoanModelTrain) ###Check the accuracy of the model install.packages("caret") library(caret) caret::confusionMatrix(LoanModelTrain$PersonalLoan, LoanModelTrain$Predict) ###Run the model against the Test Data str(LoanModelTest) LoanModelTest$Predict = predict(LoanModelTrainRefinedRF,data= LoanModelTest, type = "class") LoanModelTest$Score = predict(LoanModelTrainRefinedRF,data= LoanModelTest, type = "prob")
[Good Day,我运行了一个带有调整的随机森林,并将预测添加到Train数据中,该数据运行得很好,没有问题。但是,当我尝试在...
此错误意味着您尝试将长度为3500的列向量附加到具有1500行的矩阵中。当然,这不起作用,因为R不会自动为空行创建ǸA
(这是一件好事)。