我正在尝试基于几个UCI数据集,在R语言中理解并实现单类分类器(one-class classifier),其中一个数据集是(http://archive.ics.uci.edu/ml/datasets/Chronic_Kidney_Disease)。
在尝试打印混淆矩阵时,出现了“所有参数必须具有相同长度”的错误。
我做错了什么?
# Asker's original script, reproduced as posted: it fits a one-class SVM on
# the kidney-disease CSV and then tries to tabulate predictions against labels.
library(caret)
library(dplyr)
library(e1071)
library(NLP)
library(tm)

ds = read.csv('kidney_disease.csv', header = TRUE)
# Remove unneeded columns (drops `age`; keeps only rows whose class is 'ckd')
ds <- subset(ds, select = -c(age), classification =='ckd' )

x <- subset(ds, select = -classification) # create the x variables
y <- ds$classification # create the y variable (dependent variable)

# test on the whole data set
#pred <- predict(model, subset(ds, select=-classification))

trainPositive<-x
testnegative<-y

# Row indices for a 60 % training sample (returned as a matrix, list = FALSE)
inTrain<-createDataPartition(1:nrow(trainPositive),p=0.6,list=FALSE)

# Predictors are taken from columns 1-4, labels from column 6
trainpredictors<-trainPositive[inTrain,1:4]
trainLabels<-trainPositive[inTrain,6]

testPositive<-trainPositive[-inTrain,]
# Appends the label vector `y` to the held-out predictor rows
testPosNeg<-rbind(testPositive,testnegative)

testpredictors<-testPosNeg[,1:4]
testLabels<-testPosNeg[,6]

# One-class SVM fitted on the predictors only (y = NULL)
svm.model<-svm(trainpredictors,y=NULL, type='one-classification', nu=0.10, scale=TRUE, kernel="radial")

svm.predtrain<-predict(svm.model,trainpredictors)
svm.predtest<-predict(svm.model,testpredictors)

# confusionMatrixTable<-table(Predicted=svm.pred,Reference=testLabels)
# confusionMatrix(confusionMatrixTable,positive='TRUE')

# Cross-tabulate predictions against the labels for each partition
confTrain <- table(Predicted=svm.predtrain,Reference=trainLabels)
confTest <- table(Predicted=svm.predtest,Reference=testLabels)

confusionMatrix(confTest,positive='TRUE')

print(confTrain)
print(confTest)

#grid
这是我使用的数据集的前几行:
id bp sg al su rbc pc pcc ba bgr bu sc sod pot hemo pcv wc1 0 80 1.020 1 0 normal notpresent notpresent 121 36 1.2 NA NA 15.4 44 78002 1 50 1.020 4 0 normal notpresent notpresent NA 18 0.8 NA NA 11.3 38 60003 2 80 1.010 2 3 normal normal notpresent notpresent 423 53 1.8 NA NA 9.6 31 75004 3 70 1.005 4 0 normal abnormal present notpresent 117 56 3.8 111 2.5 11.2 32 67005 4 80 1.010 2 0 normal normal notpresent notpresent 106 26 1.4 NA NA 11.6 35 73006 5 90 1.015 3 0 notpresent notpresent 74 25 1.1 142 3.2 12.2 39 7800 rc htn dm cad appet pe ane classification1 5.2 yes yes no good no no ckd2 no no no good no no ckd3 no yes no poor no yes ckd4 3.9 yes no no poor yes yes ckd5 4.6 no no no good no no ckd6 4.4 yes yes no good yes no ckd
错误日志:
> confTrain <- table(Predicted = svm.predtrain, Reference = trainLabels)
table(Predicted = svm.predtrain, Reference = trainLabels) 出错:所有参数必须具有相同长度
> confTest <- table(Predicted = svm.predtest, Reference = testLabels)
table(Predicted = svm.predtest, Reference = testLabels) 出错:所有参数必须具有相同长度
> confusionMatrix(confTest, positive = 'TRUE')
confusionMatrix(confTest, positive = "TRUE") 出错:找不到对象 'confTest'
> print(confTrain)
print(confTrain) 出错:找不到对象 'confTrain'
> print(confTest)
print(confTest) 出错:找不到对象 'confTest'
回答:
我看到了不少问题。首先,你的很多数据看起来是字符类型,而不是分类器所需的数值类型。我们先选出几列并把它们转换为数值类型。我将使用 `data.table`,因为 `fread` 非常方便。
library(caret)
library(e1071)
library(data.table)

# Convert `ds` to a data.table
setDT(ds)

# Choose the columns to coerce
mycols <- c("id", "bp", "sg", "al", "su")

# Convert those columns to numeric
ds[, (mycols) := lapply(.SD, as.numeric), .SDcols = mycols]

# Keep four predictors and recode the class as logical (TRUE when "ckd")
data <- ds[, .(bp, sg, al, su, classification = ds$classification == "ckd")]
data
#>       bp    sg al su classification
#>   1:  80 1.020  1  0           TRUE
#>   2:  50 1.020  4  0           TRUE
#>   3:  80 1.010  2  3           TRUE
#>   4:  70 1.005  4  0           TRUE
#>   5:  80 1.010  2  0           TRUE
#>  ---
#> 396:  80 1.020  0  0          FALSE
#> 397:  70 1.025  0  0          FALSE
#> 398:  80 1.020  0  0          FALSE
#> 399:  60 1.025  0  0          FALSE
#> 400:  80 1.025  0  0          FALSE
数据清理完成后,你可以像原始代码中那样,使用 `createDataPartition` 抽样得到训练集和测试集。
# Sample the data into a training set (60 % of rows) and a test set (the rest).
# The row indices, not the class labels, are what is passed as the outcome to
# partition on. seq_len() is used so an empty table yields an empty index
# vector instead of c(1, 0).
inTrain <- createDataPartition(seq_len(nrow(data)), p = 0.6, list = FALSE)

# list = FALSE returns a one-column matrix; flatten it to a plain vector
# before using it to subset.
train_rows <- inTrain[, 1]
train <- data[train_rows, ]
test <- data[-train_rows, ]
然后我们可以创建模型并进行预测。
# Fit a one-class SVM (radial kernel, nu = 0.10, scaled inputs) on the
# training rows using the four numeric predictors.
svm.model <- svm(
  classification ~ bp + sg + al + su,
  data = train,
  type = "one-classification",
  nu = 0.10,
  scale = TRUE,
  kernel = "radial"
)

# Make predictions for both partitions
svm.predtrain <- predict(svm.model, train)
svm.predtest <- predict(svm.model, test)
你的交叉表的主要问题在于:模型只能对不含 `NA` 的样本进行预测,因此必须对分类标签取子集,只保留有预测结果的那些样本。然后就可以计算 `confusionMatrix`:
# The names of each prediction vector are used as row numbers to pick out the
# matching reference labels, so both arguments to table() have equal length.
train_ref <- train$classification[as.integer(names(svm.predtrain))]
test_ref <- test$classification[as.integer(names(svm.predtest))]

confTrain <- table(Predicted = svm.predtrain, Reference = train_ref)
confTest <- table(Predicted = svm.predtest, Reference = test_ref)

confusionMatrix(confTest, positive = "TRUE")
#> Confusion Matrix and Statistics
#>
#>          Reference
#> Predicted FALSE TRUE
#>     FALSE     0   17
#>     TRUE     55   64
#>
#>                Accuracy : 0.4706
#>                  95% CI : (0.3845, 0.558)
#>     No Information Rate : 0.5956
#>     P-Value [Acc > NIR] : 0.9988
#>
#>                   Kappa : -0.2361
#>
#>  Mcnemar's Test P-Value : 1.298e-05
#>
#>             Sensitivity : 0.7901
#>             Specificity : 0.0000
#>          Pos Pred Value : 0.5378
#>          Neg Pred Value : 0.0000
#>              Prevalence : 0.5956
#>          Detection Rate : 0.4706
#>    Detection Prevalence : 0.8750
#>       Balanced Accuracy : 0.3951
#>
#>        'Positive' Class : TRUE
数据
library(archive)
library(data.table)

# Download the data file to a temporary .rar. mode = "wb" forces a binary
# transfer so the archive is not corrupted by text-mode writes on Windows.
tf1 <- tempfile(fileext = ".rar")
download.file(
  "http://archive.ics.uci.edu/ml/machine-learning-databases/00336/Chronic_Kidney_Disease.rar",
  tf1,
  mode = "wb"
)

# Extract the archive into a second temporary location
tf2 <- tempfile()
archive_extract(tf1, tf2)

# Read the data. A character `skip` makes fread() start at the first line
# containing that text; presumably "48" marks the first data row after the
# ARFF header (TODO confirm against the file).
arff_path <- file.path(
  tf2, "Chronic_Kidney_Disease", "chronic_kidney_disease.arff"
)
ds <- fread(arff_path, fill = TRUE, skip = "48")

# Remove the erroneous last column
ds[, V26 := NULL]

# Set the column names (taken from the header).
# NOTE(review): the first ARFF attribute may actually be age rather than an
# id; the name "id" is kept because the later snippets refer to it - confirm.
setnames(
  ds,
  c(
    "id", "bp", "sg", "al", "su", "rbc", "pc", "pcc", "ba", "bgr", "bu", "sc",
    "sod", "pot", "hemo", "pcv", "wc", "rc", "htn", "dm", "cad", "appet", "pe",
    "ane", "classification"
  )
)

# Replace "?" with NA
ds[ds == "?"] <- NA