My dataset looks like this:
```
ID      885038 885039 885040 885041 885042 885043 885044 Class
1267359      2      0      0      0      0      1      0     0
1295720      0      0      0      0      0      1      0     0
1295721      0      0      0      0      0      1      0     0
1295723      0      0      0      0      0      1      0     0
1295724      0      0      0      1      0      1      0     0
1295725      0      0      0      1      0      1      0     0
1295726      2      0      0      0      0      1      0     1
1295727      2      0      0      0      0      1      0     1
1295740      0      0      0      0      0      1      0     1
1295742      0      0      0      0      0      1      0     1
1295744      0      0      0      0      0      1      0     1
1295745      0      0      0      0      0      1      0     1
1295746      0      0      0      0      0      1      0     1
```
To perform recursive feature elimination (RFE), I follow these steps:
- Train the SVM classifier
- Compute the ranking criterion for all features (see the sketch after this list)
- Remove the feature with the smallest ranking value
- Go back to step 1
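For a linear SVM, the ranking criterion is the squared weight of each feature, where the weight vector is recovered from the support vectors. As a minimal sketch of one elimination step, assuming a fitted `e1071` linear SVM called `fit` and a feature data frame `X` (both illustrative names, not from the code below):

```r
# One elimination step for a linear SVM (e1071): recover the weight
# vector from the support vectors, square it, and drop the feature
# with the smallest squared weight.
w <- t(fit$coefs) %*% fit$SV                        # 1 x p weight vector
criterion <- w * w                                  # ranking criterion w_i^2
worst <- colnames(criterion)[which.min(criterion)]  # lowest-ranked feature
X <- X[, setdiff(colnames(X), worst), drop = FALSE] # remove it
```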
Below is the R code I wrote to implement the process above. It shows no errors, and the loop keeps running, iterating over the number of columns in the dataset.
```r
library(caret) # createDataPartition()
library(e1071) # svm()
library(ROCR)  # prediction(), performance()

data <- read.csv("dummy - Copy.csv", header = TRUE)
rownames(data) <- data[, 1]
data <- data[, -1]

for (k in 1:length(data)) {
  inTraining <- createDataPartition(data$Class, p = .70, list = FALSE)
  training <- data[inTraining, ]
  testing <- data[-inTraining, ]

  ## Building the model ####
  svm.model <- svm(Class ~ ., data = training, cross = 10, metric = "ROC",
                   type = "eps-regression", kernel = "linear",
                   na.action = na.omit, probability = TRUE)

  ###### auc measure #######
  # prediction and ROC
  svm.model$index
  svm.pred <- predict(svm.model, testing, probability = TRUE)

  # calculating auc
  c <- as.numeric(svm.pred)
  c <- c - 1
  pred <- prediction(c, testing$Class)
  perf <- performance(pred, "tpr", "fpr")
  plot(perf, fpr.stop = 0.1)
  auc <- performance(pred, measure = "auc")
  auc <- auc@y.values[[1]]

  # compute the weight vector
  w <- t(svm.model$coefs) %*% svm.model$SV

  # compute ranking criteria
  weight_matrix <- w * w

  # rank the features
  w_transpose <- t(weight_matrix)
  w2 <- as.matrix(w_transpose[order(w_transpose[, 1], decreasing = FALSE), ])
  a <- as.matrix(w2[which(w2 == min(w2)), ]) # the rows with minimum values
  remove <- row.names(a)
  data <- data[, setdiff(colnames(data), remove)]

  print(length(data))
  length <- length(data)
  cols_names <- colnames(data)
  print(auc)
  output <- paste(length, auc, sep = ";")
  write(output, file = "output.txt", append = TRUE)
  write(cols_names, file = paste(length, "cols_selected", ".txt", sep = ""))
}
```
The printed output looks like this:
```
[1] 3
[1] 0.5
[1] 2
[1] 0.5
[1] 2
[1] 0.5
[1] 2
[1] 0.75
[1] 2
[1] 1
[1] 2
[1] 0.75
[1] 2
[1] 0.5
[1] 2
[1] 0.75
```
However, when I take any one of the selected feature subsets, e.g. a subset of 3 features, and rebuild the SVM model using the same code as above (without the loop), I do not get the same AUC value:
```r
data <- read.csv("3.csv", header = TRUE)
rownames(data) <- data[, 1]
data <- data[, -1]

inTraining <- createDataPartition(data$Class, p = .70, list = FALSE)
training <- data[inTraining, ]
testing <- data[-inTraining, ]

## Building the model ####
svm.model <- svm(Class ~ ., data = training, cross = 10, metric = "ROC",
                 type = "eps-regression", kernel = "linear",
                 na.action = na.omit, probability = TRUE)

###### auc measure #######
# prediction and ROC
svm.model$index
svm.pred <- predict(svm.model, testing, probability = TRUE)

# calculating auc
c <- as.numeric(svm.pred)
c <- c - 1
pred <- prediction(c, testing$Class)
perf <- performance(pred, "tpr", "fpr")
plot(perf, fpr.stop = 0.1)
auc <- performance(pred, measure = "auc")
auc <- auc@y.values[[1]]
print(auc)
```

This prints:

```
[1] 3
[1] 0.75
```

(instead of 0.5)
The two pieces of code are identical (one with the elimination loop, one without), yet for the same feature subset the AUC values differ.
The 3 columns (885041, 885043, and Class) are the same in both runs, yet they give different AUC values.
Answer:
I think cross-validation alone would be enough here. In your code you already test the error with 10-fold cross-validation; there seems to be no need to split the dataset again.
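Since you pass `cross = 10`, the fitted `e1071` object already carries those cross-validation results; for `type = "eps-regression"` they are per-fold mean squared errors. A quick way to inspect them, using the question's `svm.model`:

```r
# With cross = 10, e1071::svm() stores the 10-fold CV results on the
# fitted object; for eps-regression these are mean squared errors.
svm.model$MSE      # MSE of each of the 10 folds
svm.model$tot.MSE  # overall cross-validated MSE
summary(svm.model) # also prints the cross-validation results
```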
Since you don't mention parameter tuning, `cost` and `gamma` are left at their default values.
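If you did want to tune them, `e1071::tune()` does a grid search with 10-fold cross-validation by default. A minimal sketch over a small cost grid, reusing the question's `training` data (with a linear kernel, `gamma` has no effect, so only `cost` is searched):

```r
# Grid search over cost; tune() cross-validates (10-fold by default).
# Extra arguments (kernel, type, ...) are passed through to svm().
tuned <- tune(svm, Class ~ ., data = training,
              ranges = list(cost = 2^(-3:3)),
              kernel = "linear", type = "eps-regression")
tuned$best.parameters # the winning cost value
tuned$best.model      # the svm refitted with it
```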
```r
library(tidyverse)
library(e1071)
library(caret)
library(ROCR)
library(foreach)
```
The feature names are numeric, and it looks like `svm()` changes them during fitting. To be able to match them afterwards, I rename the columns first.
Second, `caret::createFolds()` can be used to assign fold ids, instead of `createDataPartition()`.
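The difference matters: `createDataPartition()` draws a single train/test split, while `createFolds(..., list = FALSE)` returns one fold id per row, so every observation serves as a test case exactly once. For example:

```r
# Assign 10 observations to 5 folds; the result is an integer vector
# of fold ids 1..5, each appearing twice (folds are balanced).
set.seed(1)
fold <- createFolds(1:10, k = 5, list = FALSE)
fold
```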
```r
set.seed(1)
k <- 5 # 5-fold CV
mydf3 <-
  mydf %>%
  rename_at(.vars = vars(-ID, -Class),
            .funs = function(x) str_c("X.", x, ".")) %>%
  mutate(fold = createFolds(1:n(), k = k, list = FALSE)) # fold id column

# the number of features -------------------------------
x_num <-
  mydf3 %>%
  select(-ID, -Class, -fold) %>%
  ncol()
```
For the iteration itself, `foreach()` can be another option.
```r
cl <- parallel::makeCluster(2)
doParallel::registerDoParallel(cl, cores = 2)
parallel::clusterExport(cl, c("mydf3", "x_num"))
parallel::clusterEvalQ(cl, c(library(tidyverse), library(ROCR)))
#---------------------------------------------------------------
svm_rank <-
  foreach(j = seq_len(x_num), .combine = rbind) %do% {
    mod <-
      foreach(cv = 1:k, .combine = bind_rows, .inorder = FALSE) %dopar% { # parallelization
        tr <-
          mydf3 %>%
          filter(fold != cv) %>% # train
          select(-fold, -ID) %>%
          e1071::svm( # fitting svm
            Class ~ .,
            data = .,
            kernel = "linear",
            type = "eps-regression",
            probability = TRUE,
            na.action = na.omit
          )
        # auc
        te <-
          mydf3 %>%
          filter(fold == cv) %>%
          predict(tr, newdata = ., probability = TRUE)
        predob <- prediction(te, mydf3 %>% filter(fold == cv) %>% select(Class))
        auc <- performance(predob, measure = "auc")@y.values[[1]]
        # ranking - your formula
        w <- t(tr$coefs) %*% tr$SV
        if (is.null(names(w))) colnames(w) <- attr(tr$terms, "term.labels") # when only one feature left
        (w * w) %>%
          tbl_df() %>%
          mutate(auc = auc)
      }
    auc <- mean(mod %>% select(auc) %>% pull()) # aggregate cv auc
    w_mat <- colMeans(mod %>% select(-auc))     # aggregate cv ranking
    remove <- names(which.min(w_mat))           # minimum rank
    used <-
      mydf3 %>%
      select(-ID, -Class, -fold) %>%
      names() %>%
      str_c(collapse = " & ")
    mydf3 <- mydf3 %>% select(-remove) # remove feature for next step
    tibble(used = used, delete = remove, auc = auc)
  }
#---------------------------------------------------
parallel::stopCluster(cl)
```
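Note the loop structure: the outer `foreach()` runs with `%do%` (sequentially), because each elimination step depends on the feature removed in the previous one, while the inner cross-validation loop uses `%dopar%` so the `k` fold fits run in parallel.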
At each step you get:
```
svm_rank
#> # A tibble: 7 x 3
#>   used                                                      delete     auc
#>   <chr>                                                     <chr>    <dbl>
#> 1 X.885038. & X.885039. & X.885040. & X.885041. & X.885042… X.88503…   0.7
#> 2 X.885038. & X.885040. & X.885041. & X.885042. & X.885043… X.88504…   0.7
#> 3 X.885038. & X.885041. & X.885042. & X.885043. & X.885044. X.88504…   0.7
#> 4 X.885038. & X.885041. & X.885043. & X.885044.             X.88504…   0.7
#> 5 X.885038. & X.885041. & X.885043.                         X.88504…   0.7
#> 6 X.885038. & X.885041.                                     X.88503…   0.7
#> 7 X.885041.                                                 X.88504…   0.7
```
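Here `used` lists the features still in the model at each step, `delete` is the feature eliminated at that step (truncated in the printout), and `auc` is the AUC averaged over the `k` cross-validation folds.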