我有以下代码:
library(mlbench)
library(caret)
library(ggplot2)

set.seed(998)

# Prepare data ----
data(Sonar)
my_data <- Sonar

# Define cross-validation ----
# savePredictions = "final" keeps hold-out predictions only for the selected
# tuning parameters. A logical TRUE is treated as "all", which would store the
# predictions of every mtry candidate and mix them into a single ROC curve.
fitControl <- trainControl(
  method = "cv",
  number = 10,
  classProbs = TRUE,
  savePredictions = "final",
  summaryFunction = twoClassSummary
)

# Train a random forest, selecting mtry by cross-validated ROC ----
model <- train(
  Class ~ .,
  data = my_data,
  method = "rf",
  trControl = fitControl,
  metric = "ROC"
)

# Observed class and predicted probability of class "R" for each held-out row.
for_lift <- data.frame(
  Class = model$pred$obs,
  rf = model$pred$R
)
lift_obj <- lift(Class ~ rf, data = for_lift, class = "R")

# Plot the ROC curve (all folds pooled together) ----
ggplot(lift_obj$data) +
  geom_line(aes(1 - Sp, Sn, color = liftModelVar)) +
  scale_color_discrete(guide = guide_legend(title = "method"))
它生成的图像是这样的。
请注意,我在进行10折交叉验证。生成的ROC曲线只是针对最终的平均值。
我想做的,是为10折交叉验证中的每一折各生成一条ROC曲线(共10条)。我该如何实现这一点?
回答:
library(mlbench)
library(caret)
library(ggplot2)

set.seed(998)

# Prepare data ----
data(Sonar)
my_data <- Sonar

# Define cross-validation ----
# savePredictions = "final" keeps hold-out predictions only for the selected
# tuning parameters. A logical TRUE is treated as "all", which would store the
# predictions of every mtry candidate and mix them into each fold's curve.
fitControl <- trainControl(
  method = "cv",
  number = 10,
  classProbs = TRUE,
  savePredictions = "final",
  summaryFunction = twoClassSummary
)

# Train a random forest, selecting mtry by cross-validated ROC ----
model <- train(
  Class ~ .,
  data = my_data,
  method = "rf",
  trControl = fitControl,
  metric = "ROC"
)

# Observed class, predicted probability of class "R", and the fold each
# held-out row belongs to.
for_lift <- data.frame(
  Class = model$pred$obs,
  rf = model$pred$R,
  resample = model$pred$Resample
)

# Build one lift/ROC table per fold, tag it with its fold label, and bind the
# pieces once at the end instead of growing a data frame inside a loop.
by_fold <- split(for_lift, for_lift$resample)
fold_curves <- Map(
  function(fold_df, fold) {
    curve <- lift(Class ~ rf, data = fold_df, class = "R")$data
    curve$fold <- fold
    curve
  },
  by_fold,
  names(by_fold)
)
lift_df <- do.call(rbind, fold_curves)

# Pooled curve over all folds; not used by the plot below.
lift_obj <- lift(Class ~ rf, data = for_lift, class = "R")

# Plot one ROC curve per fold ----
ggplot(lift_df) +
  geom_line(aes(1 - Sp, Sn, color = fold)) +
  scale_color_discrete(guide = guide_legend(title = "Fold"))
计算每一折的准确率(Accuracy):
library(plyr)
library(MLmetrics)

# Relies on `my_data` and `fitControl` defined earlier in the script.
model <- train(
  Class ~ .,
  data = my_data,
  method = "rf",
  trControl = fitControl,
  metric = "ROC"
)

# model$pred can contain hold-out predictions for every tuning candidate.
# Keep only the rows that match the selected parameters so each fold's
# accuracy describes the final model instead of a mix of all mtry values.
# Per-fold numbers will differ from a run that pooled all candidates.
best_pred <- merge(model$pred, model$bestTune)

# Accuracy of the hold-out predictions, one row per fold.
ddply(best_pred, "Resample", summarise, accuracy = Accuracy(pred, obs))
输出:
   Resample  accuracy
1    Fold01 0.8253968
2    Fold02 0.8095238
3    Fold03 0.8000000
4    Fold04 0.8253968
5    Fold05 0.8095238
6    Fold06 0.8253968
7    Fold07 0.8333333
8    Fold08 0.8253968
9    Fold09 0.9841270
10   Fold10 0.7936508