概述:
我有一个名为 'FID' 的数据框(data frame),我正在尝试按照下面的教程来生成三个模型:(1) 袋装树;(2) 随机森林;以及 (3) 提升树。
教程:
https://bcullen.rbind.io/post/2020-06-02-tidymodels-decision-tree-learning-in-r/
问题
当我尝试运行 `fit_bag`、`fit_rf` 和 `fit_boost` 这三个模型时,我遇到了下面的错误信息。我认为问题可能出在预处理阶段。
有谁能帮忙提供解决这个问题的建议吗?
提前感谢。
错误 – 未选择任何变量
i Fold01: recipe
x Fold01: recipe: 错误:未选择任何变量或项。
i Fold02: recipe
x Fold02: recipe: 错误:未选择任何变量或项。
i Fold03: recipe
x Fold03: recipe: 错误:未选择任何变量或项。
i Fold04: recipe
x Fold04: recipe: 错误:未选择任何变量或项。
i Fold05: recipe
x Fold05: recipe: 错误:未选择任何变量或项。
i Fold06: recipe
x Fold06: recipe: 错误:未选择任何变量或项。
i Fold07: recipe
x Fold07: recipe: 错误:未选择任何变量或项。
i Fold08: recipe
x Fold08: recipe: 错误:未选择任何变量或项。
i Fold09: recipe
x Fold09: recipe: 错误:未选择任何变量或项。
i Fold10: recipe
x Fold10: recipe: 错误:未选择任何变量或项。
警告信息:
所有模型在 [fit_resamples()] 中失败。请查看 `.notes` 列。
R代码
## Load packages
library(tidymodels)
library(tidyverse) # data wrangling
library(skimr)     # data visualisation
library(baguette)  # bagged trees
library(future)    # parallel processing and reduced computation time
library(xgboost)   # boosted trees

# Split the data
split <- initial_split(Tidmodel_df)

# Extract the training data
train <- training(split)

# Resample the training data with 10-fold cross-validation (10 is the default)
cv <- vfold_cv(train)

## Preprocessing
rec <- recipe(Frequency ~ ., data = train) %>%
  # declare ID variables
  update_role(
    contains("id"), Year, Month, Monsoon, Days,
    new_role = "id vars"
  ) %>%
  # remove zero-variance variables
  step_nzv(all_predictors(), freq_cut = 0, unique_cut = 0) %>%
  # prepare test data to handle previously unseen factor levels
  step_novel(all_nominal()) %>%
  # replace missing numeric observations with the median
  step_medianimpute(
    all_numeric(), -all_outcomes(), -has_role("id vars")
  ) %>%
  # dummy-code categorical variables
  step_dummy(all_nominal(), -has_role("id vars"))

###########################################################
## Create the models
###########################################################

##### Bagged trees
mod_bag <- bag_tree() %>%
  set_mode("regression") %>%
  set_engine("rpart", times = 10) # 10 bootstrap resamples

## Create the workflow
wflow_bag <- workflow() %>%
  add_recipe(rec) %>%
  add_model(mod_bag)

## Fit the model
plan(multisession)

fit_bag <- fit_resamples(
  wflow_bag,
  cv,
  metrics = metric_set(rmse, rsq),
  control = control_resamples(
    verbose = TRUE,
    save_pred = TRUE,
    extract = function(x) extract_model(x)
  )
)

## Random forest
mod_rf <- rand_forest() %>%
  set_engine(
    "ranger",
    num.threads = parallel::detectCores(),
    importance = "permutation",
    verbose = TRUE
  ) %>%
  set_mode("regression") %>%
  set_args(trees = 1000)

## Create the workflow
wflow_rf <- workflow() %>%
  add_model(mod_rf) %>%
  add_recipe(rec)

## Fit the model
plan(multisession)

fit_rf <- fit_resamples(
  wflow_rf,
  cv,
  metrics = metric_set(rmse, rsq),
  control = control_resamples(
    verbose = TRUE,
    save_pred = TRUE,
    extract = function(x) x
  )
)

## Boosted trees
mod_boost <- boost_tree() %>%
  set_engine("xgboost", nthreads = parallel::detectCores()) %>%
  set_mode("regression")

## Create the workflow
wflow_boost <- workflow() %>%
  add_recipe(rec) %>%
  add_model(mod_boost)

## Fit the model
plan(multisession)

fit_boost <- fit_resamples(
  wflow_boost,
  cv,
  metrics = metric_set(rmse, rsq),
  control = control_resamples(verbose = TRUE, save_pred = TRUE)
)
数据框 – FID
structure(list(Year = c(2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017), Month = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L), .Label = c("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"), class = "factor"), Monsoon = structure(c(2L, 2L, 1L, 1L, 4L, 4L, 4L, 4L, 4L, 3L, 3L, 2L, 2L, 2L, 1L, 1L, 4L, 4L, 4L, 4L, 4L, 3L, 3L, 2L, 2L, 2L, 1L, 1L, 4L, 4L, 4L, 4L, 4L, 3L, 3L, 2L), .Label = c("First_Inter_Monssoon", "North_Monsoon", "Second_Inter_Monsoon", "South_Monsson"), class = "factor"), Frequency = c(36, 28, 39, 46, 5, 0, 0, 22, 10, 15, 8, 33, 33, 29, 31, 23, 8, 9, 7, 40, 41, 41, 30, 30, 44, 37, 41, 42, 20, 0, 7, 27, 35, 27, 43, 38), Days = c(31, 28, 31, 30, 6, 0, 0, 29, 15, 29, 29, 31, 31, 29, 30, 30, 7, 0, 7, 30, 30, 31, 30, 27, 31, 28, 30, 30, 21, 0, 7, 26, 29, 27, 29, 29)), row.names = c(NA, -36L), class = "data.frame")
回答:
这里的问题在于:当你使用 `update_role(contains("id"), Year, Month, Monsoon, Days, new_role = "id vars")` 时,你把 `Year`、`Month`、`Monsoon`、`Days` 这些变量的角色全部更新成了 `"id vars"`,于是它们就不再是预测变量了。由于数据中除了结果变量 `Frequency` 之外只有这四个变量,当配方进行到下一个预处理步骤时,它发现根本没有预测变量可供选择。
如果你想把这些变量用作预测变量,就保持它们的默认角色不变,不要把它们改成 `"id vars"` 之类的其他角色:
library(tidymodels)
library(baguette) # bagged trees

# Example data: 36 monthly observations (2015-2017) with the columns
# Year, Month, Monsoon, Frequency (the outcome) and Days.
fid_df <- structure(
  list(
    Year = c(
      2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015,
      2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016,
      2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017
    ),
    Month = structure(
      c(
        1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L,
        1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L,
        1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L
      ),
      .Label = c(
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December"
      ),
      class = "factor"
    ),
    Monsoon = structure(
      c(
        2L, 2L, 1L, 1L, 4L, 4L, 4L, 4L, 4L, 3L, 3L, 2L,
        2L, 2L, 1L, 1L, 4L, 4L, 4L, 4L, 4L, 3L, 3L, 2L,
        2L, 2L, 1L, 1L, 4L, 4L, 4L, 4L, 4L, 3L, 3L, 2L
      ),
      .Label = c(
        "First_Inter_Monssoon", "North_Monsoon",
        "Second_Inter_Monsoon", "South_Monsson"
      ),
      class = "factor"
    ),
    Frequency = c(
      36, 28, 39, 46, 5, 0, 0, 22, 10, 15, 8, 33,
      33, 29, 31, 23, 8, 9, 7, 40, 41, 41, 30, 30,
      44, 37, 41, 42, 20, 0, 7, 27, 35, 27, 43, 38
    ),
    Days = c(
      31, 28, 31, 30, 6, 0, 0, 29, 15, 29, 29, 31,
      31, 29, 30, 30, 7, 0, 7, 30, 30, 31, 30, 27,
      31, 28, 30, 30, 21, 0, 7, 26, 29, 27, 29, 29
    )
  ),
  row.names = c(NA, -36L),
  class = "data.frame"
)

# Split the data
fid_split <- initial_split(fid_df)

# Extract the training data
fid_train <- training(fid_split)

# Resample the training data with 10-fold cross-validation (10 is the default)
cv <- vfold_cv(fid_train)

## Preprocessing
# No update_role() call here: Year, Month, Monsoon and Days keep their default
# predictor role, so the steps below have variables to select.
rec <- recipe(Frequency ~ ., data = fid_df) %>%
  # remove zero-variance variables
  step_nzv(all_predictors(), freq_cut = 0, unique_cut = 0) %>%
  # prepare test data to handle previously unseen factor levels
  step_novel(all_nominal()) %>%
  # replace missing numeric observations with the median
  # NOTE(review): newer recipes releases rename this step to
  # step_impute_median() -- confirm against the installed version.
  step_medianimpute(all_numeric(), -all_outcomes(), -has_role("id vars")) %>%
  # dummy-code categorical variables
  step_dummy(all_nominal(), -has_role("id vars"))

rf_spec <- rand_forest(trees = 1e3) %>%
  set_engine("ranger", importance = "permutation") %>%
  set_mode("regression")

wflow_rf <- workflow() %>%
  add_model(rf_spec) %>%
  add_recipe(rec)

fit_resamples(
  wflow_rf,
  cv,
  metrics = metric_set(rmse, rsq),
  control = control_resamples(save_pred = TRUE)
)
#> 
#> 正在附加包: 'rlang'
#> 下面的对象被 'package:purrr' 遮蔽:
#> 
#>     %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
#>     flatten_lgl, flatten_raw, invoke, list_along, modify, prepend,
#>     splice
#> 
#> 正在附加包: 'vctrs'
#> 下面的对象被 'package:tibble' 遮蔽:
#> 
#>     data_frame
#> 下面的对象被 'package:dplyr' 遮蔽:
#> 
#>     data_frame
#> # 重新采样结果
#> # 10折交叉验证 
#> # A tibble: 10 x 5
#>    splits         id     .metrics         .notes           .predictions    
#>    <list>         <chr>  <list>           <list>           <list>          
#>  1 <split [24/3]> Fold01 <tibble [2 × 4]> <tibble [0 × 1]> <tibble [3 × 4]>
#>  2 <split [24/3]> Fold02 <tibble [2 × 4]> <tibble [0 × 1]> <tibble [3 × 4]>
#>  3 <split [24/3]> Fold03 <tibble [2 × 4]> <tibble [0 × 1]> <tibble [3 × 4]>
#>  4 <split [24/3]> Fold04 <tibble [2 × 4]> <tibble [0 × 1]> <tibble [3 × 4]>
#>  5 <split [24/3]> Fold05 <tibble [2 × 4]> <tibble [0 × 1]> <tibble [3 × 4]>
#>  6 <split [24/3]> Fold06 <tibble [2 × 4]> <tibble [0 × 1]> <tibble [3 × 4]>
#>  7 <split [24/3]> Fold07 <tibble [2 × 4]> <tibble [0 × 1]> <tibble [3 × 4]>
#>  8 <split [25/2]> Fold08 <tibble [2 × 4]> <tibble [0 × 1]> <tibble [2 × 4]>
#>  9 <split [25/2]> Fold09 <tibble [2 × 4]> <tibble [0 × 1]> <tibble [2 × 4]>
#> 10 <split [25/2]> Fold10 <tibble [2 × 4]> <tibble [0 × 1]> <tibble [2 × 4]>
由 reprex 包 (v0.3.0.9001) 创建于 2020-11-18