我使用逻辑回归构建了一个预测模型,效果很好。但是当我分析测试数据集上计算的估计值时,我发现用于分层拆分的变量出现在了模型中,而我希望它作为预测变量被排除在外。update_role()
并没有解决这个问题…
data_split <- initial_split(mldata, prop = 3/4, strata = strata_var)# 创建训练和测试数据集:train_data <- training(data_split)test_data <- testing(data_split)# 构建模型mldata_recipe <- recipe(vital ~ ., data = train_data) %>% update_role(ids, new_role = "ID") %>% update_role(strata_var, new_role = "strata") %>% step_zv(all_predictors()) %>% step_unknown(all_nominal_predictors()) %>% step_dummy(all_nominal(), -all_outcomes()) %>% step_smote(vital)set.seed(456)# 10折交叉验证mldata_folds <- vfold_cv(train_data, strata = strata_var)glmnet_spec <- logistic_reg(penalty = tune(), mixture = tune()) %>% set_mode("classification") %>% set_engine("glmnet") glmnet_workflow <- workflow() %>% add_recipe(mldata_recipe) %>% add_model(glmnet_spec) glmnet_grid <- tidyr::crossing(penalty = 10^seq(-6, -1, length.out = 20), mixture = c(0, 0.05, 0.2, 0.4, 0.6, 0.8, 1)) set.seed(789)glmnet_tune <- tune_grid(glmnet_workflow, resamples = mldata_folds, grid = glmnet_grid) final_glmnet <- glmnet_workflow %>% finalize_workflow(select_best(glmnet_tune, "roc_auc")) glmnet_results <- final_glmnet %>% fit_resamples( resamples = mldata_folds, metrics = metric_set(roc_auc, accuracy, sensitivity, specificity), control = control_resamples(save_pred = TRUE) )set.seed(789)final_fit <- final_glmnet %>% last_fit(data_split)final_fit %>% pull(.workflow) %>% pluck(1) %>% tidy() %>% filter(term != "(Intercept)") %>% arrange(desc(abs(estimate))) %>% filter(abs(estimate) >0) %>% ggplot(aes(estimate, fct_reorder(term, desc(estimate)), color = estimate > 0))+ geom_vline(xintercept = 0, color = "lightgrey", lty = 2, size = 1.2) + geom_point() + scale_color_discrete(name = "Variable Effect \non outcome", labels = c("Deleterious", "Beneficial")) + theme_minimal()+ ggtitle("Meaningful Parameter Estimate Coefficients using logistic regression model")
在最后的图表中,我可以看到分层变量出现了。
回答:
你得到这个结果是因为你在step_dummy()
中使用的角色选择函数的组合。(完整的reprex在文章末尾)
你使用了以下选择。这会选择所有名义变量,但不包括任何结果。这是因为分层变量既是名义变量又不是结果变量,所以被选中了。
all_nominal(), -all_outcomes()
更好的选择是使用all_nominal_predictors()
,它不会选择id/分层变量。
library(tidymodels)data("penguins")rec_spec1 <- recipe(species ~ island + body_mass_g, data = penguins) %>% update_role(island, new_role = "strata") %>% step_dummy(all_nominal(), -all_outcomes())rec_spec1 %>% prep() %>% bake(new_data = NULL)#> # A tibble: 344 × 4#> body_mass_g species island_Dream island_Torgersen#> <int> <fct> <dbl> <dbl>#> 1 3750 Adelie 0 1#> 2 3800 Adelie 0 1#> 3 3250 Adelie 0 1#> 4 NA Adelie 0 1#> 5 3450 Adelie 0 1#> 6 3650 Adelie 0 1#> 7 3625 Adelie 0 1#> 8 4675 Adelie 0 1#> 9 3475 Adelie 0 1#> 10 4250 Adelie 0 1#> # … with 334 more rowsrec_spec2 <- recipe(species ~ island + body_mass_g, data = penguins) %>% update_role(island, new_role = "strata") %>% step_dummy(all_nominal_predictors())rec_spec2 %>% prep() %>% bake(new_data = NULL)#> # A tibble: 344 × 3#> island body_mass_g species#> <fct> <int> <fct> #> 1 Torgersen 3750 Adelie #> 2 Torgersen 3800 Adelie #> 3 Torgersen 3250 Adelie #> 4 Torgersen NA Adelie #> 5 Torgersen 3450 Adelie #> 6 Torgersen 3650 Adelie #> 7 Torgersen 3625 Adelie #> 8 Torgersen 4675 Adelie #> 9 Torgersen 3475 Adelie #> 10 Torgersen 4250 Adelie #> # … with 334 more rows
完整的reprex
library(tidymodels)library(themis)library(forcats)data("penguins")penguins0 <- penguins %>% mutate(ids = row_number(), species = factor(species == "Adelie")) %>% drop_na()data_split <- initial_split(penguins0, prop = 3/4, strata = island)# 创建训练和测试数据集:train_data <- training(data_split)test_data <- testing(data_split)# 构建模型mldata_recipe <- recipe(species ~ ., data = train_data) %>% update_role(ids, new_role = "ID") %>% update_role(island, new_role = "strata") %>% step_zv(all_predictors()) %>% step_unknown(all_nominal_predictors()) %>% step_dummy(all_nominal_predictors()) %>% step_smote(species)set.seed(456)# 10折交叉验证mldata_folds <- vfold_cv(train_data, strata = island)glmnet_spec <- logistic_reg(penalty = tune(), mixture = tune()) %>% set_mode("classification") %>% set_engine("glmnet") glmnet_workflow <- workflow() %>% add_recipe(mldata_recipe) %>% add_model(glmnet_spec) glmnet_grid <- tidyr::crossing(penalty = 10^seq(-6, -1, length.out = 20), mixture = c(0, 0.05, 0.2, 0.4, 0.6, 0.8, 1)) set.seed(789)glmnet_tune <- tune_grid(glmnet_workflow, resamples = mldata_folds, grid = glmnet_grid) final_glmnet <- glmnet_workflow %>% finalize_workflow(select_best(glmnet_tune, "roc_auc")) glmnet_results <- final_glmnet %>% fit_resamples( resamples = mldata_folds, metrics = metric_set(roc_auc, accuracy, sensitivity, specificity), control = control_resamples(save_pred = TRUE) )set.seed(789)final_fit <- final_glmnet %>% last_fit(data_split)final_fit %>% pull(.workflow) %>% pluck(1) %>% tidy() %>% filter(term != "(Intercept)") %>% arrange(desc(abs(estimate))) %>% filter(abs(estimate) >0) %>% ggplot(aes(estimate, fct_reorder(term, desc(estimate)), color = estimate > 0))+ geom_vline(xintercept = 0, color = "lightgrey", lty = 2, size = 1.2) + geom_point() + scale_color_discrete(name = "Variable Effect \non outcome", labels = c("Deleterious", "Beneficial")) + theme_minimal()+ ggtitle("Meaningful Parameter Estimate Coefficients using logistic regression model")