## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----setup, message = FALSE--------------------------------------------------- library("vimp") library("SuperLearner") ## ----load-vrc01-data---------------------------------------------------------- # read in the data data("vrc01") # subset to the columns of interest for this analysis library("dplyr") library("tidyselect") # retain only the columns of interest for this analysis y <- vrc01$ic50.censored X <- vrc01 %>% select(starts_with("geog"), starts_with("subtype"), starts_with("length")) set.seed(1234) vrc01_folds <- make_folds(y = y, V = 2) ## ----est-regressions-lm, warning = FALSE-------------------------------------- library("rlang") vrc01_subset <- vrc01 %>% select(ic50.censored, starts_with("geog"), starts_with("subtype"), starts_with("length")) %>% rename(y = ic50.censored) # estimate prediction function on each subset, predict on held-out fold full_fit <- vector("numeric", length = nrow(vrc01)) for (v in 1:2) { train_v <- subset(vrc01_subset, vrc01_folds != v) test_v <- subset(vrc01_subset, vrc01_folds == v) full_mod <- glm(y ~ ., data = train_v) full_fit[vrc01_folds == v] <- predict(full_mod, newdata = test_v) } # estimate the reduced conditional means for each of the individual variables # remove the outcome for the predictor matrix geog_indx <- max(which(grepl("geog", names(X)))) for (i in seq_len(ncol(X) - geog_indx)) { this_name <- names(X)[i + geog_indx] red_fit <- vector("numeric", length = nrow(vrc01)) for (v in 1:2) { train_v <- subset(vrc01_subset, vrc01_folds != v) test_v <- subset(vrc01_subset, vrc01_folds == v) red_fit[vrc01_folds == v] <- suppressWarnings( predict(glm(y ~ ., data = train_v %>% select(-!!this_name)), newdata = test_v) ) } this_vim <- vim(Y = y, f1 = full_fit, f2 = red_fit, indx = i + geog_indx, run_regression = FALSE, type = "r_squared", sample_splitting_folds = vrc01_folds, scale = "logit") if (i == 1) { lm_mat <- this_vim } else { lm_mat <- merge_vim(lm_mat, this_vim) } } # print out the matrix lm_mat ## ----estimate-full-regression-with-cf, message = FALSE, warning = FALSE------- learners <- "SL.ranger" # estimate the full regression function V <- 2 set.seed(4747) full_cv_fit <- suppressWarnings( SuperLearner::CV.SuperLearner( Y = y, X = X, SL.library = learners, cvControl = list(V = 2 * V), innerCvControl = list(list(V = V)), family = binomial() ) ) # get a numeric vector of cross-fitting folds cross_fitting_folds <- get_cv_sl_folds(full_cv_fit$folds) # get sample splitting folds set.seed(1234) sample_splitting_folds <- make_folds(unique(cross_fitting_folds), V = 2) full_cv_preds <- full_cv_fit$SL.predict ## ----estimate-reduced-regressions-with-cf, message = FALSE, warning = FALSE---- vars <- names(X)[(geog_indx + 1):ncol(X)] set.seed(1234) for (i in seq_len(length(vars))) { # use "eval" and "parse" to assign the objects of interest to avoid duplicating code eval(parse(text = paste0("reduced_", vars[i], "_cv_fit <- suppressWarnings(SuperLearner::CV.SuperLearner( Y = y, X = X[, -(geog_indx + i), drop = FALSE], SL.library = learners, cvControl = SuperLearner::SuperLearner.CV.control(V = 2 * V, validRows = full_cv_fit$folds), innerCvControl = list(list(V = V)), family = binomial() ))"))) eval(parse(text = paste0("reduced_", vars[i], "_cv_preds <- reduced_", vars[i], "_cv_fit$SL.predict"))) } ## ----cf-vims------------------------------------------------------------------ for (i in seq_len(length(vars))) { # again, use "eval" and "parse" to assign the objects of interest to avoid duplicating code eval(parse(text = paste0("cf_", vars[i], "_vim <- vimp_rsquared(Y = y, cross_fitted_f1 = full_cv_preds, cross_fitted_f2 = reduced_", vars[i], "_cv_preds, indx = (geog_indx + i), cross_fitting_folds = cross_fitting_folds, sample_splitting_folds = sample_splitting_folds, run_regression = FALSE, alpha = 0.05, V = V, na.rm = TRUE, scale = 'logit')"))) } cf_ests <- merge_vim(cf_subtype.is.01_AE_vim, cf_subtype.is.02_AG_vim, cf_subtype.is.07_BC_vim, cf_subtype.is.A1_vim, cf_subtype.is.A1C_vim, cf_subtype.is.A1D_vim, cf_subtype.is.B_vim, cf_subtype.is.C_vim, cf_subtype.is.D_vim, cf_subtype.is.O_vim, cf_subtype.is.Other_vim, cf_length.env_vim, cf_length.gp120_vim, cf_length.loop.e_vim, cf_length.loop.e.outliers_vim, cf_length.v5_vim, cf_length.v5.outliers_vim) all_vars <- c(paste0("Subtype is ", c("01_AE", "02_AG", "07_BC", "A1", "A1C", "A1D", "B", "C", "D", "O", "Other")), paste0("Length of ", c("Env", "gp120", "V5", "V5 outliers", "Loop E", "Loop E outliers"))) ## ----plot-cf-vim, fig.width = 8.5, fig.height = 8----------------------------- library("ggplot2") library("cowplot") theme_set(theme_cowplot()) cf_est_plot_tib <- cf_ests$mat %>% mutate( var_fct = rev(factor(s, levels = cf_ests$mat$s, labels = all_vars[as.numeric(cf_ests$mat$s) - geog_indx], ordered = TRUE)) ) # plot cf_est_plot_tib %>% ggplot(aes(x = est, y = var_fct)) + geom_point() + geom_errorbarh(aes(xmin = cil, xmax = ciu)) + xlab(expression(paste("Variable importance estimates: ", R^2, sep = ""))) + ylab("") + ggtitle("Estimated individual feature importance") + labs(subtitle = "in the VRC01 data (considering only geographic confounders, subtype, and viral geometry)") ## ----cf-group-vim, fig.width = 8.5, fig.height = 8, warning = FALSE----------- set.seed(91011) reduced_subtype_cv_fit <- suppressWarnings( SuperLearner::CV.SuperLearner( Y = y, X = X[, -c(5:15), drop = FALSE], SL.library = learners, cvControl = SuperLearner::SuperLearner.CV.control(V = 2 * V, validRows = full_cv_fit$folds), innerCvControl = list(list(V = V)), family = binomial() ) ) reduced_subtype_cv_preds <- reduced_subtype_cv_fit$SL.predict reduced_geometry_cv_fit <- suppressWarnings( SuperLearner::CV.SuperLearner( Y = y, X = X[, -c(16:21), drop = FALSE], SL.library = learners, cvControl = SuperLearner::SuperLearner.CV.control(V = 2 * V, validRows = full_cv_fit$folds), innerCvControl = list(list(V = V)), family = binomial() ) ) reduced_geometry_cv_preds <- reduced_geometry_cv_fit$SL.predict cf_subtype_vim <- vimp_rsquared( Y = y, cross_fitted_f1 = full_cv_preds, cross_fitted_f2 = reduced_subtype_cv_preds, indx = 5:15, run_regression = FALSE, V = V, cross_fitting_folds = cross_fitting_folds, sample_splitting_folds = sample_splitting_folds, scale = "logit" ) cf_geometry_vim <- vimp_rsquared( Y = y, cross_fitted_f1 = full_cv_preds, cross_fitted_f2 = reduced_geometry_cv_preds, indx = 16:21, run_regression = FALSE, V = V, cross_fitting_folds = cross_fitting_folds, sample_splitting_folds = sample_splitting_folds, scale = "logit" ) cf_groups <- merge_vim(cf_subtype_vim, cf_geometry_vim) all_grp_nms <- c("Viral subtype", "Viral geometry") grp_plot_tib <- cf_groups$mat %>% mutate( grp_fct = factor(case_when( s == "5,6,7,8,9,10,11,12,13,14,15" ~ "1", s == "16,17,18,19,20,21" ~ "2" ), levels = c("1", "2"), labels = all_grp_nms, ordered = TRUE) ) grp_plot_tib %>% ggplot(aes(x = est, y = grp_fct)) + geom_point() + geom_errorbarh(aes(xmin = cil, xmax = ciu)) + xlab(expression(paste("Variable importance estimates: ", R^2, sep = ""))) + ylab("") + ggtitle("Estimated feature group importance") + labs(subtitle = "in the VRC01 data (considering only geographic confounders, subtype, and viral geometry)")