-
Notifications
You must be signed in to change notification settings - Fork 4
Description
Hi Rum -
I had a suggestion for a potential addition to your bootstrapping functions. I noticed that in cases with severely imbalanced data, your bootstrapping functions will fail because the underlying boot functions will end up often missing rare classes in their bootstrap re-samples. As a result, you'll get this error:
Error in approx(res_sp[[i]][[j]], res_se[[i]][[j]], all_sp, yleft = 1, : need at least two non-NA values to interpolate
Fortunately, the boot functions can deal with this with strata, which can be incorporated into your functions as well. Here is an example:
library(tidyverse)
library(multiROC)
library(boot)
############################################
### Generate a test data frame with very few of class "C"
############################################
set.seed(1)
n_common <- 40
n_rare <- 5
test_df <-
bind_rows(
# Class A truth
tibble(
A_pred_knn = runif(n_common, min = 0.4, max = 0.5),
B_pred_knn = runif(n_common, min = 0.2, max = 0.5),
C_pred_knn = 1-(A_pred_knn + B_pred_knn),
A_true = 1, B_true = 0, C_true = 0
),
# Class B truth
tibble(
A_pred_knn = runif(n_common, min = 0.2, max = 0.5),
B_pred_knn = runif(n_common, min = 0.4, max = 0.5),
C_pred_knn = 1-(A_pred_knn + B_pred_knn),
A_true = 0, B_true = 1, C_true = 0
),
# Class C truth
tibble(
A_pred_knn = runif(n_rare, min = 0.3, max = 0.4),
B_pred_knn = runif(n_rare, min = 0.3, max = 0.4),
C_pred_knn = 1-(A_pred_knn + B_pred_knn),
A_true = 0, B_true = 0, C_true = 1
)
) %>%
select(contains("true"), contains("pred")) %>%
mutate(id = 1:n()) %>%
column_to_rownames("id")
#############################################
### An updated version of roc_auc_with_ci
#############################################
roc_auc_with_ci_strata <- function (data, conf = 0.95, type = "bca", R = 100, stratify = F){
suppressWarnings({
roc_res <- multi_roc(data)
AUC_res <- unlist(roc_res$AUC) %>% data.frame()
AUC_res$Var <- row.names(AUC_res)
colnames(AUC_res)[1] <- "AUC"
roc_ci_all_res <- matrix(NA, nrow(AUC_res), 4) %>% data.frame()
colnames(roc_ci_all_res) <- c("Var", "AUC", "lower CI",
"higher CI")
roc_ci_all_res$Var <- AUC_res$Var
roc_ci_all_res$AUC <- AUC_res$AUC
multi_roc_auc <- function(data, idx) {
results <- multi_roc(data[idx, ])$AUC
results <- unlist(results)
return(results)
}
# Incorporate all `_true` dummary variable columns
# into a single class column to be used for stratification
find_strata <- function(df) {
df %>%
select(contains("_true")) %>%
apply(., 1, function(x)
which(x == 1) - 1)
}
data <- data %>% mutate(class = find_strata(.))
for (i in 1:nrow(AUC_res)) {
if (stratify){
# Bootstrap with stratification based on classification column
# created by find_strata()
res_boot <- boot(data, statistic = multi_roc_auc, R, strata = data$class)
} else {
res_boot <- boot(data, statistic = multi_roc_auc, R)
}
res_boot_ci <- boot.ci(res_boot, conf, type, index = i)
roc_ci_all_res[i, 3] <- res_boot_ci[[4]][1, 4]
roc_ci_all_res[i, 4] <- res_boot_ci[[4]][1, 5]
}
return(roc_ci_all_res)
})
}
If you play with this, you will see that roc_auc_with_ci_strata(test_df, conf=0.95, type = "bca", R=100, stratify = F) will fail with the same error as above, but roc_auc_with_ci_strata(test_df, conf=0.95, type = "bca", R=100, stratify = T) will succeed with the output:
| Var | AUC | lower CI | higher CI | |
|---|---|---|---|---|
| knn.A | 0.8861111 | 0.7950000 | 0.9433184 | |
| knn.B | 0.8377778 | 0.6966667 | 0.9014984 | |
| knn.C | 0.8400000 | 0.7015143 | 0.9150000 | |
| knn.macro | 0.8544444 | 0.8006079 | 0.9071139 | |
| knn.micro | 0.9058131 | 0.8722888 | 0.9308729 |