In practice, machine learning (ML) algorithms may fail to ascertain
+heterogeneous treatment effects due to small sample sizes, high
+dimensionality, and arbitrary parameter tuning. The
+test_itr function allows users to empirically validate the
+GATE estimates obtained under various ML algorithms with statistical
+testing. In particular, there are two types of nonparametric
+statistical tests: (1) a test of treatment effect heterogeneity across
+groups and (2) a test of rank consistency of the GATEs. The tests are
+based on the idea that, if an ML algorithm produces a reasonable
+scoring rule (obtained via the estimate_itr function), we should
+expect that (1) the GATEs differ across groups; and (2) the rank
+ordering of the GATEs by magnitude is monotonic.
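+
+To build intuition, below is a minimal conceptual sketch of a
+simulation-based heterogeneity test; it is not the package's internal
+implementation. The inputs gate_est and gate_se are hypothetical
+stand-ins for the quintile GATE estimates and their standard errors.
+
+# conceptual sketch (hypothetical inputs): compare the observed spread
+# of the group estimates against its distribution under the null of
+# homogeneous effects
+het_test_sketch <- function(gate_est, gate_se, nsim = 1000) {
+  stat_obs <- sum(((gate_est - mean(gate_est)) / gate_se)^2)
+  stat_null <- replicate(nsim, {
+    draws <- rnorm(length(gate_est), mean = 0, sd = gate_se)
+    sum(((draws - mean(draws)) / gate_se)^2)
+  })
+  mean(stat_null >= stat_obs)  # simulation-based p-value
+}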
+
Following the previous examples, we first estimate GATEs using causal
+forest (causal_forest), Bayesian Additive Regression Trees
+(bartc), LASSO (lasso), and random forest
+(rf) under cross-validation with the
+estimate_itr function. We specify the number of groups into which the
+sample is divided through the ngates argument. By
+setting ngates = 5 in the example below, we estimate the
+heterogeneous impact of small class sizes on students’ writing scores
+across 5 groups of students.
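+
+The GATEs are estimated as follows (reproduced from the vignette
+source):
+
+# specify the caret trainControl method
+fitControl <- caret::trainControl(
+  method = "repeatedcv",
+  number = 2,
+  repeats = 2)
+
+# estimate ITR
+set.seed(2023)
+fit_cv <- estimate_itr(
+  treatment = "treatment",
+  form = user_formula,
+  data = star_data,
+  trControl = fitControl,
+  algorithms = c(
+    "causal_forest",
+    "bartc",
+    "lasso",
+    "rf"),
+  budget = 0.2, # 20% budget constraint
+  n_folds = 5,  # 5-fold cross-validation
+  ngates = 5)   # 5 groups
+
+# evaluate ITR and extract GATE estimates
+est_cv <- evaluate_itr(fit_cv)
+summary(est_cv)$GATE
+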
The table reports the quintile GATE (\(K =
+5\)) estimates for each ML algorithm. We find that the Random
+Forest produces a statistically significant negative GATE estimate for
+the lowest quintile group (group 1) under cross-validation. This
+provides evidence that the Random Forest identifies a 20% subgroup
+whose writing scores are negatively impacted by small class sizes.
+
We now conduct the statistical tests of treatment effect
+heterogeneity and rank consistency to validate these GATE estimates.
+We use the model object est_cv returned by the
+evaluate_itr function as the input to the
+test_itr function, which conducts both tests simultaneously. We can
+summarize the test statistics and the p-values using the
+summary function. Lastly, we use the nsim
+argument to specify the number of simulations to conduct for each
+test; the default is 1000 simulations.
+
+# conduct nonparametric tests
+test_est_cv <- test_itr(est_cv,
+                        nsim = 5000)
+#> Conduct hypothesis tests for GATEs under cross-validation ...
+
+# summarize test statistics and p-values
+summary(test_est_cv)
+#> ── The Consistency Test Results for GATEs ──────────────────────────────────────
+#> No consistency results available (sample-splitting).
+#>
+#> ── The Heterogeneity Test Results for GATEs ────────────────────────────────────
+#> No heterogeneity results available (sample-splitting).
+#>
+#> ── The Consistency Test Results for GATEs (Cross-validation) ───────────────────
+#> algorithm statistic p.value
+#> 1 causal_forest 0.83 0.74
+#> 2 bartc 1.03 0.63
+#> 3 lasso 0.24 0.82
+#> 4 rf 1.32 0.66
+#>
+#> ── The Heterogeneity Test Results for GATEs (Cross-validation) ─────────────────
+#> algorithm statistic p.value
+#> 1 causal_forest 1.9 0.86
+#> 2 bartc 2.3 0.81
+#> 3 lasso 3.3 0.65
+#> 4 rf 6.6 0.25
+
The table reports the resulting values of the test statistics and the
+p-values for each test under each algorithm. We find that none of the
+ML algorithms rejects the null hypothesis of treatment effect
+homogeneity under cross-validation, which indicates that these
+algorithms fail to identify statistically significant effect
+heterogeneity across subgroups. In addition, none of the ML algorithms
+rejects the rank consistency hypothesis under cross-validation. Thus,
+there is no strong statistical evidence that these algorithms produce
+unreliable GATE estimates.
+
diff --git a/man/.DS_Store b/man/.DS_Store
deleted file mode 100644
index 467497b..0000000
Binary files a/man/.DS_Store and /dev/null differ
diff --git a/man/figures/README-caret_model-1 2.png b/man/figures/README-caret_model-1 2.png
new file mode 100644
index 0000000..e2a173a
Binary files /dev/null and b/man/figures/README-caret_model-1 2.png differ
diff --git a/man/figures/README-caret_model-2 2.png b/man/figures/README-caret_model-2 2.png
new file mode 100644
index 0000000..bb0e12a
Binary files /dev/null and b/man/figures/README-caret_model-2 2.png differ
diff --git a/man/figures/README-compare_itr_aupec-1 2.png b/man/figures/README-compare_itr_aupec-1 2.png
new file mode 100644
index 0000000..db3b005
Binary files /dev/null and b/man/figures/README-compare_itr_aupec-1 2.png differ
diff --git a/man/figures/README-compare_itr_gate-1 2.png b/man/figures/README-compare_itr_gate-1 2.png
new file mode 100644
index 0000000..7d0a0a7
Binary files /dev/null and b/man/figures/README-compare_itr_gate-1 2.png differ
diff --git a/man/figures/README-compare_itr_model_summary-1 2.png b/man/figures/README-compare_itr_model_summary-1 2.png
new file mode 100644
index 0000000..2ef4008
Binary files /dev/null and b/man/figures/README-compare_itr_model_summary-1 2.png differ
diff --git a/man/figures/README-cv_estimate-1 2.png b/man/figures/README-cv_estimate-1 2.png
new file mode 100644
index 0000000..a1e0d32
Binary files /dev/null and b/man/figures/README-cv_estimate-1 2.png differ
diff --git a/man/figures/README-est_extract-1 2.png b/man/figures/README-est_extract-1 2.png
new file mode 100644
index 0000000..a418cb1
Binary files /dev/null and b/man/figures/README-est_extract-1 2.png differ
diff --git a/man/figures/README-sl_plot-1 2.png b/man/figures/README-sl_plot-1 2.png
new file mode 100644
index 0000000..158837e
Binary files /dev/null and b/man/figures/README-sl_plot-1 2.png differ
diff --git a/man/figures/README-user_itr_aupec-1 2.png b/man/figures/README-user_itr_aupec-1 2.png
new file mode 100644
index 0000000..63304d2
Binary files /dev/null and b/man/figures/README-user_itr_aupec-1 2.png differ
diff --git a/man/figures/README-user_itr_gate-1 2.png b/man/figures/README-user_itr_gate-1 2.png
new file mode 100644
index 0000000..094375a
Binary files /dev/null and b/man/figures/README-user_itr_gate-1 2.png differ
diff --git a/man/figures/gate 2.png b/man/figures/gate 2.png
new file mode 100644
index 0000000..3fc77af
Binary files /dev/null and b/man/figures/gate 2.png differ
diff --git a/man/figures/plot_5folds 2.png b/man/figures/plot_5folds 2.png
new file mode 100644
index 0000000..bd59a4f
Binary files /dev/null and b/man/figures/plot_5folds 2.png differ
diff --git a/man/figures/rf 2.png b/man/figures/rf 2.png
new file mode 100644
index 0000000..4eb1ba2
Binary files /dev/null and b/man/figures/rf 2.png differ
diff --git a/tests/testthat/star 2.rda b/tests/testthat/star 2.rda
new file mode 100644
index 0000000..5baf7ba
Binary files /dev/null and b/tests/testthat/star 2.rda differ
diff --git a/tests/testthat/test-high_level 2.R b/tests/testthat/test-high_level 2.R
new file mode 100644
index 0000000..b58529b
--- /dev/null
+++ b/tests/testthat/test-high_level 2.R
@@ -0,0 +1,42 @@
+library(evalITR)
+library(dplyr)
+test_that("Sample Splitting Works", {
+ load("star.rda")
+ # specifying the outcome
+ outcomes <- "g3tlangss"
+
+ # specifying the treatment
+ treatment <- "treatment"
+
+ # specifying the data (remove other outcomes)
+ star_data <- star %>% dplyr::select(-c(g3treadss,g3tmathss))
+
+ # specifying the formula
+ user_formula <- as.formula(
+ "g3tlangss ~ treatment + gender + race + birthmonth +
+ birthyear + SCHLURBN + GRDRANGE + GKENRMNT + GKFRLNCH +
+ GKBUSED + GKWHITE ")
+
+
+  # estimate ITR (wrapped in expect_no_error so the fit can be reused)
+  expect_no_error(
+    fit <- estimate_itr(
+      treatment = treatment,
+      form = user_formula,
+      data = star_data,
+      algorithms = c("lasso"),
+      budget = 0.2,
+      split_ratio = 0.7))
+
+  # evaluate ITR
+  expect_no_error(est <- evaluate_itr(fit))
+})
+
diff --git a/tests/testthat/test-low_level 2.R b/tests/testthat/test-low_level 2.R
new file mode 100644
index 0000000..dd0993b
--- /dev/null
+++ b/tests/testthat/test-low_level 2.R
@@ -0,0 +1,59 @@
+library(evalITR)
+
+test_that("Non Cross-Validated Functions Work", {
+  T = c(1,0,1,0,1,0,1,0)                  # treatment assignment
+  That = c(0,1,1,0,0,1,1,0)               # first ITR's unit-level recommendations
+  That2 = c(1,0,0,1,1,0,0,1)              # second ITR (for the PAPD comparison)
+  tau = c(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7)  # estimated individual treatment effects
+  Y = c(4,5,0,2,4,1,-4,3)                 # observed outcomes
+ papelist <- PAPE(T,That,Y)
+ pavlist <- PAV(T,That,Y)
+ papdlist <- PAPD(T,That,That2,Y,0.5)
+ aupeclist <- AUPEC(T,tau,Y)
+ gatelist <- GATE(T,tau,Y,ngates=2)
+ expect_type(papelist,"list")
+ expect_type(pavlist,"list")
+ expect_type(papdlist,"list")
+ expect_type(aupeclist,"list")
+ expect_type(gatelist,"list")
+ expect_type(papelist$pape,"double")
+ expect_type(pavlist$pav,"double")
+ expect_type(papdlist$papd,"double")
+ expect_type(aupeclist$aupec,"double")
+ expect_type(gatelist$gate,"double")
+ expect_type(papelist$sd,"double")
+ expect_type(pavlist$sd,"double")
+ expect_type(papdlist$sd,"double")
+ expect_type(aupeclist$sd,"double")
+ expect_type(gatelist$sd,"double")
+})
+
+test_that("Cross-Validated Functions Work", {
+  T = c(1,0,1,0,1,0,1,0)      # treatment assignment
+  # ITR recommendations and estimated effects, one column per fold
+  That = matrix(c(0,1,1,0,0,1,1,0,1,0,0,1,1,0,0,1), nrow = 8, ncol = 2)
+  That2 = matrix(c(0,0,1,1,0,0,1,1,1,1,0,0,1,1,0,0), nrow = 8, ncol = 2)
+  tau = matrix(c(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,-0.5,-0.3,-0.1,0.1,0.3,0.5,0.7,0.9), nrow = 8, ncol = 2)
+  Y = c(4,5,0,2,4,1,-4,3)     # observed outcomes
+  ind = c(rep(1,4),rep(2,4))  # fold membership indicator
+ papelist <- PAPEcv(T,That,Y,ind,budget = 0.5)
+ pavlist <- PAVcv(T,That,Y,ind)
+ papdlist <- PAPDcv(T,That,That2,Y,ind,budget = 0.5)
+ aupeclist <- AUPECcv(T,tau,Y,ind)
+ gatelist <- GATEcv(T,tau,Y,ind,ngates=2)
+ expect_type(papelist,"list")
+ expect_type(pavlist,"list")
+ expect_type(papdlist,"list")
+ expect_type(aupeclist,"list")
+ expect_type(gatelist,"list")
+ expect_type(papelist$pape,"double")
+ expect_type(pavlist$pav,"double")
+ expect_type(papdlist$papd,"double")
+ expect_type(aupeclist$aupec,"double")
+ expect_type(gatelist$gate,"double")
+ expect_type(papelist$sd,"double")
+ expect_type(pavlist$sd,"double")
+ expect_type(papdlist$sd,"double")
+ expect_type(aupeclist$sd,"double")
+ expect_type(gatelist$sd,"double")
+})
+
diff --git a/vignettes/test_itr.Rmd b/vignettes/test_itr.Rmd
new file mode 100644
index 0000000..c8c045c
--- /dev/null
+++ b/vignettes/test_itr.Rmd
@@ -0,0 +1,84 @@
+---
+title: "Nonparametric statistical tests for treatment heterogeneity and rank consistency across multiple ML algorithms"
+output: rmarkdown::html_vignette
+vignette: >
+ %\VignetteIndexEntry{Nonparametric statistical tests with multiple ML algorithms}
+ %\VignetteEngine{knitr::rmarkdown}
+ %\VignetteEncoding{UTF-8}
+---
+
+```{r, include = FALSE}
+knitr::opts_chunk$set(
+ collapse = TRUE,
+ comment = "#>",
+ fig.path = "../man/figures/README-"
+ )
+
+library(dplyr)
+
+load("../data/star.rda")
+
+# specifying the outcome
+outcomes <- "g3tlangss"
+
+# specifying the treatment
+treatment <- "treatment"
+
+# specifying the data (remove other outcomes)
+star_data <- star %>% dplyr::select(-c(g3treadss,g3tmathss))
+
+# specifying the formula
+user_formula <- as.formula(
+ "g3tlangss ~ treatment + gender + race + birthmonth +
+ birthyear + SCHLURBN + GRDRANGE + GKENRMNT + GKFRLNCH +
+ GKBUSED + GKWHITE ")
+```
+
+In practice, machine learning (ML) algorithms may fail to ascertain heterogeneous treatment effects due to small sample sizes, high dimensionality, and arbitrary parameter tuning. The `test_itr` function allows users to empirically validate the GATE estimates obtained under various ML algorithms with statistical testing. In particular, there are two types of nonparametric statistical tests: (1) a test of treatment effect heterogeneity across groups and (2) a test of rank consistency of the GATEs. The tests are based on the idea that, if an ML algorithm produces a reasonable scoring rule (obtained via the `estimate_itr` function), we should expect that (1) the GATEs differ across groups; and (2) the rank ordering of the GATEs by magnitude is monotonic.
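+
+As a conceptual illustration of the rank-consistency idea (not the package's internal implementation), the sketch below asks how often noise-perturbed draws of a set of GATE estimates preserve a monotone ordering; `gate_est` and `gate_se` are hypothetical toy inputs standing in for the quintile estimates and their standard errors.
+
+```{r rank_sketch}
+# toy quintile GATE estimates and standard errors (hypothetical values)
+gate_est <- c(-0.5, -0.1, 0.2, 0.4, 0.8)
+gate_se  <- rep(0.3, 5)
+
+# proportion of noise-perturbed draws whose ordering stays weakly increasing
+rank_consistency_sketch <- function(gate_est, gate_se, nsim = 1000) {
+  draws <- replicate(nsim, rnorm(length(gate_est), gate_est, gate_se))
+  mean(apply(draws, 2, function(x) !is.unsorted(x)))
+}
+
+set.seed(2023)
+rank_consistency_sketch(gate_est, gate_se)
+```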
+
+Following the previous examples, we first estimate GATEs using causal forest (`causal_forest`), Bayesian Additive Regression Trees (`bartc`), LASSO (`lasso`), and random forest (`rf`) under cross-validation with the `estimate_itr` function. We specify the number of groups into which the sample is divided through the `ngates` argument. By setting `ngates = 5` in the example below, we estimate the heterogeneous impact of small class sizes on students’ writing scores across 5 groups of students.
+
+```{r multiple, warning=FALSE, message=FALSE}
+library(evalITR)
+
+# specify the trainControl method
+fitControl <- caret::trainControl(
+ method = "repeatedcv",
+ number = 2,
+ repeats = 2)
+# estimate ITR
+set.seed(2023)
+fit_cv <- estimate_itr(
+ treatment = "treatment",
+ form = user_formula,
+ data = star_data,
+ trControl = fitControl,
+  algorithms = c(
+    "causal_forest", # from grf
+    "bartc",         # from bartCause
+    "lasso",
+    "rf"),
+ budget = 0.2, # 20% budget constraint
+ n_folds = 5, # 5-fold cross-validation
+ ngates = 5) # 5 groups
+
+# evaluate ITR
+est_cv <- evaluate_itr(fit_cv)
+
+# extract GATEs estimates
+summary(est_cv)$GATE
+```
+The table reports the quintile GATE ($K = 5$) estimates for each ML algorithm. We find that the Random Forest produces a statistically significant negative GATE estimate for the lowest quintile group (group 1) under cross-validation. This provides evidence that the Random Forest identifies a 20% subgroup whose writing scores are negatively impacted by small class sizes.
+
+We now conduct the statistical tests of treatment effect heterogeneity and rank consistency to validate these GATE estimates. We use the model object `est_cv` returned by the `evaluate_itr` function as the input to the `test_itr` function, which conducts both tests simultaneously. We can summarize the test statistics and the p-values using the `summary` function. Lastly, we use the `nsim` argument to specify the number of simulations to conduct for each test; the default is 1000 simulations.
+
+```{r warning=FALSE, message=FALSE}
+# conduct nonparametric tests
+test_est_cv <- test_itr(est_cv,
+ nsim = 5000)
+
+# summarize test statistics and p-values
+summary(test_est_cv)
+```
+The table reports the resulting values of the test statistics and the p-values for each test under each algorithm. We find that none of the ML algorithms rejects the null hypothesis of treatment effect homogeneity under cross-validation, which indicates that these algorithms fail to identify statistically significant effect heterogeneity across subgroups. In addition, none of the ML algorithms rejects the rank consistency hypothesis under cross-validation. Thus, there is no strong statistical evidence that these algorithms produce unreliable GATE estimates.