diff --git a/DESCRIPTION b/DESCRIPTION index 0419198..77515ba 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -18,9 +18,11 @@ Imports: Rcpp, magrittr, MASS, - stats + stats, + tidyr, + coop LinkingTo: Rcpp, RcppEigen Depends: - R (>= 3.50) + R (>= 3.5.0) diff --git a/NAMESPACE b/NAMESPACE index 7a1d74f..566a9d7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,5 +3,6 @@ export(staggered) export(staggered_cs) export(staggered_sa) +import(Rcpp) importFrom(magrittr,"%>%") useDynLib(staggered) diff --git a/R/RcppExports.R b/R/RcppExports.R index 5b1ef97..95a1b85 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -5,3 +5,11 @@ eigenMapMatMult <- function(A, B) { .Call('_staggered_eigenMapMatMult', PACKAGE = 'staggered', A, B) } +solve_least_squares_svd <- function(A, B) { + .Call('_staggered_solve_least_squares_svd', PACKAGE = 'staggered', A, B) +} + +solve_least_squares_normal <- function(A, B) { + .Call('_staggered_solve_least_squares_normal', PACKAGE = 'staggered', A, B) +} + diff --git a/R/compute_efficient_estimator_and_se.R b/R/compute_efficient_estimator_and_se.R index 3ba32bc..10fc8fc 100644 --- a/R/compute_efficient_estimator_and_se.R +++ b/R/compute_efficient_estimator_and_se.R @@ -2,27 +2,33 @@ #' @title Calculate group level summary statistics #' @description This function computes the mean-vector and covariance matrix of the outcomes for each cohort, where a cohort g is a group of units first treated in period g #' @param df A data frame containing panel data with the variables y (an outcome), i (an individual identifier), t (the period in which the outcome is observe), g (the period in which i is first treated, with Inf denoting never treated) +#' @param is_balanced If true, the df has previously been balanced so this does not need to be done internally. #' @return Y_bar_list A list of the means of the outcomes for each cohort g #' @return S_g_list A list of covariance matrices for the outcomes for each cohort g #' @return N_g_list A list of the number of observations for each cohort g #' @return g_list A list of when the cohorts were first treated #' @return t_list A list of the the time periods for the outcome. The vector of outcomes corresponds with this order. -compute_g_level_summaries <- function(df){ +compute_g_level_summaries <- function(df, is_balanced = TRUE){ #Balance the panel (and throw a warning if original panel is unbalanced) - df <- balance_df(df) - + if(!is_balanced){ + df <- balance_df(df) + } g_list <- sort(unique(df$g)) t_list <- sort(unique(df$t)) + + #Reshape so that Y_{it} is a column for all t + df <- df %>% + tidyr::pivot_wider(id_cols = c("i","g"), names_from = "t", values_from = "y") %>% + dplyr::ungroup() %>% + dplyr::select(-i) + compute_Ybar_Sbar_g <- function(g){ #Filter to observations in cohort g - dfg <- df %>% dplyr::filter(g == !!g) - - #Reshape so that Y_{it} is a column for all t - dfg <- dfg %>% - reshape2::dcast(i ~ t, value.var = "y") %>% - dplyr::select(-i) + dfg <- df %>% + dplyr::filter(g == !!g) %>% + dplyr::select(-g) #Order the columns ascending in time tVec <- as.numeric(colnames(dfg)) @@ -36,7 +42,7 @@ compute_g_level_summaries <- function(df){ #Compute means Ybar_g and covariance S_g Ybar_g <- base::colMeans(dfg) - S_g <- stats::var(dfg) + S_g <- coop::covar(dfg) return(list(Ybar_g = Ybar_g, @@ -44,6 +50,7 @@ compute_g_level_summaries <- function(df){ N_g = N_g)) } + resultsList <- purrr::map(.x = g_list, .f = compute_Ybar_Sbar_g) @@ -65,6 +72,7 @@ compute_g_level_summaries <- function(df){ } + balance_df <- function(df){ ## This function creates a balanced panel as needed for our analysis @@ -171,7 +179,9 @@ compute_Betastar <- function(Ybar_g_list, X_theta_cov <- base::Reduce(x = X_theta_cov_list, f = '+') #betastar <- solve(Xvar) %*% X_theta_cov - betastar <- MASS::ginv(Xvar) %*% X_theta_cov + #betastar <- MASS::ginv(Xvar) %*% X_theta_cov + #betastar <- solve_least_squares_svd(Xvar,X_theta_cov) + betastar <- solve_least_squares_normal(Xvar,X_theta_cov) #fast method of solving (Xvar)^-1 X_theta_cov return(betastar) } @@ -443,7 +453,7 @@ create_A0_list <- function(g_list, sum_of_lists <- function(.l){ #Given a list of lists all of the same length, this returns a list where the jth entry is the sum of the jth entry of all the lists - if(length(.l) == 1){return(.l)} + if(length(.l) == 1){return(.l[[1]])} results <- purrr::reduce(.x = .l, .f = function(l1, l2){ @@ -814,8 +824,64 @@ calculate_full_vcv <- function(eventPlotResultsList, resultsDF){ +processDF <- function(df, i, g, t, y){ + + #This function processes the df inputted to staggered (or staggered_cs/sa) + # It checks that the columns in the user-inputted values of i,g,t,y are actually in the data + # It also renames these columns to "i", "g", "t", "y" + + # Let's make sure we have columns with name i, t, y and g + colnames_df <- colnames(df) + if(!i %in% colnames_df){ + stop(paste0("There is no column ", i, " in the data. Thus, we are not able to find the unit identifier variable.")) + } + if(!t %in% colnames_df){ + stop(paste0("There is no column ", t, " in the data. Thus, we are not able to find the time identifier variable.")) + } + if(!g %in% colnames_df){ + stop(paste0("There is no column ", g, " in the data. Thus, we are not able to find the group identifier variable.")) + } + if(!y %in% colnames_df){ + stop(paste0("There is no column ", y, " in the data. Thus, we are not able to find the outcome variable.")) + } + + # Sanity checks + if(i %in% c("g", "t", "y" )){ + stop(paste0("Unit identifier cannot be labeled g, t, or y")) + } + + if(t %in% c("i","y", "g" )){ + stop(paste0("Time identifier cannot be labeled i, g, or y")) + } + + if(g %in% c("i", "t" ,"y" )){ + stop(paste0("Group identifier cannot be labeled i, t, or y")) + } + + + # Re-label i, t, g, y + if(i != "i"){ + df[,"i"] <- df[,i] + } + + if(t != "t"){ + df[, "t"] <- df[,t] + } + + if(g != "g"){ + df[, "g"] <- df[,g] + } + + if(y != "y"){ + df[, "y"] <- df[,y] + } + return(df) +} + + #' @useDynLib staggered #' @importFrom magrittr "%>%" +#' @import Rcpp #' @title Calculate the efficient adjusted estimator in staggered rollout designs #' @description This functions calculates the efficient estimator for staggered rollout designs proposed by Roth and Sant'Anna. #' @param df A data frame containing panel data with the variables y (an outcome), i (an individual identifier), t (the period in which the outcome is observe), g (the period in which i is first treated, with Inf denoting never treated) @@ -832,6 +898,9 @@ calculate_full_vcv <- function(eventPlotResultsList, resultsDF){ #' @param return_full_vcv If this is true and estimand = "eventstudy", then the function returns a list containing the full variance-covariance matrix for the event-plot estimates in addition to the usual dataframe with the estimates #' @param return_matrix_list If true, the function returns a list of the A_0_list and A_theta_list matrices along with betastar. This is used for internal recursive calls to calculate the variance-covariance matrix, and will generally not be needed by the end-user. Default is False. #' @param use_last_treated_only If true, then A_0_list and A_theta_list are created to only make comparisons with the last treated cohorts (as suggested by Sun and Abraham), rather than using not-yet-treated units as comparisons. If set to TRUE (and use_DiD_A0 = TRUE), then beta=1 corresponds with the Sun and Abraham estimator. +#' @param compute_fisher If true, computes a Fisher Randomization Test using the studentized estimator. +#' @param num_fisher_permutations The number of permutations to use in the Fisher Randomization Test (if compute_fisher = TRUE). Default is 500. +#' @param skip_data_check If true, skips checks that the data is balanced and contains the colums i,t,g,y. Used in internal recursive calls to increase speed, but not recommended for end-user. #' @return resultsDF A data.frame containing: estimate (the point estimate), se (the standard error), and se_neyman (the Neyman standard error). If a vector-valued eventTime is provided, the data.frame contains multiple rows for each eventTime and an eventTime column. If return_full_vcv = TRUE and estimand = "eventstudy", the function returns a list containing resultsDF and the full variance covariance for the event-study estimates (vcv) as well as the Neyman version of the covariance matrix (vcv_neyman). (If return_matrix_list = TRUE, it likewise returns a list containing lists of matrices used in the vcv calculation.) #' @references #' \cite{Roth, Jonatahan, and Sant'Anna, Pedro H. C. (2021), @@ -892,56 +961,23 @@ staggered <- function(df, FALSE), return_full_vcv = FALSE, return_matrix_list = FALSE, - use_last_treated_only = FALSE){ - - # Let's make sure we have columns with name i, t, y and g - colnames_df <- colnames(df) - if(!i %in% colnames_df){ - stop(paste0("There is no column ", i, " in the data. Thus, we are not able to find the unit identifier variable.")) - } - if(!t %in% colnames_df){ - stop(paste0("There is no column ", t, " in the data. Thus, we are not able to find the time identifier variable.")) - } - if(!g %in% colnames_df){ - stop(paste0("There is no column ", g, " in the data. Thus, we are not able to find the group identifier variable.")) - } - if(!y %in% colnames_df){ - stop(paste0("There is no column ", y, " in the data. Thus, we are not able to find the outcome variable.")) - } - - # Sanity checks - if(i %in% c("g", "t", "y" )){ - stop(paste0("Unit identifier cannot be labeled g, t, or y")) - } + use_last_treated_only = FALSE, + compute_fisher = FALSE, + num_fisher_permutations = 500, + skip_data_check = FALSE){ - if(t %in% c("i","y", "g" )){ - stop(paste0("Time identifier cannot be labeled i, g, or y")) - } - - if(g %in% c("i", "t" ,"y" )){ - stop(paste0("Group identifier cannot be labeled i, t, or y")) - } - - - # Re-label i, t, g, y - if(i != "i"){ - df[,"i"] <- df[,i] - } - - if(t != "t"){ - df[, "t"] <- df[,t] + #Process the inputted df by checking the inputted columns and renaming to i,t,g,y, and balancing on (i,t) + #We skip this if skip_data_check = TRUE + if(!skip_data_check){ + df <- processDF(df, + i=i, + g=g, + t=t, + y=y) + #Balance the panel (and throw a warning if original panel is unbalanced) + df <- balance_df(df = df) } - if(g != "g"){ - df[, "g"] <- df[,g] - } - - if(y != "y"){ - df[, "y"] <- df[,y] - } - - #Balance the panel (and throw a warning if original panel is unbalanced) - df <- balance_df(df) # If estimand is provided, force to be lower-case (allowing for non-case sensitive inputs) if(!is.null(estimand)){ @@ -966,7 +1002,9 @@ staggered <- function(df, beta = beta, use_DiD_A0 = use_DiD_A0, return_matrix_list = TRUE, - use_last_treated_only = use_last_treated_only)) + use_last_treated_only = use_last_treated_only, + compute_fisher = compute_fisher, + skip_data_check = T)) resultsDF <- purrr::reduce(.x = purrr::map(.x = eventPlotResultsList, .f = ~ .x$resultsDF), .f = dplyr::bind_rows) @@ -989,7 +1027,7 @@ staggered <- function(df, return(resultsDF) } } - g_level_summaries <- compute_g_level_summaries(df) + g_level_summaries <- compute_g_level_summaries(df, is_balanced = TRUE) Ybar_g_list <- g_level_summaries$Ybar_g_List S_g_list <- g_level_summaries$S_g_List N_g_list <- g_level_summaries$N_g_List @@ -1072,6 +1110,9 @@ staggered <- function(df, ) # Xvar_list <- purrr::pmap(.l = list(A_0_list, S_g_list, N_g_list) , .f = function(A0,S,N){ return(1/N * A0 %*%S %*% base::t(A0) ) } ) + #Save the user-inputted beta (used in FRT call) + user_input_beta <- beta + if(is.null(beta)){ beta <- compute_Betastar(Ybar_g_list, A_theta_list, @@ -1118,6 +1159,72 @@ staggered <- function(df, se = se, se_neyman = se_neyman) + ## Do FRT, if specified + permuteTreatment <- function(df,i_g_table, seed){ + #This function takes a data.frame with columns i and g, and permutes the values of g assigned to i + # The input i_g_table has the unique combinations of (i,g) in df, and is calculated outside for speed improvements + + #Draw a random permutation of the elements of first_period_df + set.seed(seed) + n = NROW(i_g_table) + randIndex <- + sample.int(n = n, + size = n, + replace = F) + + #Replace first_period_df$g with a permuted version based on randIndex + i_g_table$g <- i_g_table$g[randIndex] + + #Merge the new treatment assignments back with the original + df$g <- NULL + df <- dplyr::left_join(df, + i_g_table, + by = c("i")) + + return(df) + } + + if(compute_fisher){ + + #Find unique pairs of (i,g). This will be used for computing the permutations + # i_g_table <- df %>% + # dplyr::filter(t == min(t)) %>% + # dplyr::select(i,g) + + i_g_table <- df %>% + dplyr::filter(t == min(t)) + i_g_table <- i_g_table[,c("i","g")] + + #Now, we compute the FRT for each seed, permuting treatment for each one + #We catch any errors in the FRT simulations, and throw a warning if at least one has an error (using the remaining draws to calculate frt) + FRTResults <- + purrr::map(.x = 1:num_fisher_permutations, + .f = purrr::possibly( + .f =~ staggered::staggered(df = permuteTreatment(df, i_g_table, seed = .x), + estimand = NULL, + beta = user_input_beta, + A_theta_list = A_theta_list, + A_0_list = A_0_list, + eventTime = eventTime, + return_full_vcv = F, + return_matrix_list = F, + compute_fisher = F, + skip_data_check = T) %>% mutate(seed = .x), + otherwise = NULL) + ) %>% + purrr::discard(base::is.null) %>% + purrr::reduce(.f = dplyr::bind_rows) + + successful_frt_draws <- NROW(FRTResults) + if(successful_frt_draws < num_fisher_permutations){ + warning("There was an error in at least one of the FRT simulations. Removing the problematic draws.") + } + + resultsDF$fisher_pval <- mean( abs(resultsDF$estimate/resultsDF$se) < abs(FRTResults$estimate/FRTResults$se) ) + resultsDF$fisher_pval_se_neyman <- mean( abs(resultsDF$estimate/resultsDF$se_neyman) < abs(FRTResults$estimate/FRTResults$se_neyman) ) + resultsDF$num_fisher_permutations <- successful_frt_draws + + } if(!return_matrix_list){ #If return_matrix_list is not specified, then we return results DF unless return_full_vcv = TRUE if(!return_full_vcv){ @@ -1164,6 +1271,10 @@ staggered <- function(df, #' @param eventTime If using estimand = "eventstudy", specify what eventTime you want the event-study parameter for. The default is 0, the period in which treatment occurs. If a vector is provided, estimates are returned for all the event-times in the vector. #' @param return_full_vcv If this is true and estimand = "eventstudy", then the function returns a list containing the full variance-covariance matrix for the event-plot estimates in addition to the usual dataframe with the estimates #' @param return_matrix_list If true, the function returns a list of the A_0_list and A_theta_list matrices along with betastar. This is used for internal recursive calls to calculate the variance-covariance matrix, and will generally not be needed by the end-user. Default is False. +#' @param compute_fisher If true, computes a Fisher Randomization Test using the studentized estimator. +#' @param num_fisher_permutations The number of permutations to use in the Fisher Randomization Test (if compute_fisher = TRUE). Default is 500. +#' @param skip_data_check If true, skips checks that the data is balanced and contains the colums i,t,g,y. Used in internal recursive calls to increase speed, but not recommended for end-user. + #' @return resultsDF A data.frame containing: estimate (the point estimate), se (the standard error), and se_neyman (the Neyman standard error). If a vector-valued eventTime is provided, the data.frame contains multiple rows for each eventTime and an eventTime column. If return_full_vcv = TRUE and estimand = "eventstudy", the function returns a list containing resultsDF and the full variance covariance for the event-study estimates (vcv) as well as the Neyman version of the covariance matrix (vcv_neyman). (If return_matrix_list = TRUE, it likewise returns a list containing lists of matrices used in the vcv calculation.) #' @references #' \cite{Callaway, Brantly, and Sant'Anna, Pedro H. C. (2020), @@ -1205,56 +1316,21 @@ staggered_cs <- function(df, A_0_list = NULL, eventTime = 0, return_full_vcv = FALSE, - return_matrix_list = FALSE){ - - # Let's make sure we have columns with name i, t, y and g - colnames_df <- colnames(df) - if(!i %in% colnames_df){ - stop(paste0("There is no column ", i, " in the data. Thus, we are not able to find the unit identifier variable.")) - } - if(!t %in% colnames_df){ - stop(paste0("There is no column ", t, " in the data. Thus, we are not able to find the time identifier variable.")) - } - if(!g %in% colnames_df){ - stop(paste0("There is no column ", g, " in the data. Thus, we are not able to find the group identifier variable.")) - } - if(!y %in% colnames_df){ - stop(paste0("There is no column ", y, " in the data. Thus, we are not able to find the outcome variable.")) - } - - # Sanity checks - if(i %in% c("g", "t", "y" )){ - stop(paste0("Unit identifier cannot be labeled g, t, or y")) - } - - if(t %in% c("i","y", "g" )){ - stop(paste0("Time identifier cannot be labeled i, g, or y")) - } - - if(g %in% c("i", "t" ,"y" )){ - stop(paste0("Group identifier cannot be labeled i, t, or y")) - } - + return_matrix_list = FALSE, + compute_fisher = FALSE, + num_fisher_permutations = 500, + skip_data_check = FALSE){ - # Re-label i, t, g, y - if(i != "i"){ - df[,"i"] <- df[,i] + if(!skip_data_check){ + df <- processDF(df, + i=i, + g=g, + t=t, + y=y) + #Balance the panel (and throw a warning if original panel is unbalanced) + df <- balance_df(df = df) } - if(t != "t"){ - df[, "t"] <- df[,t] - } - - if(g != "g"){ - df[, "g"] <- df[,g] - } - - if(y != "y"){ - df[, "y"] <- df[,y] - } - - #Balance the panel (and throw a warning if original panel is unbalanced) - df <- balance_df(df) #Drop units who has g= < min(t), since ATT(t,g) is not identified for these units TreatedBeforeMinT <- df$g <= min(df$t) @@ -1271,7 +1347,10 @@ staggered_cs <- function(df, use_DiD_A0 = TRUE, use_last_treated_only = FALSE, return_full_vcv = return_full_vcv, - return_matrix_list = return_matrix_list) + return_matrix_list = return_matrix_list, + compute_fisher = compute_fisher, + num_fisher_permutations = num_fisher_permutations, + skip_data_check = skip_data_check) return(results) @@ -1293,6 +1372,10 @@ staggered_cs <- function(df, #' @param eventTime If using estimand = "eventstudy", specify what eventTime you want the event-study parameter for. The default is 0, the period in which treatment occurs. If a vector is provided, estimates are returned for all the event-times in the vector. #' @param return_full_vcv If this is true and estimand = "eventstudy", then the function returns a list containing the full variance-covariance matrix for the event-plot estimates in addition to the usual dataframe with the estimates #' @param return_matrix_list If true, the function returns a list of the A_0_list and A_theta_list matrices along with betastar. This is used for internal recursive calls to calculate the variance-covariance matrix, and will generally not be needed by the end-user. Default is False. +#' @param compute_fisher If true, computes a Fisher Randomization Test using the studentized estimator. +#' @param num_fisher_permutations The number of permutations to use in the Fisher Randomization Test (if compute_fisher = TRUE). Default is 500. +#' @param skip_data_check If true, skips checks that the data is balanced and contains the colums i,t,g,y. Used in internal recursive calls to increase speed, but not recommended for end-user. + #' @return resultsDF A data.frame containing: estimate (the point estimate), se (the standard error), and se_neyman (the Neyman standard error). If a vector-valued eventTime is provided, the data.frame contains multiple rows for each eventTime and an eventTime column. If return_full_vcv = TRUE and estimand = "eventstudy", the function returns a list containing resultsDF and the full variance covariance for the event-study estimates (vcv) as well as the Neyman version of the covariance matrix (vcv_neyman). (If return_matrix_list = TRUE, it likewise returns a list containing lists of matrices used in the vcv calculation.) #' @references #' \cite{Sun, Liyang, and Abraham, Sarah (2020), @@ -1334,57 +1417,22 @@ staggered_sa <- function(df, A_0_list = NULL, eventTime = 0, return_full_vcv = FALSE, - return_matrix_list = FALSE){ + return_matrix_list = FALSE, + compute_fisher = FALSE, + num_fisher_permutations = 500, + skip_data_check = FALSE){ - # Let's make sure we have columns with name i, t, y and g - colnames_df <- colnames(df) - if(!i %in% colnames_df){ - stop(paste0("There is no column ", i, " in the data. Thus, we are not able to find the unit identifier variable.")) - } - if(!t %in% colnames_df){ - stop(paste0("There is no column ", t, " in the data. Thus, we are not able to find the time identifier variable.")) - } - if(!g %in% colnames_df){ - stop(paste0("There is no column ", g, " in the data. Thus, we are not able to find the group identifier variable.")) - } - if(!y %in% colnames_df){ - stop(paste0("There is no column ", y, " in the data. Thus, we are not able to find the outcome variable.")) - } - # Sanity checks - if(i %in% c("g", "t", "y" )){ - stop(paste0("Unit identifier cannot be labeled g, t, or y")) + if(!skip_data_check){ + df <- processDF(df, + i=i, + g=g, + t=t, + y=y) + #Balance the panel (and throw a warning if original panel is unbalanced) + df <- balance_df(df = df) } - if(t %in% c("i","y", "g" )){ - stop(paste0("Time identifier cannot be labeled i, g, or y")) - } - - if(g %in% c("i", "t" ,"y" )){ - stop(paste0("Group identifier cannot be labeled i, t, or y")) - } - - - # Re-label i, t, g, y - if(i != "i"){ - df[,"i"] <- df[,i] - } - - if(t != "t"){ - df[, "t"] <- df[,t] - } - - if(g != "g"){ - df[, "g"] <- df[,g] - } - - if(y != "y"){ - df[, "y"] <- df[,y] - } - - #Balance the panel (and throw a warning if original panel is unbalanced) - df <- balance_df(df) - #Drop units who has g= < min(t), since ATT(t,g) is not identified for these units TreatedBeforeMinT <- df$g <= min(df$t) if(sum(TreatedBeforeMinT) > 0 ){ @@ -1400,7 +1448,10 @@ staggered_sa <- function(df, use_DiD_A0 = TRUE, use_last_treated_only = TRUE, return_full_vcv = return_full_vcv, - return_matrix_list = return_matrix_list) + return_matrix_list = return_matrix_list, + compute_fisher = compute_fisher, + num_fisher_permutations = num_fisher_permutations, + skip_data_check = skip_data_check) return(results) diff --git a/README.Rmd b/README.Rmd index 874e2d6..0d66826 100644 --- a/README.Rmd +++ b/README.Rmd @@ -146,9 +146,11 @@ We also provide a Stata implementation (staggered_stata) via the RCall package, To install the staggered_stata package, the user first needs to install the github and RCall packages. This can be done with the following commands > net install github, from("https://haghish.github.io/github/") - > github install haghish/rcall, stable + > github install haghish/rcall, version("2.5.0") -Note that the user must have R installed before installing the rcall package. The latest version of R can be downloaded [here](https://cloud.r-project.org/). The staggered_stata package can then be installed with +**Important note:** the staggered_stata package was built under Rcall version 2.5.0, and the recent release of RCall version 3.0 has created some compatibility issues. We will try our best to fix these issues, but in the meantime it is best to install version 2.5.0, as in the command above. + +Note that the user must have R installed before installing the RCall package. The latest version of R can be downloaded [here](https://cloud.r-project.org/). The staggered_stata package can then be installed with > github install jonathandroth/staggered_stata @@ -164,4 +166,4 @@ The syntax for staggered_stata is very similar to that for the staggered R packa ![Stata examples.](man/figures/Stata_screenshot.png) -The staggered, staggered_cs, and staggered_as commands all return a vector of coefficients and covariance matrix in ereturn list, and thus can be used with any post-estimation command in Stata (e.g. coefplot). +The staggered, staggered_cs, and staggered_as commands all return a vector of coefficients and covariance matrix in ereturn list, and thus can be used with any post-estimation command in Stata (e.g. coefplot). diff --git a/README.md b/README.md index 6cbbc7e..d545e7c 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,7 @@ # staggered + The staggered R package computes the efficient estimator for settings @@ -45,7 +46,7 @@ for modifying and plotting the results. ``` r library(staggered) #load the staggered package library(dplyr) #load dplyr for data manipulation -#> Warning: package 'dplyr' was built under R version 4.0.2 +#> Warning: package 'dplyr' was built under R version 4.0.5 #> #> Attaching package: 'dplyr' #> The following objects are masked from 'package:stats': @@ -55,7 +56,7 @@ library(dplyr) #load dplyr for data manipulation #> #> intersect, setdiff, setequal, union library(ggplot2) #load ggplot2 for plotting the results -#> Warning: package 'ggplot2' was built under R version 4.0.2 +#> Warning: package 'ggplot2' was built under R version 4.0.5 library(purrr) df <- staggered::pj_officer_level_balanced #load the officer data @@ -204,9 +205,15 @@ the github and RCall packages. This can be done with the following commands > net install github, from("https://haghish.github.io/github/") - > github install haghish/rcall, stable + > github install haghish/rcall, version("2.5.0") + +**Important note:** the staggered\_stata package was built under Rcall +version 2.5.0, and the recent release of RCall version 3.0 has created +some compatibility issues. We will try our best to fix these issues, but +in the meantime it is best to install version 2.5.0, as in the command +above. -Note that the user must have R installed before installing the rcall +Note that the user must have R installed before installing the RCall package. The latest version of R can be downloaded [here](https://cloud.r-project.org/). The staggered\_stata package can then be installed with diff --git a/man/compute_g_level_summaries.Rd b/man/compute_g_level_summaries.Rd index dcf7cba..b68e2c9 100644 --- a/man/compute_g_level_summaries.Rd +++ b/man/compute_g_level_summaries.Rd @@ -4,10 +4,12 @@ \alias{compute_g_level_summaries} \title{Calculate group level summary statistics} \usage{ -compute_g_level_summaries(df) +compute_g_level_summaries(df, is_balanced = TRUE) } \arguments{ \item{df}{A data frame containing panel data with the variables y (an outcome), i (an individual identifier), t (the period in which the outcome is observe), g (the period in which i is first treated, with Inf denoting never treated)} + +\item{is_balanced}{If true, the df has previously been balanced so this does not need to be done internally.} } \value{ Y_bar_list A list of the means of the outcomes for each cohort g diff --git a/man/figures/README-unnamed-chunk-7-1.png b/man/figures/README-unnamed-chunk-7-1.png index c2b024c..8915d77 100644 Binary files a/man/figures/README-unnamed-chunk-7-1.png and b/man/figures/README-unnamed-chunk-7-1.png differ diff --git a/man/staggered.Rd b/man/staggered.Rd index cd45f86..bda6504 100644 --- a/man/staggered.Rd +++ b/man/staggered.Rd @@ -18,7 +18,10 @@ staggered( use_DiD_A0 = ifelse(is.null(A_0_list), TRUE, FALSE), return_full_vcv = FALSE, return_matrix_list = FALSE, - use_last_treated_only = FALSE + use_last_treated_only = FALSE, + compute_fisher = FALSE, + num_fisher_permutations = 500, + skip_data_check = FALSE ) } \arguments{ @@ -49,6 +52,12 @@ staggered( \item{return_matrix_list}{If true, the function returns a list of the A_0_list and A_theta_list matrices along with betastar. This is used for internal recursive calls to calculate the variance-covariance matrix, and will generally not be needed by the end-user. Default is False.} \item{use_last_treated_only}{If true, then A_0_list and A_theta_list are created to only make comparisons with the last treated cohorts (as suggested by Sun and Abraham), rather than using not-yet-treated units as comparisons. If set to TRUE (and use_DiD_A0 = TRUE), then beta=1 corresponds with the Sun and Abraham estimator.} + +\item{compute_fisher}{If true, computes a Fisher Randomization Test using the studentized estimator.} + +\item{num_fisher_permutations}{The number of permutations to use in the Fisher Randomization Test (if compute_fisher = TRUE). Default is 500.} + +\item{skip_data_check}{If true, skips checks that the data is balanced and contains the colums i,t,g,y. Used in internal recursive calls to increase speed, but not recommended for end-user.} } \value{ resultsDF A data.frame containing: estimate (the point estimate), se (the standard error), and se_neyman (the Neyman standard error). If a vector-valued eventTime is provided, the data.frame contains multiple rows for each eventTime and an eventTime column. If return_full_vcv = TRUE and estimand = "eventstudy", the function returns a list containing resultsDF and the full variance covariance for the event-study estimates (vcv) as well as the Neyman version of the covariance matrix (vcv_neyman). (If return_matrix_list = TRUE, it likewise returns a list containing lists of matrices used in the vcv calculation.) diff --git a/man/staggered_cs.Rd b/man/staggered_cs.Rd index df0eb7f..3657cb7 100644 --- a/man/staggered_cs.Rd +++ b/man/staggered_cs.Rd @@ -15,7 +15,10 @@ staggered_cs( A_0_list = NULL, eventTime = 0, return_full_vcv = FALSE, - return_matrix_list = FALSE + return_matrix_list = FALSE, + compute_fisher = FALSE, + num_fisher_permutations = 500, + skip_data_check = FALSE ) } \arguments{ @@ -40,6 +43,12 @@ staggered_cs( \item{return_full_vcv}{If this is true and estimand = "eventstudy", then the function returns a list containing the full variance-covariance matrix for the event-plot estimates in addition to the usual dataframe with the estimates} \item{return_matrix_list}{If true, the function returns a list of the A_0_list and A_theta_list matrices along with betastar. This is used for internal recursive calls to calculate the variance-covariance matrix, and will generally not be needed by the end-user. Default is False.} + +\item{compute_fisher}{If true, computes a Fisher Randomization Test using the studentized estimator.} + +\item{num_fisher_permutations}{The number of permutations to use in the Fisher Randomization Test (if compute_fisher = TRUE). Default is 500.} + +\item{skip_data_check}{If true, skips checks that the data is balanced and contains the colums i,t,g,y. Used in internal recursive calls to increase speed, but not recommended for end-user.} } \value{ resultsDF A data.frame containing: estimate (the point estimate), se (the standard error), and se_neyman (the Neyman standard error). If a vector-valued eventTime is provided, the data.frame contains multiple rows for each eventTime and an eventTime column. If return_full_vcv = TRUE and estimand = "eventstudy", the function returns a list containing resultsDF and the full variance covariance for the event-study estimates (vcv) as well as the Neyman version of the covariance matrix (vcv_neyman). (If return_matrix_list = TRUE, it likewise returns a list containing lists of matrices used in the vcv calculation.) diff --git a/man/staggered_sa.Rd b/man/staggered_sa.Rd index 6fe0c93..3a0aeb2 100644 --- a/man/staggered_sa.Rd +++ b/man/staggered_sa.Rd @@ -15,7 +15,10 @@ staggered_sa( A_0_list = NULL, eventTime = 0, return_full_vcv = FALSE, - return_matrix_list = FALSE + return_matrix_list = FALSE, + compute_fisher = FALSE, + num_fisher_permutations = 500, + skip_data_check = FALSE ) } \arguments{ @@ -40,6 +43,12 @@ staggered_sa( \item{return_full_vcv}{If this is true and estimand = "eventstudy", then the function returns a list containing the full variance-covariance matrix for the event-plot estimates in addition to the usual dataframe with the estimates} \item{return_matrix_list}{If true, the function returns a list of the A_0_list and A_theta_list matrices along with betastar. This is used for internal recursive calls to calculate the variance-covariance matrix, and will generally not be needed by the end-user. Default is False.} + +\item{compute_fisher}{If true, computes a Fisher Randomization Test using the studentized estimator.} + +\item{num_fisher_permutations}{The number of permutations to use in the Fisher Randomization Test (if compute_fisher = TRUE). Default is 500.} + +\item{skip_data_check}{If true, skips checks that the data is balanced and contains the colums i,t,g,y. Used in internal recursive calls to increase speed, but not recommended for end-user.} } \value{ resultsDF A data.frame containing: estimate (the point estimate), se (the standard error), and se_neyman (the Neyman standard error). If a vector-valued eventTime is provided, the data.frame contains multiple rows for each eventTime and an eventTime column. If return_full_vcv = TRUE and estimand = "eventstudy", the function returns a list containing resultsDF and the full variance covariance for the event-study estimates (vcv) as well as the Neyman version of the covariance matrix (vcv_neyman). (If return_matrix_list = TRUE, it likewise returns a list containing lists of matrices used in the vcv calculation.) diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index be7d1c2..d34fdfd 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -18,9 +18,35 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// solve_least_squares_svd +Eigen::MatrixXd solve_least_squares_svd(Eigen::MatrixXd A, Eigen::MatrixXd B); +RcppExport SEXP _staggered_solve_least_squares_svd(SEXP ASEXP, SEXP BSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< Eigen::MatrixXd >::type A(ASEXP); + Rcpp::traits::input_parameter< Eigen::MatrixXd >::type B(BSEXP); + rcpp_result_gen = Rcpp::wrap(solve_least_squares_svd(A, B)); + return rcpp_result_gen; +END_RCPP +} +// solve_least_squares_normal +Eigen::MatrixXd solve_least_squares_normal(Eigen::MatrixXd A, Eigen::MatrixXd B); +RcppExport SEXP _staggered_solve_least_squares_normal(SEXP ASEXP, SEXP BSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< Eigen::MatrixXd >::type A(ASEXP); + Rcpp::traits::input_parameter< Eigen::MatrixXd >::type B(BSEXP); + rcpp_result_gen = Rcpp::wrap(solve_least_squares_normal(A, B)); + return rcpp_result_gen; +END_RCPP +} static const R_CallMethodDef CallEntries[] = { {"_staggered_eigenMapMatMult", (DL_FUNC) &_staggered_eigenMapMatMult, 2}, + {"_staggered_solve_least_squares_svd", (DL_FUNC) &_staggered_solve_least_squares_svd, 2}, + {"_staggered_solve_least_squares_normal", (DL_FUNC) &_staggered_solve_least_squares_normal, 2}, {NULL, NULL, 0} }; diff --git a/src/code.cpp b/src/code.cpp index f6c8768..b0e3eb7 100644 --- a/src/code.cpp +++ b/src/code.cpp @@ -10,3 +10,24 @@ SEXP eigenMapMatMult(const Eigen::Map A, Eigen::Map