From ddebbfd712b947089a1a9be117b8759ab9ae4b97 Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Tue, 27 May 2025 16:05:02 -0400 Subject: [PATCH 001/203] add refy_lkup --- R/create_lkups.R | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/R/create_lkups.R b/R/create_lkups.R index add360c4..052d663d 100644 --- a/R/create_lkups.R +++ b/R/create_lkups.R @@ -175,6 +175,40 @@ create_lkups <- function(data_dir, versions) { collapse = "|"), by = .(interpolation_id)] + # ZP ADD - CREATE OBJECT: refy_lkup + #___________________________________________________________________________ + refy_lkup_path <- fs::path(data_dir, + "estimations/prod_svy_estimation.fst") + refy_lkup <- fst::read_fst(refy_lkup_path, + as.data.table = TRUE) + refy_lkup <- refy_lkup[cache_id %in% paths_ids] + refy_lkup[ , + path := { + fs::path(data_dir, + "lineup_data", + paste0(country_code, + "_", + reporting_year), + ext = "qs") |> + as.character() + } + ] + + if ("region_code" %in% names(svy_lkup)) { + refy_lkup[, + region_code := NULL] + } + refy_lkup <- joyn::joyn(x = refy_lkup, + y = countries, + by = 'country_code', + keep = "left", + reportvar = FALSE) + + + + #___________________________________________________________________________ + + # CREATE OBJECT: interpolation_list ---- # This is to facilitate interpolation computations @@ -521,6 +555,7 @@ create_lkups <- function(data_dir, versions) { lkup <- list( svy_lkup = svy_lkup, ref_lkup = ref_lkup, + refy_lkup = refy_lkup, dist_stats = dist_stats, pop_region = pop_region, cp_lkups = cp_lkups, From 2a60af20c6e1ecf5e4d63b1ea5bbb8dd433cebb7 Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Tue, 27 May 2025 17:03:56 -0400 Subject: [PATCH 002/203] use unique version of refy df for refy --- R/create_lkups.R | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/R/create_lkups.R b/R/create_lkups.R index 052d663d..60ca600e 100644 --- a/R/create_lkups.R +++ b/R/create_lkups.R @@ -178,10 +178,12 @@ create_lkups <- function(data_dir, versions) { # ZP ADD - CREATE OBJECT: refy_lkup #___________________________________________________________________________ refy_lkup_path <- fs::path(data_dir, - "estimations/prod_svy_estimation.fst") + "estimations/prod_refy_estimation.fst") + + # NOTE: THIS `prod_refy_estimation.fst` is the refy table refy_lkup <- fst::read_fst(refy_lkup_path, as.data.table = TRUE) - refy_lkup <- refy_lkup[cache_id %in% paths_ids] + print(dim(refy_lkup)) refy_lkup[ , path := { fs::path(data_dir, @@ -194,17 +196,20 @@ create_lkups <- function(data_dir, versions) { } ] - if ("region_code" %in% names(svy_lkup)) { + print(dim(refy_lkup)) + if ("region_code" %in% names(refy_lkup)) { refy_lkup[, region_code := NULL] } + refy_lkup <- joyn::joyn(x = refy_lkup, y = countries, by = 'country_code', keep = "left", - reportvar = FALSE) - + reportvar = FALSE, + match_type = "m:1") + print(dim(refy_lkup)) #___________________________________________________________________________ From 1eeaa0fa61dd92f31eb8498d45dd9f90bd3f252f Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Wed, 11 Jun 2025 17:57:03 -0400 Subject: [PATCH 003/203] rm print --- R/create_lkups.R | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/R/create_lkups.R b/R/create_lkups.R index 60ca600e..a1aa7e6c 100644 --- a/R/create_lkups.R +++ b/R/create_lkups.R @@ -180,10 +180,10 @@ create_lkups <- function(data_dir, versions) { refy_lkup_path <- fs::path(data_dir, "estimations/prod_refy_estimation.fst") - # NOTE: THIS `prod_refy_estimation.fst` is the refy table + # NOTE: THIS `prod_refy_estimation.fst` is the refy table (not really) refy_lkup <- fst::read_fst(refy_lkup_path, as.data.table = TRUE) - print(dim(refy_lkup)) + refy_lkup[ , path := { fs::path(data_dir, @@ -196,21 +196,18 @@ create_lkups <- function(data_dir, versions) { } ] - print(dim(refy_lkup)) if ("region_code" %in% names(refy_lkup)) { refy_lkup[, region_code := NULL] } - refy_lkup <- joyn::joyn(x = refy_lkup, - y = countries, - by = 'country_code', - keep = "left", - reportvar = FALSE, + refy_lkup <- joyn::joyn(x = refy_lkup, + y = countries, + by = 'country_code', + keep = "left", + reportvar = FALSE, match_type = "m:1") - print(dim(refy_lkup)) - #___________________________________________________________________________ From 78ff6a5d04e4a3d0009e5c49be38bbec9d7664e1 Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Wed, 11 Jun 2025 18:01:32 -0400 Subject: [PATCH 004/203] testing fg_pip pipload additions --- R/fg_pip.R | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 95 insertions(+), 3 deletions(-) diff --git a/R/fg_pip.R b/R/fg_pip.R index 364d029e..2667e098 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -18,6 +18,7 @@ fg_pip <- function(country, interpolation_list <- lkup$interpolation_list data_dir <- lkup$data_root ref_lkup <- lkup$ref_lkup + refy_lkup <- lkup$refy_lkup cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb") # fg_pip is called from multiple places like pip, pip_grp_logic. We have connection object created @@ -49,12 +50,103 @@ fg_pip <- function(country, popshare = popshare) setDT(metadata) - # Return empty dataframe if no metadata is found - if (nrow(metadata) == 0) { - return(list(main_data = pipapi::empty_response_fg, data_in_cache = data_present_in_master)) + # if (nrow(metadata) == 0) { + # return(list(main_data = pipapi::empty_response_fg, data_in_cache = data_present_in_master)) + # } + + + # ZP Add: load refy data + #------------------------- + # Extract unique combinations of country-year + if (any(c("ALL", "WLD") %in% country)) { + cntry <- ref_lkup$country_code |> + unique() + } else { + cntry <- refy_lkup[country_code %in% country, + .(country_code)] |> + funique() + } + if (any(c("ALL") %in% year)) { + yr <- ref_lkup$reporting_year |> + unique() + } else { + yr <- refy_lkup[reporting_year %in% year, + .(reporting_year)] |> + funique() } + lt <- + pipload::load_list_refy(input_list = list(country_code = cntry, + year = yr), + path = fs::path(data_dir, + "lineup_data")) + lt <- lapply(lt, + FUN = \(x) { + x <- x |> + pipload::attr_to_column("reporting_level_rows") |> # only rep level???? + pipload::attr_to_column("country_code") |> + pipload::attr_to_column("reporting_year") |> + fmutate(file = paste0(country_code, + "_", + reporting_year)) + + if ("welfare_refy" %in% names(x)) { + setnames(x, + old = c("welfare_refy", + "weight_refy"), + new = c("welfare", + "weight")) + } + + x + }) + + rlang::env_poke(env = globalenv(), + nm = "pipload_list", + value = lt) + + # ZP Add: do fgt estimations using `res <- lapply(lt, process_dt, povline = povline)` + #------------------------- + res <- lapply(lt, process_dt, povline = povline) + res <- rbindlist(res, fill = TRUE) + + # TO BE REMOVED, ONLY FOR TESTING!!! + rlang::env_poke(env = globalenv(), + nm = "res_povest", + value = res) + + # ZP Add: join to metadata + #------------------------- + metadata[, + file := basename(path)] + + out <- join(res, + metadata, + on = c("file", + "reporting_level"), + how = "full", + validate = "m:1", + verbose = 0) + + out[, `:=`( + #mean = survey_mean_ppp, + #median = survey_median_ppp, + file = NULL + )] + + setnames(out, + "povline", + "poverty_line") + + + + + + + + + unique_survey_files <- unique(metadata$data_interpolation_id) # Interpolation list From 0ee3477894f908b91f1066a4f3ab3452d38b1f1c Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Wed, 11 Jun 2025 18:39:26 -0400 Subject: [PATCH 005/203] add more error messages --- R/duckdb_func.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/R/duckdb_func.R b/R/duckdb_func.R index 70779668..8dc4dc13 100644 --- a/R/duckdb_func.R +++ b/R/duckdb_func.R @@ -263,8 +263,11 @@ connect_with_retry <- function(db_path = NULL, message("Attempt ", attempt, " failed: ", conditionMessage(e)) } + # if (attempt == max_attempts) { + # stop("Failed to connect after ", max_attempts, " attempts.") + # } if (attempt == max_attempts) { - stop("Failed to connect after ", max_attempts, " attempts.") + stop("Failed to connect after ", max_attempts, " attempts.\nLast error: ", conditionMessage(e)) } Sys.sleep(delay_sec) attempt <<- attempt + 1 From 33134eb11a25387f9c11388a55ae0946148cd001 Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Tue, 17 Jun 2025 11:55:48 -0400 Subject: [PATCH 006/203] temp store metadata --- R/fg_pip.R | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/R/fg_pip.R b/R/fg_pip.R index 2288b6b3..cc5a7ecf 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -38,8 +38,7 @@ fg_pip <- function(country, data_dir = data_dir, povline = povline, cache_file_path = cache_file_path, - fill_gaps = TRUE - ) + fill_gaps = TRUE) data_present_in_master <- metadata$data_present_in_master povline <- metadata$povline @@ -51,9 +50,10 @@ fg_pip <- function(country, setDT(metadata) # Return empty dataframe if no metadata is found - # if (nrow(metadata) == 0) { - # return(list(main_data = pipapi::empty_response_fg, data_in_cache = data_present_in_master)) - # } + if (nrow(metadata) == 0) { + return(list(main_data = pipapi::empty_response_fg, + data_in_cache = data_present_in_master)) + } # ZP Add: load refy data @@ -108,8 +108,11 @@ fg_pip <- function(country, # ZP Add: do fgt estimations using `res <- lapply(lt, process_dt, povline = povline)` #------------------------- - res <- lapply(lt, process_dt, povline = povline) - res <- rbindlist(res, fill = TRUE) + res <- lapply(lt, + process_dt, + povline = povline) + res <- rbindlist(res, + fill = TRUE) # TO BE REMOVED, ONLY FOR TESTING!!! rlang::env_poke(env = globalenv(), @@ -120,6 +123,10 @@ fg_pip <- function(country, #------------------------- metadata[, file := basename(path)] + # TO BE REMOVED, ONLY FOR TESTING!!! + rlang::env_poke(env = globalenv(), + nm = "metadata_check", + value = metadata) out <- join(res, metadata, From 105859b73f269ef17a897dd8dd0f7f404b0ed59e Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Tue, 17 Jun 2025 23:27:26 -0400 Subject: [PATCH 007/203] fix joining of metadata --- R/fg_pip.R | 162 ++++++++++++++--------------------------------------- 1 file changed, 42 insertions(+), 120 deletions(-) diff --git a/R/fg_pip.R b/R/fg_pip.R index cc5a7ecf..58858511 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -87,6 +87,10 @@ fg_pip <- function(country, pipload::attr_to_column("reporting_level_rows") |> # only rep level???? pipload::attr_to_column("country_code") |> pipload::attr_to_column("reporting_year") |> + pipload::attr_to_column("mean", + dist_stats = TRUE) |> + pipload::attr_to_column("median", + dist_stats = TRUE) |> fmutate(file = paste0(country_code, "_", reporting_year)) @@ -110,7 +114,8 @@ fg_pip <- function(country, #------------------------- res <- lapply(lt, process_dt, - povline = povline) + povline = povline, + mean_and_med = TRUE) res <- rbindlist(res, fill = TRUE) @@ -127,9 +132,40 @@ fg_pip <- function(country, rlang::env_poke(env = globalenv(), nm = "metadata_check", value = metadata) + # try metadata unique code + tmp_metadata <- metadata + # Handle multiple distribution types (for aggregated distributions) + if (length(unique(tmp_metadata$distribution_type)) > 1) { + tmp_metadata[, distribution_type := "mixed"] + } + # convert survey_comparability to NA + # NOTE: This should not be necessary. for the new lineup distribution + # metadata should come without this variable. + tmp_metadata[, survey_comparability := NA] + # get all vars + meta_vars <- setdiff(names(tmp_metadata), "reporting_year") + # transform to NA when necessary - i.e. when interpolated (two rows per reporting_year) + tmp_metadata[, (meta_vars) := lapply(.SD, \(x) { + if (uniqueN(x) == 1) { + x + } else { + NA + }}), + by = reporting_year, .SDcols = meta_vars] + + # Remove duplicate rows by reporting_year (keep only one row per + # reporting_year) + tmp_metadata_unique <- unique(tmp_metadata, by = "reporting_year") + tmp_metadata_unique[, + file := paste0(country_code, + "_", + reporting_year)] + rlang::env_poke(env = globalenv(), + nm = "tmp_metadata_unique_check", + value = tmp_metadata_unique) out <- join(res, - metadata, + tmp_metadata_unique, on = c("file", "reporting_level"), how = "full", @@ -142,128 +178,12 @@ fg_pip <- function(country, file = NULL )] + setnames(out, "povline", "poverty_line") - - - - - - - - unique_survey_files <- unique(metadata$data_interpolation_id) - - # Interpolation list - interpolation_list <- interpolation_list[names(interpolation_list) %in% unique_survey_files] - - # Unique set of survey data to be read - out <- vector(mode = "list", length = length(unique_survey_files)) - - #NEW: iterate over survey files - for (svy_id in seq_along(unique_survey_files)) { - # Extract country-years for which stats will be computed from the same files - # tmp_metadata <- interpolation_list[[unique_survey_files[svy_id]]]$tmp_metadata - iteration <- interpolation_list[[unique_survey_files[svy_id]]] - svy_data <- get_svy_data(svy_id = iteration$cache_ids, - reporting_level = iteration$reporting_level, - path = iteration$paths) - - # Extract unique combinations of country-year - ctry_years <- subset_ctry_years(country = country, - year = year, - lkup = iteration$ctry_years, - valid_regions = valid_regions, - data_dir = data_dir) - - # Join because some data might be coming from cache so it might be absent in - # metadata - ctry_years <- collapse::join(ctry_years, metadata |> - collapse::fselect(intersect(names(ctry_years), - names(metadata))), - verbose = 0, - how = "inner", - overid = 2) - - results_subset <- vector(mode = "list", length = nrow(ctry_years)) - - for (ctry_year_id in seq_along(ctry_years$interpolation_id)) { - # Extract records to be used for a single country-year estimation - interp_id <- ctry_years[["interpolation_id"]][ctry_year_id] - tmp_metadata <- metadata[metadata$interpolation_id == interp_id, ] - - report_year <- ctry_years[["reporting_year"]][ctry_year_id] - - # Compute estimated statistics using the fill_gap method - tmp_stats <- wbpip:::prod_fg_compute_pip_stats( - request_year = report_year, - data = svy_data, - predicted_request_mean = tmp_metadata[["predicted_mean_ppp"]], - svy_mean_lcu = tmp_metadata[["survey_mean_lcu"]], - svy_median_lcu = tmp_metadata$survey_median_lcu, - svy_median_ppp = tmp_metadata$survey_median_ppp, - survey_year = tmp_metadata[["survey_year"]], - default_ppp = tmp_metadata[["ppp"]], - ppp = ppp, - distribution_type = tmp_metadata[["distribution_type"]], - poverty_line = povline, - popshare = popshare - ) - - # Handle multiple distribution types (for aggregated distributions) - if (length(unique(tmp_metadata$distribution_type)) > 1) { - tmp_metadata[, distribution_type := "mixed"] - } - # - # tmp_metadata <- unique(tmp_metadata) - # Add stats columns to data frame - - # Convert Statas into Data.table - ts_DT <- as.data.table(tmp_stats) - # Add reporting year to merge - ts_DT[, reporting_year := report_year] - - # convert survey_comparability to NA - # NOTE: This should not be necessary. for the new lineup distribution - # metadata should come without this variable. - tmp_metadata[, survey_comparability := NA] - - # get all vars - meta_vars <- setdiff(names(tmp_metadata), "reporting_year") - # transform to NA when necessary - tmp_metadata[, (meta_vars) := lapply(.SD, \(x) { - if (uniqueN(x) == 1) { - x - } else { - NA - }}), - by = reporting_year, .SDcols = meta_vars] - - # Remove duplicate rows by reporting_year (keep only one row per - # reporting_year) - tmp_metadata_unique <- unique(tmp_metadata, by = "reporting_year") - - # Now join as usual - - ts_md <- join(ts_DT, - tmp_metadata_unique, - on = "reporting_year", - how = "left", - verbose = 0, - overid = 2) - - results_subset[[ctry_year_id]] <- ts_md - } - out[[svy_id]] <- results_subset - } - out <- unlist(out, recursive = FALSE) - out <- data.table::rbindlist(out) - - # Remove median - # out[, median := NULL] - # Ensure that out does not have duplicates out <- fg_remove_duplicates(out) @@ -278,7 +198,9 @@ fg_pip <- function(country, out[, max_year := NULL] } - return(list(main_data = out, data_in_cache = data_present_in_master)) + return(list(main_data = out, + data_in_cache = data_present_in_master)) + } #' Remove duplicated rows created during the interpolation process From 928606e7c304fecacf93478a40bbd442f2f036c6 Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Tue, 17 Jun 2025 23:27:51 -0400 Subject: [PATCH 008/203] add mean and med to compute_fgt_dt --- R/rg_pip.R | 44 ++++++++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/R/rg_pip.R b/R/rg_pip.R index 5ff8552b..019fde2e 100644 --- a/R/rg_pip.R +++ b/R/rg_pip.R @@ -95,11 +95,11 @@ rg_pip <- function(country, #' #' @return data.table with estimates poverty estimates #' @keywords internal -compute_fgt_dt <- function(dt, welfare, weight, povlines) { - w <- dt[[welfare]] - wt <- dt[[weight]] - n <- length(w) - m <- length(povlines) +compute_fgt_dt <- function(dt, welfare, weight, povlines, mean_and_med = FALSE) { + w <- dt[[welfare]] + wt <- dt[[weight]] + n <- length(w) + m <- length(povlines) # Pre-allocate result matrix res <- matrix(NA_real_, nrow = m, ncol = 3) @@ -128,18 +128,34 @@ compute_fgt_dt <- function(dt, welfare, weight, povlines) { watts_vec[i] <- 0 } } - data.table( - povline = povlines, - headcount = res[, 1], - poverty_gap = res[, 2], - poverty_severity = res[, 3], - watts = watts_vec - ) + + if (mean_and_med) { + mn <- funique(dt$mean) + med <- funique(dt$median) + out <- data.table( + povline = povlines, + headcount = res[, 1], + poverty_gap = res[, 2], + poverty_severity = res[, 3], + watts = watts_vec, + mean = mn, + median = med) + } else { + out <- data.table( + povline = povlines, + headcount = res[, 1], + poverty_gap = res[, 2], + poverty_severity = res[, 3], + watts = watts_vec) + } + + out + } -process_dt <- function(dt, povline) { - dt[, compute_fgt_dt(.SD, "welfare", "weight", povline), +process_dt <- function(dt, povline, mean_and_med = FALSE) { + dt[, compute_fgt_dt(.SD, "welfare", "weight", povline, mean_and_med), by = .(file, reporting_level)] } From 74e4093bef45ff9a51595f42344ea586bc73b079 Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Wed, 18 Jun 2025 17:27:58 -0400 Subject: [PATCH 009/203] comment out env checks --- R/fg_pip.R | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/R/fg_pip.R b/R/fg_pip.R index 58858511..36eec629 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -106,9 +106,9 @@ fg_pip <- function(country, x }) - rlang::env_poke(env = globalenv(), - nm = "pipload_list", - value = lt) + # rlang::env_poke(env = globalenv(), + # nm = "pipload_list", + # value = lt) # ZP Add: do fgt estimations using `res <- lapply(lt, process_dt, povline = povline)` #------------------------- @@ -120,18 +120,18 @@ fg_pip <- function(country, fill = TRUE) # TO BE REMOVED, ONLY FOR TESTING!!! - rlang::env_poke(env = globalenv(), - nm = "res_povest", - value = res) + # rlang::env_poke(env = globalenv(), + # nm = "res_povest", + # value = res) # ZP Add: join to metadata #------------------------- metadata[, file := basename(path)] # TO BE REMOVED, ONLY FOR TESTING!!! - rlang::env_poke(env = globalenv(), - nm = "metadata_check", - value = metadata) + # rlang::env_poke(env = globalenv(), + # nm = "metadata_check", + # value = metadata) # try metadata unique code tmp_metadata <- metadata # Handle multiple distribution types (for aggregated distributions) @@ -160,9 +160,9 @@ fg_pip <- function(country, file := paste0(country_code, "_", reporting_year)] - rlang::env_poke(env = globalenv(), - nm = "tmp_metadata_unique_check", - value = tmp_metadata_unique) + # rlang::env_poke(env = globalenv(), + # nm = "tmp_metadata_unique_check", + # value = tmp_metadata_unique) out <- join(res, tmp_metadata_unique, From 4574c8f6fcf27e310d53d9035c1ca7f28de8b385 Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Mon, 23 Jun 2025 16:34:39 -0400 Subject: [PATCH 010/203] fix full_list for file ingestion --- R/fg_pip.R | 113 ++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 91 insertions(+), 22 deletions(-) diff --git a/R/fg_pip.R b/R/fg_pip.R index 36eec629..d4038d3d 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -17,8 +17,8 @@ fg_pip <- function(country, valid_regions <- lkup$query_controls$region$values interpolation_list <- lkup$interpolation_list data_dir <- lkup$data_root - ref_lkup <- lkup$ref_lkup - refy_lkup <- lkup$refy_lkup + ref_lkup <- lkup$ref_lkup # the normal refy table, some country-years have two rows (interpolation) + refy_lkup <- lkup$refy_lkup # cleaned refy table, unique by country-years but some columns removed in order to do that cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb") # fg_pip is called from multiple places like pip, pip_grp_logic. We have connection object created @@ -33,7 +33,7 @@ fg_pip <- function(country, year = year, welfare_type = welfare_type, reporting_level = reporting_level, - lkup = ref_lkup, + lkup = ref_lkup, # only place this is used, for 'interpolation_id' valid_regions = valid_regions, data_dir = data_dir, povline = povline, @@ -51,6 +51,7 @@ fg_pip <- function(country, # Return empty dataframe if no metadata is found if (nrow(metadata) == 0) { + print("no metadata") return(list(main_data = pipapi::empty_response_fg, data_in_cache = data_present_in_master)) } @@ -59,28 +60,90 @@ fg_pip <- function(country, # ZP Add: load refy data #------------------------- # Extract unique combinations of country-year + # if (any(c("ALL", "WLD") %in% country)) { + # cntry <- refy_lkup$country_code |> + # unique() + # print("A") + # #cntry[!cntry %in% c("SSD", "SVK", "TLS", "VEN", "XKX")] # to be removed + # } else { + # cntry <- refy_lkup[country_code %in% country, + # .(country_code)] |> + # funique() + # print("B") + # } + # if (any(c("ALL") %in% year)) { + # yr <- refy_lkup$reporting_year |> + # unique() + # print("C") + # } else { + # yr <- refy_lkup[reporting_year %in% year, + # .(reporting_year)] |> + # funique() + # print("D") + # } + # + # print(as.vector(cntry)) + # print(yr) + # lt <- + # pipload::load_list_refy(input_list = list(country_code = cntry, + # year = yr), + # path = fs::path(data_dir, + # "lineup_data")) + + + #' # ZP Add: load refy data + #------------------------- + # Extract unique combinations of country-year if (any(c("ALL", "WLD") %in% country)) { - cntry <- ref_lkup$country_code |> + cntry <- refy_lkup$country_code |> unique() + print("A") + #cntry[!cntry %in% c("SSD", "SVK", "TLS", "VEN", "XKX")] # to be removed } else { cntry <- refy_lkup[country_code %in% country, .(country_code)] |> funique() + print("B") } if (any(c("ALL") %in% year)) { - yr <- ref_lkup$reporting_year |> + yr <- refy_lkup$reporting_year |> unique() + print("C") } else { yr <- refy_lkup[reporting_year %in% year, - .(reporting_year)] |> + .(reporting_year)] |> funique() + print("D") } - + dtemp <- + ref_lkup |> + fsubset(country_code %in% cntry & + reporting_year %in% yr) |> + fselect(country_code, + year = reporting_year) |> + funique() + + # Split years by country + full_list <- dtemp[, .(year = list(year)), by = country_code][ + , .(country_code, year = year) + ] + + # Convert to desired structure + full_list <- list( + country_code = full_list$country_code, + year = lapply(full_list$year, as.numeric) + ) + #return(full_list) + print(as.vector(cntry)) + print(yr) lt <- - pipload::load_list_refy(input_list = list(country_code = cntry, - year = yr), + pipload::load_list_refy(input_list = full_list, path = fs::path(data_dir, "lineup_data")) + + + + print(names(lt)) lt <- lapply(lt, FUN = \(x) { x <- x |> @@ -106,9 +169,9 @@ fg_pip <- function(country, x }) - # rlang::env_poke(env = globalenv(), - # nm = "pipload_list", - # value = lt) + rlang::env_poke(env = globalenv(), + nm = "pipload_list", + value = lt) # ZP Add: do fgt estimations using `res <- lapply(lt, process_dt, povline = povline)` #------------------------- @@ -120,18 +183,18 @@ fg_pip <- function(country, fill = TRUE) # TO BE REMOVED, ONLY FOR TESTING!!! - # rlang::env_poke(env = globalenv(), - # nm = "res_povest", - # value = res) + rlang::env_poke(env = globalenv(), + nm = "res_povest", + value = res) # ZP Add: join to metadata #------------------------- metadata[, file := basename(path)] # TO BE REMOVED, ONLY FOR TESTING!!! - # rlang::env_poke(env = globalenv(), - # nm = "metadata_check", - # value = metadata) + rlang::env_poke(env = globalenv(), + nm = "metadata_check", + value = metadata) # try metadata unique code tmp_metadata <- metadata # Handle multiple distribution types (for aggregated distributions) @@ -160,9 +223,9 @@ fg_pip <- function(country, file := paste0(country_code, "_", reporting_year)] - # rlang::env_poke(env = globalenv(), - # nm = "tmp_metadata_unique_check", - # value = tmp_metadata_unique) + rlang::env_poke(env = globalenv(), + nm = "tmp_metadata_unique_check", + value = tmp_metadata_unique) out <- join(res, tmp_metadata_unique, @@ -203,6 +266,13 @@ fg_pip <- function(country, } + +# process_dt_fg <- function(dt, povline, mean_and_med = FALSE) { +# dt[, compute_fgt_dt(.SD, "welfare", "weight", povline, mean_and_med), +# by = .(file, reporting_level)] +# } + + #' Remove duplicated rows created during the interpolation process #' #' @param df data.table: Table of results created in `fg_pip()` @@ -210,7 +280,6 @@ fg_pip <- function(country, #' #' @return data.table #' - fg_remove_duplicates <- function(df, cols = c("comparable_spell", "cpi", From e92e3b835009a6d450e7f4073b72c5b61ec9ded0 Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Mon, 23 Jun 2025 16:35:23 -0400 Subject: [PATCH 011/203] allow processing to return mean, med, country, year --- R/rg_pip.R | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/R/rg_pip.R b/R/rg_pip.R index 019fde2e..8b555753 100644 --- a/R/rg_pip.R +++ b/R/rg_pip.R @@ -132,6 +132,8 @@ compute_fgt_dt <- function(dt, welfare, weight, povlines, mean_and_med = FALSE) if (mean_and_med) { mn <- funique(dt$mean) med <- funique(dt$median) + cy <- funique(dt$coutnry_code) + ry <- funique(dt$reporting_year) out <- data.table( povline = povlines, headcount = res[, 1], @@ -139,7 +141,9 @@ compute_fgt_dt <- function(dt, welfare, weight, povlines, mean_and_med = FALSE) poverty_severity = res[, 3], watts = watts_vec, mean = mn, - median = med) + median = med, + country_code = cy, + reporting_year = ry) } else { out <- data.table( povline = povlines, From 67b1d8c1b14b5a9f157c3dd25e5999cc585ecc67 Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Tue, 5 Aug 2025 22:14:17 +0200 Subject: [PATCH 012/203] create old funcs --- R/fg_pip_old.R | 258 +++++++++++++++++++++++++++++++++++++++++++++ R/pip_zp_old.R | 281 +++++++++++++++++++++++++++++++++++++++++++++++++ R/rg_pip_old.R | 199 ++++++++++++++++++++++++++++++++++ 3 files changed, 738 insertions(+) create mode 100644 R/fg_pip_old.R create mode 100644 R/pip_zp_old.R create mode 100644 R/rg_pip_old.R diff --git a/R/fg_pip_old.R b/R/fg_pip_old.R new file mode 100644 index 00000000..b5fab88b --- /dev/null +++ b/R/fg_pip_old.R @@ -0,0 +1,258 @@ +#' Compute imputed year stats +#' +#' Compute the main PIP poverty and inequality statistics for imputed years. +#' +#' @inheritParams pip +#' @return data.frame +#' @keywords internal +fg_pip_old <- function(country, + year, + povline, + popshare, + welfare_type, + reporting_level, + ppp, + lkup) { + + valid_regions <- lkup$query_controls$region$values + interpolation_list <- lkup$interpolation_list + data_dir <- lkup$data_root + ref_lkup <- lkup$ref_lkup + + cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb") + # fg_pip is called from multiple places like pip, pip_grp_logic. We have connection object created + # when calling from `pip`. For other functions we create it here. + # if (is.null(con)) { + # cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb") + # con <- duckdb::dbConnect(duckdb::duckdb(), dbdir = cache_file_path, read_only = TRUE) + # } + # Handle interpolation + metadata <- subset_lkup( + country = country, + year = year, + welfare_type = welfare_type, + reporting_level = reporting_level, + lkup = ref_lkup, + valid_regions = valid_regions, + data_dir = data_dir, + povline = povline, + cache_file_path = cache_file_path, + fill_gaps = TRUE + ) + + data_present_in_master <- metadata$data_present_in_master + povline <- metadata$povline + metadata <- metadata$lkup + # Remove aggregate distribution if popshare is specified + # TEMPORARY FIX UNTIL popshare is supported for aggregate distributions + metadata <- filter_lkup(metadata = metadata, + popshare = popshare) + setDT(metadata) + + + # Return empty dataframe if no metadata is found + if (nrow(metadata) == 0) { + return(list(main_data = pipapi::empty_response_fg, data_in_cache = data_present_in_master)) + } + + unique_survey_files <- unique(metadata$data_interpolation_id) + + # Interpolation list + interpolation_list <- interpolation_list[names(interpolation_list) %in% unique_survey_files] + + # Unique set of survey data to be read + out <- vector(mode = "list", length = length(unique_survey_files)) + + #NEW: iterate over survey files + for (svy_id in seq_along(unique_survey_files)) { + # Extract country-years for which stats will be computed from the same files + # tmp_metadata <- interpolation_list[[unique_survey_files[svy_id]]]$tmp_metadata + iteration <- interpolation_list[[unique_survey_files[svy_id]]] + svy_data <- get_svy_data(svy_id = iteration$cache_ids, + reporting_level = iteration$reporting_level, + path = iteration$paths) + + # Extract unique combinations of country-year + ctry_years <- subset_ctry_years(country = country, + year = year, + lkup = iteration$ctry_years, + valid_regions = valid_regions, + data_dir = data_dir) + + # Join because some data might be coming from cache so it might be absent in + # metadata + ctry_years <- collapse::join(ctry_years, metadata |> + collapse::fselect(intersect(names(ctry_years), + names(metadata))), + verbose = 0, + how = "inner", + overid = 2) + + results_subset <- vector(mode = "list", length = nrow(ctry_years)) + + for (ctry_year_id in seq_along(ctry_years$interpolation_id)) { + # Extract records to be used for a single country-year estimation + interp_id <- ctry_years[["interpolation_id"]][ctry_year_id] + tmp_metadata <- metadata[metadata$interpolation_id == interp_id, ] + + report_year <- ctry_years[["reporting_year"]][ctry_year_id] + + # Compute estimated statistics using the fill_gap method + tmp_stats <- wbpip:::prod_fg_compute_pip_stats( + request_year = report_year, + data = svy_data, + predicted_request_mean = tmp_metadata[["predicted_mean_ppp"]], + svy_mean_lcu = tmp_metadata[["survey_mean_lcu"]], + svy_median_lcu = tmp_metadata$survey_median_lcu, + svy_median_ppp = tmp_metadata$survey_median_ppp, + survey_year = tmp_metadata[["survey_year"]], + default_ppp = tmp_metadata[["ppp"]], + ppp = ppp, + distribution_type = tmp_metadata[["distribution_type"]], + poverty_line = povline, + popshare = popshare + ) + + # Handle multiple distribution types (for aggregated distributions) + if (length(unique(tmp_metadata$distribution_type)) > 1) { + tmp_metadata[, distribution_type := "mixed"] + } + # + # tmp_metadata <- unique(tmp_metadata) + # Add stats columns to data frame + + # Convert Statas into Data.table + ts_DT <- as.data.table(tmp_stats) + # Add reporting year to merge + ts_DT[, reporting_year := report_year] + + # convert survey_comparability to NA + # NOTE: This should not be necessary. for the new lineup distribution + # metadata should come without this variable. + tmp_metadata[, survey_comparability := NA] + + # get all vars + meta_vars <- setdiff(names(tmp_metadata), "reporting_year") + # transform to NA when necessary + tmp_metadata[, (meta_vars) := lapply(.SD, \(x) { + if (uniqueN(x) == 1) { + x + } else { + NA + }}), + by = reporting_year, .SDcols = meta_vars] + + # Remove duplicate rows by reporting_year (keep only one row per + # reporting_year) + tmp_metadata_unique <- unique(tmp_metadata, by = "reporting_year") + + # Now join as usual + + ts_md <- join(ts_DT, + tmp_metadata_unique, + on = "reporting_year", + how = "left", + verbose = 0, + overid = 2) + + results_subset[[ctry_year_id]] <- ts_md + } + out[[svy_id]] <- results_subset + } + out <- unlist(out, recursive = FALSE) + out <- data.table::rbindlist(out) + + # Remove median + # out[, median := NULL] + + # Ensure that out does not have duplicates + out <- fg_remove_duplicates_old(out) + + # Fix issue with rounding of poverty lines + out[, + poverty_line := round(poverty_line, digits = 3) ] + + # Formatting. MUST be done in data.table tom modify by reference + out[, path := as.character(path)] + + if ("max_year" %in% names(out)) { + out[, max_year := NULL] + } + + return(list(main_data = out, data_in_cache = data_present_in_master)) +} + +#' OLD: Remove duplicated rows created during the interpolation process +#' +#' @param df data.table: Table of results created in `fg_pip()` +#' @param cols character: Columns with potential duplicate values +#' +#' @return data.table +#' +fg_remove_duplicates_old <- function(df, + cols = c("comparable_spell", + "cpi", + "display_cp", + "gd_type", + # "interpolation_id", + "path", + "predicted_mean_ppp", + "survey_acronym", + "survey_comparability", + "survey_coverage", + "survey_id", + "survey_mean_lcu", + "survey_mean_ppp", + "survey_median_lcu", + "survey_median_ppp", + "survey_time", + "survey_year", + "surveyid_year")) { + # Modify cache_id + # * Ensures that cache_id is unique for both extrapolated and interpolated surveys + # * Ensures that cache_id can be kept as an output of fg_pip() while still removing duplicated rows + df$cache_id <- fg_standardize_cache_id_old(cache_id = df$cache_id, + interpolation_id = df$data_interpolation_id, + reporting_level = df$reporting_level) + # Set collapse vars to NA (by type) + df <- fg_assign_nas_values_to_dup_cols_old(df = df, + cols = cols) + + # Ensure that out does not have duplicates + df <- unique(df) + + return(df) +} + +#' OLD: Standardize cache_id format to avoid duplication of rows +#' +#' @param cache_id character +#' @param interpolation_id character +#' @param reporting_level character +#' +#' @return character +fg_standardize_cache_id_old <- function(cache_id, + interpolation_id, + reporting_level) { + + out <- ifelse(grepl("|", interpolation_id, fixed = TRUE), + gsub(paste0("_", + unique(reporting_level), + collapse = '|'), + '', + interpolation_id), + cache_id) + return(out) +} + +#' OLD: Coerce variable causing potential duplicates to NAs +#' +#' @inheritParams fg_remove_duplicates_old +#' +#' @return data.table +fg_assign_nas_values_to_dup_cols_old <- function(df, + cols) { + #Classes are maintained by default. + df[, (cols) := NA] + return(df) +} diff --git a/R/pip_zp_old.R b/R/pip_zp_old.R new file mode 100644 index 00000000..2278ccce --- /dev/null +++ b/R/pip_zp_old.R @@ -0,0 +1,281 @@ +#' Compute PIP statistics +#' +#' Compute the main PIP poverty and inequality statistics. +#' +#' @param country character: Country ISO 3 codes +#' @param year integer: Reporting year +#' @param povline numeric: Poverty line +#' @param popshare numeric: Proportion of the population living below the +#' poverty line +#' @param fill_gaps logical: If set to TRUE, will interpolate / extrapolate +#' values for missing years +#' @param group_by character: Will return aggregated values for predefined +#' sub-groups +#' @param welfare_type character: Welfare type +#' @param reporting_level character: Geographical reporting level +#' @param ppp numeric: Custom Purchase Power Parity value +#' @param lkup list: A list of lkup tables +#' @param censor logical: Triggers censoring of country/year statistics +#' @param lkup_hash character: hash of pip +#' @param additional_ind logical: If TRUE add new set of indicators. Default if +#' FALSE +#' +#' @return data.table +#' @examples +#' \dontrun{ +#' # Create lkups +#' lkups <- create_lkups("") +#' +#' # A single country and year +#' pip(country = "AGO", +#' year = 2000, +#' povline = 1.9, +#' lkup = lkups) +#' +#' # All years for a single country +#' pip(country = "AGO", +#' year = "all", +#' povline = 1.9, +#' lkup = lkups) +#' +#' # Fill gaps +#' pip(country = "AGO", +#' year = "all", +#' povline = 1.9, +#' fill_gaps = TRUE, +#' lkup = lkups) +#' +#' # Group by regions +#' pip(country = "all", +#' year = "all", +#' povline = 1.9, +#' group_by = "wb", +#' lkup = lkups) +#' } +#' @export +pip_zp_old <- function(country = "ALL", + year = "ALL", + povline = 1.9, + popshare = NULL, + fill_gaps = FALSE, + group_by = c("none", "wb"), + welfare_type = c("all", "consumption", "income"), + reporting_level = c("all", "national", "rural", "urban"), + ppp = NULL, + lkup, + censor = TRUE, + lkup_hash = lkup$cache_data_id$hash_pip, + additional_ind = FALSE) { + + + # set up ------------- + welfare_type <- match.arg(welfare_type) + reporting_level <- match.arg(reporting_level) + group_by <- match.arg(group_by) + povline <- round(povline, digits = 3) + + + + # TEMPORARY UNTIL SELECTION MECHANISM IS BEING IMPROVED + country <- toupper(country) + if (is.character(year)) { + year <- toupper(year) + } + + # If svy_lkup is not part of lkup throw an error. + if (!all(c('svy_lkup') %in% names(lkup))) + stop("You are probably passing more than one dataset as lkup argument. + Try passing a single one by subsetting it lkup <- lkups$versions_paths$dataset_name_PROD") + + + # **** TO BE REMOVED **** REMOVAL STARTS HERE + # Once `pip-grp` has been integrated in ingestion pipeline + # Forces fill_gaps to TRUE when using group_by option + if (group_by != "none") { + fill_gaps <- TRUE + message("Info: argument group_by in pip() is deprecated; please use pip_grp() instead.") + } + # **** TO BE REMOVED **** REMOVAL ENDS HERE + + # Countries vector ------------ + lcv <- # List with countries vectors + create_countries_vctr( + country = country, + year = year, + valid_years = lkup$valid_years, + aux_files = lkup$aux_files + ) + # lcv$est_ctrs has all the country_code that we are interested in + + cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb") + if (!file.exists(cache_file_path)) { + # Create an empty duckdb file + create_duckdb_file(cache_file_path) + } + # mains estimates --------------- + if (fill_gaps) { + ## lineup years----------------- + out <- fg_pip_old( + country = lcv$est_ctrs, + year = year, + povline = povline, + popshare = popshare, + welfare_type = welfare_type, + reporting_level = reporting_level, + ppp = ppp, + lkup = lkup + ) + } else { + ## survey years ------------------ + out <- rg_pip_old( + country = lcv$est_ctrs, + year = year, + povline = povline, + popshare = popshare, + welfare_type = welfare_type, + reporting_level = reporting_level, + ppp = ppp, + lkup = lkup + ) + } + + cached_data <- out$data_in_cache + main_data <- out$main_data + + if (nrow(main_data) > 0) { + out <- main_data |> + rowbind(cached_data) + + update_master_file(main_data, cache_file_path, fill_gaps) + + } else { + out <- cached_data + } + if (!data.table::is.data.table(out)) { + setDT(out) + } + # Early return for empty table--------------- + if (nrow(out) == 0) return(pipapi::empty_response) + + # aggregate distributions ------------------ + if (reporting_level %in% c("national", "all")) { + out <- add_agg_stats( + df = out, + return_cols = lkup$return_cols$ag_average_poverty_stats + ) + if (reporting_level == "national") { + out <- out[reporting_level == "national"] + } + } + + + + add_vars_out_of_pipeline(out, fill_gaps = fill_gaps, lkup = lkup) + + # **** TO BE REMOVED **** REMOVAL STARTS HERE + # Once `pip-grp` has been integrated in ingestion pipeline + # Handles grouped aggregations + if (group_by != "none") { + # Handle potential (insignificant) difference in poverty_line values that + # may mess-up the grouping + out$poverty_line <- povline + + out <- pip_aggregate_by( + df = out, + group_lkup = lkup[["pop_region"]], + return_cols = lkup$return_cols$pip_grp + ) + # Censor regional values + if (censor) { + out <- censor_rows(out, lkup[["censored"]], type = "regions") + } + + out <- out[, c("region_name", + "region_code", + "reporting_year", + "reporting_pop", + "poverty_line", + "headcount", + "poverty_gap", + "poverty_severity", + "watts", + "mean", + "pop_in_poverty")] + + return(out) + } + # **** TO BE REMOVED **** REMOVAL ENDS HERE + + + # pre-computed distributional stats --------------- + crr_names <- names(out) # current variables + names2keep <- lkup$return_cols$pip$cols # all variables + + out <- add_dist_stats( + df = out, + dist_stats = lkup[["dist_stats"]] + ) + + # Add aggregate medians ---------------- + out <- add_agg_medians( + df = out, + fill_gaps = fill_gaps, + data_dir = lkup$data_root + ) + + # format ---------------- + + + if (fill_gaps) { + + ## Inequality indicators to NA for lineup years ---- + dist_vars <- names2keep[!(names2keep %in% crr_names)] + out[, + (dist_vars) := NA_real_] + + ## estimate_var ----- + out <- estimate_type_ctr_lnp(out, lkup) + + } else { + out[, estimate_type := NA_character_] + } + ## Handle survey coverage ------------ + if (reporting_level != "all") { + keep <- out$reporting_level == reporting_level + out <- out[keep, ] + } + + # Censor country values + if (censor) { + out <- censor_rows(out, lkup[["censored"]], type = "countries") + } + + + # Select columns + if (additional_ind) { + get_additional_indicators(out) + added_names <- attr(out, "new_indicators_names") + names2keep <- c(names2keep, added_names) + + } + # Keep relevant variables + out <- out[, .SD, .SDcols = names2keep] + + + # make sure we always report the same precision in all numeric variables + doub_vars <- + names(out)[unlist(lapply(out, is.double))] |> + data.table::copy() + + out[, (doub_vars) := lapply(.SD, round, digits = 12), + .SDcols = doub_vars] + + # Order rows by country code and reporting year + data.table::setorder(out, country_code, reporting_year, reporting_level, welfare_type) + #} + + # Make sure no duplicate remains + out <- out |> collapse::funique() + # return ------------- + return(out) +} diff --git a/R/rg_pip_old.R b/R/rg_pip_old.R new file mode 100644 index 00000000..e5f71be3 --- /dev/null +++ b/R/rg_pip_old.R @@ -0,0 +1,199 @@ +#' OLD: Compute survey year stats +#' +#' Compute the main PIP poverty and inequality statistics for survey years. +#' +#' @inheritParams pip +#' @return data.frame +#' @keywords internal +rg_pip_old <- function(country, + year, + povline, + popshare, + welfare_type, + reporting_level, + ppp, + lkup) { + # get values from lkup + valid_regions <- lkup$query_controls$region$values + svy_lkup <- lkup$svy_lkup + data_dir <- lkup$data_root + + cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb") + + metadata <- subset_lkup( + country = country, + year = year, + welfare_type = welfare_type, + reporting_level = reporting_level, + lkup = svy_lkup, + valid_regions = valid_regions, + data_dir = data_dir, + povline = povline, + cache_file_path = cache_file_path, + fill_gaps = FALSE + ) + + data_present_in_master <- metadata$data_present_in_master + povline <- metadata$povline + metadata <- metadata$lkup + + + # Remove aggregate distribution if popshare is specified + # TEMPORARY FIX UNTIL popshare is supported for aggregate distributions + metadata <- filter_lkup(metadata = metadata, + popshare = popshare) + + # return empty dataframe if no metadata is found + if (nrow(metadata) == 0) { + return(list(main_data = pipapi::empty_response, + data_in_cache = data_present_in_master)) + } + + # load data + lt <- load_data_list_old(metadata) + + # parallelization + # res <- get_pov_estimates(lt, povline = povline) + + # Regular lapply + res <- lapply(lt, process_dt_old, povline = povline) + + res <- rbindlist(res, fill = TRUE) + + + # clean data + metadata[, file := basename(path)] + + out <- join(res, + metadata, + on = c("file", "reporting_level"), + how = "full", + validate = "m:1", + verbose = 0) + + out[, `:=`( + mean = survey_mean_ppp, + median = survey_median_ppp, + file = NULL + )] + + setnames(out, "povline", "poverty_line") + + + return(list(main_data = out, data_in_cache = data_present_in_master)) +} + + + +# OLD: Efficient FGT calculation for a data.table and vector of poverty lines +#' Title +#' +#' @param dt data frame with `welfare` and `weight` columns +#' @param welfare character: welfare variable name +#' @param weight character: weight variable name +#' @param povlines double: vector with poveryt lines +#' +#' @return data.table with estimates poverty estimates +#' @keywords internal +compute_fgt_dt_old <- function(dt, welfare, weight, povlines) { + w <- dt[[welfare]] + wt <- dt[[weight]] + n <- length(w) + m <- length(povlines) + + # Pre-allocate result matrix + res <- matrix(NA_real_, nrow = m, ncol = 3) + colnames(res) <- c("FGT0", "FGT1", "FGT2") + watts_vec <- numeric(m) + + # Precompute log(w) for efficiency + logw <- rep(NA_real_, n) + pos <- w > 0 + logw[pos] <- log(w[pos]) + + for (i in seq_along(povlines)) { + pov <- povlines[i] + poor <- w < pov + rel_dist <- 1 - (w / pov) + rel_dist[!poor] <- 0 + res[i, 1] <- fmean(poor, w = wt) # FGT0 + res[i, 2] <- fmean(rel_dist, w = wt) # FGT1 + res[i, 3] <- fmean(rel_dist^2, w = wt) # FGT2 + + # Optimized Watts index calculation + keep <- poor & pos + if (any(keep, na.rm = TRUE)) { + watts_vec[i] <- (fsum((log(pov) - logw[keep]) * wt[keep])) / fsum(wt) + } else { + watts_vec[i] <- 0 + } + } + data.table( + povline = povlines, + headcount = res[, 1], + poverty_gap = res[, 2], + poverty_severity = res[, 3], + watts = watts_vec + ) +} + + +process_dt_old <- function(dt, povline) { + dt[, compute_fgt_dt_old(.SD, "welfare", "weight", povline), + by = .(file, reporting_level)] +} + +#' OLD: load survey year files and store them in a list +#' +#' @param metadata data frame from `subset_lkup()` +#' +#' @return list with survey years data +#' @keywords internal +load_data_list_old <- \(metadata) { + + # unique values + mdout <- metadata[, lapply(.SD, list), by = path] + upaths <- mdout$path + urep_level <- mdout$reporting_level + uppp <- mdout$ppp + ucpi <- mdout$cpi + + seq_along(upaths) |> + lapply(\(f) { + path <- upaths[f] + rep_level <- urep_level[f][[1]] + ppp <- uppp[f][[1]] + cpi <- ucpi[f][[1]] + + # Build a data.table to merge cpi and ppp + fdt <- data.table(reporting_level = as.character(rep_level), + ppp = ppp, + cpi = cpi) + + # load data and format + dt <- fst::read_fst(path, as.data.table = TRUE) + + if (length(rep_level) == 1) { + if (rep_level == "national") dt[, area := "national"] + } + setnames(dt, "area", "reporting_level") + dt[, + `:=`( + file = basename(path), + reporting_level = as.character(reporting_level) + ) + ] + + dt <- join(dt, fdt, + on = "reporting_level", + validate = "m:1", + how = "left", + verbose = 0) + + dt[, welfare := welfare/(cpi * ppp) + ][, + c("cpi", "ppp") := NULL] + + }) + +} From dd4a8b7905412332308753aa723abde27b71b071 Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Wed, 6 Aug 2025 09:37:21 -0400 Subject: [PATCH 013/203] master_file tryCatch for duckdb con error --- R/duckdb_func.R | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/R/duckdb_func.R b/R/duckdb_func.R index 8dc4dc13..d108fda0 100644 --- a/R/duckdb_func.R +++ b/R/duckdb_func.R @@ -22,8 +22,20 @@ return_if_exists <- function(slkup, lkup = slkup, povline = povline)) } - master_file <- load_inter_cache(cache_file_path = cache_file_path, - fill_gaps = fill_gaps) + + # ZP new temp code to avoid error from load_inter_cache due to dbConnect + master_file <- tryCatch( + load_inter_cache(cache_file_path = cache_file_path, + fill_gaps = fill_gaps), + error = function(e) { + cli::cli_warn("Failed to load intermediate cache: {e$message}") + master_file <- slkup[0] # zero-row data.table with same columns as lkup + } + ) + + # ZP old code: + # master_file <- load_inter_cache(cache_file_path = cache_file_path, + # fill_gaps = fill_gaps) if (fnrow(master_file) == 0) { return(list(data_present_in_master = NULL, @@ -37,6 +49,9 @@ return_if_exists <- function(slkup, # convert survey_comparability to NA # NOTE: This should not be necessary. for the new lineup distribution # metadata should come without this variable. + + # ZP comment: if using refy_lkup, this should be removed because + # it does not include survey_comparability slkup[, survey_comparability := NA] @@ -47,9 +62,13 @@ return_if_exists <- function(slkup, # This is probably unnecesary + # ZP comment: in my quick checks this has no impact, meaning + # slkup is already unique + # ZP Question: is this to get rid of duplicates from df_refy??? lkup_kvars <- slkup |> copy() |> funique() # this is not big. + # get all vars slkup_vars <- setdiff(names(slkup), key_vars) # transform to NA when necessary @@ -75,15 +94,15 @@ return_if_exists <- function(slkup, # lkup_kvars_pov <- lkup_kvars[, .(poverty_line = povline), # by = eval(names(lkup_kvars))] lkup_kvars_pov <- lkup_kvars[rep(seq_len(nrow(lkup_kvars)), - each = length(povline))] + each = length(povline))] # ZP: add povline lkup_kvars_pov[, poverty_line := rep(povline, times = nrow(lkup_kvars))] # Find which (key_vars, poverty_line) are present in master_file lk_not_ms <- join(x = lkup_kvars_pov, - y = master_file, + y = master_file, # ZP: remember, master_file is full cache file on = key_vars_pl, - how = "anti", + how = "anti", # rows in lkup not in master_file to know what new to do # validate = "1:1", overid = 2, verbose = 0, From 633fc8bc1fcef66f5f9484295e4f64af3abef246 Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Wed, 6 Aug 2025 10:25:43 -0400 Subject: [PATCH 014/203] make the NA of survey_comparability if fg type double for consistency --- R/duckdb_func.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/duckdb_func.R b/R/duckdb_func.R index d108fda0..135fb858 100644 --- a/R/duckdb_func.R +++ b/R/duckdb_func.R @@ -52,7 +52,7 @@ return_if_exists <- function(slkup, # ZP comment: if using refy_lkup, this should be removed because # it does not include survey_comparability - slkup[, survey_comparability := NA] + slkup[, survey_comparability := NA_real_] } else { From a0357c112164d7ab7400b0bf9215e4c9d58247f9 Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Wed, 6 Aug 2025 10:26:02 -0400 Subject: [PATCH 015/203] add interpolation id for df_refy --- R/create_lkups.R | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/R/create_lkups.R b/R/create_lkups.R index a1aa7e6c..8e7f797a 100644 --- a/R/create_lkups.R +++ b/R/create_lkups.R @@ -195,6 +195,11 @@ create_lkups <- function(data_dir, versions) { as.character() } ] + refy_lkup[, + interpolation_id := paste(country_code, + reporting_year, + reporting_level, + sep = "_")] if ("region_code" %in% names(refy_lkup)) { refy_lkup[, From 3fb45a263014ab697e4c775716228e562639fdd7 Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Wed, 6 Aug 2025 11:22:14 -0400 Subject: [PATCH 016/203] rm _refy from lineup columns --- R/fg_pip.R | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/R/fg_pip.R b/R/fg_pip.R index d4038d3d..74dcebe2 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -17,7 +17,7 @@ fg_pip <- function(country, valid_regions <- lkup$query_controls$region$values interpolation_list <- lkup$interpolation_list data_dir <- lkup$data_root - ref_lkup <- lkup$ref_lkup # the normal refy table, some country-years have two rows (interpolation) + ref_lkup <- lkup$ref_lkup # the normal refy table, some country-years have two rows (interpolation) refy_lkup <- lkup$refy_lkup # cleaned refy table, unique by country-years but some columns removed in order to do that cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb") @@ -27,19 +27,21 @@ fg_pip <- function(country, # cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb") # con <- duckdb::dbConnect(duckdb::duckdb(), dbdir = cache_file_path, read_only = TRUE) # } + # Handle interpolation metadata <- subset_lkup( country = country, year = year, welfare_type = welfare_type, reporting_level = reporting_level, - lkup = ref_lkup, # only place this is used, for 'interpolation_id' + lkup = refy_lkup, # only place this is used, for 'interpolation_id' valid_regions = valid_regions, data_dir = data_dir, povline = povline, cache_file_path = cache_file_path, fill_gaps = TRUE) + data_present_in_master <- metadata$data_present_in_master povline <- metadata$povline metadata <- metadata$lkup @@ -51,7 +53,7 @@ fg_pip <- function(country, # Return empty dataframe if no metadata is found if (nrow(metadata) == 0) { - print("no metadata") + print("ZP: no metadata - i.e. nothing additional to estimate") return(list(main_data = pipapi::empty_response_fg, data_in_cache = data_present_in_master)) } @@ -158,14 +160,6 @@ fg_pip <- function(country, "_", reporting_year)) - if ("welfare_refy" %in% names(x)) { - setnames(x, - old = c("welfare_refy", - "weight_refy"), - new = c("welfare", - "weight")) - } - x }) From 85ba9247a5ab482f81d2083de20a5b4f4b1b1d7a Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Thu, 7 Aug 2025 05:14:11 -0400 Subject: [PATCH 017/203] standardize format of country and year selection as vector --- R/fg_pip.R | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/R/fg_pip.R b/R/fg_pip.R index 74dcebe2..16ce4c35 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -98,12 +98,11 @@ fg_pip <- function(country, # Extract unique combinations of country-year if (any(c("ALL", "WLD") %in% country)) { cntry <- refy_lkup$country_code |> - unique() + funique() print("A") - #cntry[!cntry %in% c("SSD", "SVK", "TLS", "VEN", "XKX")] # to be removed } else { cntry <- refy_lkup[country_code %in% country, - .(country_code)] |> + ]$country_code |> funique() print("B") } @@ -113,7 +112,7 @@ fg_pip <- function(country, print("C") } else { yr <- refy_lkup[reporting_year %in% year, - .(reporting_year)] |> + ]$reporting_year |> funique() print("D") } From 36945016d700f00912afbc72d1f60355a7f234c6 Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Thu, 7 Aug 2025 05:19:07 -0400 Subject: [PATCH 018/203] replace pipload with pipdata --- R/fg_pip.R | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/R/fg_pip.R b/R/fg_pip.R index 16ce4c35..ebe2facc 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -87,7 +87,7 @@ fg_pip <- function(country, # print(as.vector(cntry)) # print(yr) # lt <- - # pipload::load_list_refy(input_list = list(country_code = cntry, + # pipdata::load_list_refy(input_list = list(country_code = cntry, # year = yr), # path = fs::path(data_dir, # "lineup_data")) @@ -138,7 +138,7 @@ fg_pip <- function(country, print(as.vector(cntry)) print(yr) lt <- - pipload::load_list_refy(input_list = full_list, + pipdata::load_list_refy(input_list = full_list, path = fs::path(data_dir, "lineup_data")) @@ -148,12 +148,12 @@ fg_pip <- function(country, lt <- lapply(lt, FUN = \(x) { x <- x |> - pipload::attr_to_column("reporting_level_rows") |> # only rep level???? - pipload::attr_to_column("country_code") |> - pipload::attr_to_column("reporting_year") |> - pipload::attr_to_column("mean", + pipdata::attr_to_column("reporting_level_rows") |> # only rep level???? + pipdata::attr_to_column("country_code") |> + pipdata::attr_to_column("reporting_year") |> + pipdata::attr_to_column("mean", dist_stats = TRUE) |> - pipload::attr_to_column("median", + pipdata::attr_to_column("median", dist_stats = TRUE) |> fmutate(file = paste0(country_code, "_", @@ -163,7 +163,7 @@ fg_pip <- function(country, }) rlang::env_poke(env = globalenv(), - nm = "pipload_list", + nm = "pipdata_list", value = lt) # ZP Add: do fgt estimations using `res <- lapply(lt, process_dt, povline = povline)` From a7da9071a96f1b48c77755d81f4a84ee56480ae9 Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Thu, 7 Aug 2025 09:05:19 -0400 Subject: [PATCH 019/203] add valid lineup years to lkup --- R/create_lkups.R | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/R/create_lkups.R b/R/create_lkups.R index 8e7f797a..02ed2b78 100644 --- a/R/create_lkups.R +++ b/R/create_lkups.R @@ -213,8 +213,19 @@ create_lkups <- function(data_dir, versions) { reportvar = FALSE, match_type = "m:1") + + # ZP ADD - CREATE OBJECT: refy_lkup #___________________________________________________________________________ + lineup_years_path <- + fs::path(data_dir, + "estimations/lineup_years.fst") + + # NOTE: THIS `prod_refy_estimation.fst` is the refy table (not really) + lineup_years <- fst::read_fst(lineup_years_path) |> + as.list() + + #___________________________________________________________________________ # CREATE OBJECT: interpolation_list ---- @@ -455,6 +466,8 @@ create_lkups <- function(data_dir, versions) { # CREATE OBJECT: valid_years ---- valid_years <- valid_years(data_dir) + valid_years <- c(valid_years, + lineup_years) # add lineup years # CREATE OBJECT: query_controls ---- # Create list of query controls From 5466131179c97c2fdd2deeef4dbfa9aede5ab9a1 Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Thu, 7 Aug 2025 10:39:23 -0400 Subject: [PATCH 020/203] only fg lineup years that are valid --- R/fg_pip.R | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/R/fg_pip.R b/R/fg_pip.R index ebe2facc..c90705f1 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -120,20 +120,23 @@ fg_pip <- function(country, ref_lkup |> fsubset(country_code %in% cntry & reporting_year %in% yr) |> + fsubset(reporting_year %in% lkup$valid_years$lineup_years) |> fselect(country_code, year = reporting_year) |> funique() # Split years by country - full_list <- dtemp[, .(year = list(year)), by = country_code][ + full_list <- dtemp[, + .(year = list(year)), + by = country_code][ , .(country_code, year = year) ] # Convert to desired structure full_list <- list( country_code = full_list$country_code, - year = lapply(full_list$year, as.numeric) - ) + year = lapply(full_list$year, + as.numeric)) #return(full_list) print(as.vector(cntry)) print(yr) From b29cc6b7ab2ae9df657fdbaee3c70e729030677d Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Mon, 11 Aug 2025 14:52:05 -0400 Subject: [PATCH 021/203] load dist stats in lkups --- R/create_lkups.R | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/R/create_lkups.R b/R/create_lkups.R index 02ed2b78..bf6e69e6 100644 --- a/R/create_lkups.R +++ b/R/create_lkups.R @@ -180,7 +180,8 @@ create_lkups <- function(data_dir, versions) { refy_lkup_path <- fs::path(data_dir, "estimations/prod_refy_estimation.fst") - # NOTE: THIS `prod_refy_estimation.fst` is the refy table (not really) + # NOTE: THIS `prod_refy_estimation.fst` is the refy table but + # unique at the country-year level refy_lkup <- fst::read_fst(refy_lkup_path, as.data.table = TRUE) @@ -214,16 +215,30 @@ create_lkups <- function(data_dir, versions) { match_type = "m:1") - # ZP ADD - CREATE OBJECT: refy_lkup + # ZP ADD - CREATE OBJECT: lineup years #___________________________________________________________________________ lineup_years_path <- fs::path(data_dir, "estimations/lineup_years.fst") - # NOTE: THIS `prod_refy_estimation.fst` is the refy table (not really) lineup_years <- fst::read_fst(lineup_years_path) |> as.list() + # ZP ADD - CREATE OBJECT: lineup dist stats + #___________________________________________________________________________ + lineup_dist_stats <- + fs::path(data_dir, + "estimations/lineup_dist_stats.fst") + + lineup_dist_stats <- fst::read_fst(lineup_dist_stats, + as.data.table = TRUE) |> + fmutate(file = paste(country_code, + reporting_year, + sep = "_")) + gv(lineup_dist_stats, + c("min", + "max")) <- NULL + #___________________________________________________________________________ @@ -588,7 +603,8 @@ create_lkups <- function(data_dir, versions) { aux_tables = aux_tables, interpolation_list = interpolation_list, valid_years = valid_years, - cache_data_id = cache_data_id + cache_data_id = cache_data_id, + lineup_dist_stats = lineup_dist_stats ) return(lkup) From b1484a0671a9964989fc9527c69885846474147b Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Mon, 11 Aug 2025 14:52:24 -0400 Subject: [PATCH 022/203] use dist stats in fg --- R/fg_pip.R | 41 +++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/R/fg_pip.R b/R/fg_pip.R index c90705f1..792b56e9 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -145,8 +145,6 @@ fg_pip <- function(country, path = fs::path(data_dir, "lineup_data")) - - print(names(lt)) lt <- lapply(lt, FUN = \(x) { @@ -154,10 +152,6 @@ fg_pip <- function(country, pipdata::attr_to_column("reporting_level_rows") |> # only rep level???? pipdata::attr_to_column("country_code") |> pipdata::attr_to_column("reporting_year") |> - pipdata::attr_to_column("mean", - dist_stats = TRUE) |> - pipdata::attr_to_column("median", - dist_stats = TRUE) |> fmutate(file = paste0(country_code, "_", reporting_year)) @@ -183,6 +177,20 @@ fg_pip <- function(country, nm = "res_povest", value = res) + # ZP Add: join to dist_stats + #------------------------- + ly_dist <- lkup$lineup_dist_stats + res <- res |> + joyn::left_join(y = ly_dist, + by = c("file", + "reporting_level", + "reporting_year"), + relationship = "one-to-one", + reportvar = FALSE) + rlang::env_poke(env = globalenv(), + nm = "res_dist", + value = res) + # ZP Add: join to metadata #------------------------- metadata[, @@ -232,19 +240,21 @@ fg_pip <- function(country, verbose = 0) out[, `:=`( - #mean = survey_mean_ppp, - #median = survey_median_ppp, file = NULL )] - setnames(out, "povline", "poverty_line") - + rlang::env_poke(env = globalenv(), + nm = "out1", + value = out) # Ensure that out does not have duplicates out <- fg_remove_duplicates(out) + rlang::env_poke(env = globalenv(), + nm = "out2", + value = out) # Fix issue with rounding of poverty lines out[, @@ -263,12 +273,6 @@ fg_pip <- function(country, } -# process_dt_fg <- function(dt, povline, mean_and_med = FALSE) { -# dt[, compute_fgt_dt(.SD, "welfare", "weight", povline, mean_and_med), -# by = .(file, reporting_level)] -# } - - #' Remove duplicated rows created during the interpolation process #' #' @param df data.table: Table of results created in `fg_pip()` @@ -295,6 +299,9 @@ fg_remove_duplicates <- function(df, "survey_time", "survey_year", "surveyid_year")) { + # not all cols need to be changes + cols <- setdiff(cols, + colnames(df)) # Modify cache_id # * Ensures that cache_id is unique for both extrapolated and interpolated surveys # * Ensures that cache_id can be kept as an output of fg_pip() while still removing duplicated rows @@ -318,7 +325,6 @@ fg_remove_duplicates <- function(df, #' @param reporting_level character #' #' @return character - fg_standardize_cache_id <- function(cache_id, interpolation_id, reporting_level) { @@ -338,7 +344,6 @@ fg_standardize_cache_id <- function(cache_id, #' @inheritParams fg_remove_duplicates #' #' @return data.table - fg_assign_nas_values_to_dup_cols <- function(df, cols) { #Classes are maintained by default. From f0e0fffa8fb9ff8521b6ad5b2a9095750f5a0349 Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Mon, 11 Aug 2025 14:53:45 -0400 Subject: [PATCH 023/203] don't show joyn message --- R/fg_pip.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/fg_pip.R b/R/fg_pip.R index 792b56e9..9f1376c8 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -186,7 +186,8 @@ fg_pip <- function(country, "reporting_level", "reporting_year"), relationship = "one-to-one", - reportvar = FALSE) + reportvar = FALSE, + verbose = FALSE) rlang::env_poke(env = globalenv(), nm = "res_dist", value = res) From b081b44d2fa283ec110ccb5faf5f1aaa3e16e326 Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Mon, 11 Aug 2025 16:05:13 -0400 Subject: [PATCH 024/203] change full join to left join - this removes nowcast years - i.e. years not included as lineup years --- R/fg_pip.R | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/R/fg_pip.R b/R/fg_pip.R index 9f1376c8..e54cd5d8 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -234,11 +234,14 @@ fg_pip <- function(country, out <- join(res, tmp_metadata_unique, - on = c("file", - "reporting_level"), - how = "full", - validate = "m:1", - verbose = 0) + on = c("file", + "reporting_level"), + how = "left", # ZP: change from full to left, + # this rm nowcast years - i.e. years not included + # as lineup years + validate = "m:1", + drop.dup.cols = TRUE, + verbose = 0) out[, `:=`( file = NULL @@ -306,9 +309,9 @@ fg_remove_duplicates <- function(df, # Modify cache_id # * Ensures that cache_id is unique for both extrapolated and interpolated surveys # * Ensures that cache_id can be kept as an output of fg_pip() while still removing duplicated rows - df$cache_id <- fg_standardize_cache_id(cache_id = df$cache_id, - interpolation_id = df$data_interpolation_id, - reporting_level = df$reporting_level) + # df$cache_id <- fg_standardize_cache_id(cache_id = df$cache_id, + # interpolation_id = df$data_interpolation_id, + # reporting_level = df$reporting_level) # Set collapse vars to NA (by type) df <- fg_assign_nas_values_to_dup_cols(df = df, cols = cols) From d0bd7bf7756ce60e00c3f5a286185bf10022aa07 Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Tue, 12 Aug 2025 00:35:41 -0400 Subject: [PATCH 025/203] fix cache different columns --- R/duckdb_func.R | 14 ++++---- R/fg_pip.R | 76 ++++++++++++------------------------------- R/pip.R | 36 +++++++++++++++------ R/utils.R | 85 +++++++++++++++++++++++++++++++------------------ 4 files changed, 110 insertions(+), 101 deletions(-) diff --git a/R/duckdb_func.R b/R/duckdb_func.R index 135fb858..fc91b3f1 100644 --- a/R/duckdb_func.R +++ b/R/duckdb_func.R @@ -12,15 +12,15 @@ return_if_exists <- function(slkup, if (fnrow(slkup) == 0 ) { return(list(data_present_in_master = NULL, - lkup = slkup, - povline = povline)) + lkup = slkup, + povline = povline)) } if (getOption("pipapi.query_live_data")) { return(list(data_present_in_master = NULL, - lkup = slkup, - povline = povline)) + lkup = slkup, + povline = povline)) } # ZP new temp code to avoid error from load_inter_cache due to dbConnect @@ -32,6 +32,8 @@ return_if_exists <- function(slkup, master_file <- slkup[0] # zero-row data.table with same columns as lkup } ) + # temp ZP just to bypass cache + #master_file <- slkup[0] # ZP old code: # master_file <- load_inter_cache(cache_file_path = cache_file_path, @@ -39,8 +41,8 @@ return_if_exists <- function(slkup, if (fnrow(master_file) == 0) { return(list(data_present_in_master = NULL, - lkup = slkup, - povline = povline)) + lkup = slkup, + povline = povline)) } diff --git a/R/fg_pip.R b/R/fg_pip.R index e54cd5d8..1f8c5404 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -59,62 +59,24 @@ fg_pip <- function(country, } - # ZP Add: load refy data - #------------------------- - # Extract unique combinations of country-year - # if (any(c("ALL", "WLD") %in% country)) { - # cntry <- refy_lkup$country_code |> - # unique() - # print("A") - # #cntry[!cntry %in% c("SSD", "SVK", "TLS", "VEN", "XKX")] # to be removed - # } else { - # cntry <- refy_lkup[country_code %in% country, - # .(country_code)] |> - # funique() - # print("B") - # } - # if (any(c("ALL") %in% year)) { - # yr <- refy_lkup$reporting_year |> - # unique() - # print("C") - # } else { - # yr <- refy_lkup[reporting_year %in% year, - # .(reporting_year)] |> - # funique() - # print("D") - # } - # - # print(as.vector(cntry)) - # print(yr) - # lt <- - # pipdata::load_list_refy(input_list = list(country_code = cntry, - # year = yr), - # path = fs::path(data_dir, - # "lineup_data")) - - #' # ZP Add: load refy data #------------------------- # Extract unique combinations of country-year if (any(c("ALL", "WLD") %in% country)) { cntry <- refy_lkup$country_code |> funique() - print("A") } else { cntry <- refy_lkup[country_code %in% country, ]$country_code |> funique() - print("B") } if (any(c("ALL") %in% year)) { yr <- refy_lkup$reporting_year |> unique() - print("C") } else { yr <- refy_lkup[reporting_year %in% year, ]$reporting_year |> funique() - print("D") } dtemp <- ref_lkup |> @@ -137,21 +99,19 @@ fg_pip <- function(country, country_code = full_list$country_code, year = lapply(full_list$year, as.numeric)) - #return(full_list) - print(as.vector(cntry)) - print(yr) lt <- pipdata::load_list_refy(input_list = full_list, path = fs::path(data_dir, "lineup_data")) - print(names(lt)) lt <- lapply(lt, FUN = \(x) { x <- x |> pipdata::attr_to_column("reporting_level_rows") |> # only rep level???? pipdata::attr_to_column("country_code") |> pipdata::attr_to_column("reporting_year") |> + pipdata::attr_to_column("mean", dist_stats = TRUE) |> + pipdata::attr_to_column("median", dist_stats = TRUE) |> fmutate(file = paste0(country_code, "_", reporting_year)) @@ -179,18 +139,18 @@ fg_pip <- function(country, # ZP Add: join to dist_stats #------------------------- - ly_dist <- lkup$lineup_dist_stats - res <- res |> - joyn::left_join(y = ly_dist, - by = c("file", - "reporting_level", - "reporting_year"), - relationship = "one-to-one", - reportvar = FALSE, - verbose = FALSE) - rlang::env_poke(env = globalenv(), - nm = "res_dist", - value = res) + # ly_dist <- lkup$lineup_dist_stats + # res <- res |> + # joyn::left_join(y = ly_dist, + # by = c("file", + # "reporting_level", + # "reporting_year"), + # relationship = "one-to-one", + # reportvar = FALSE, + # verbose = FALSE) + # rlang::env_poke(env = globalenv(), + # nm = "res_dist", + # value = res) # ZP Add: join to metadata #------------------------- @@ -231,6 +191,9 @@ fg_pip <- function(country, rlang::env_poke(env = globalenv(), nm = "tmp_metadata_unique_check", value = tmp_metadata_unique) + rlang::env_poke(env = globalenv(), + nm = "res_final", + value = res) out <- join(res, tmp_metadata_unique, @@ -243,6 +206,9 @@ fg_pip <- function(country, drop.dup.cols = TRUE, verbose = 0) + rlang::env_poke(env = globalenv(), + nm = "out_check", + value = out) out[, `:=`( file = NULL )] @@ -313,7 +279,7 @@ fg_remove_duplicates <- function(df, # interpolation_id = df$data_interpolation_id, # reporting_level = df$reporting_level) # Set collapse vars to NA (by type) - df <- fg_assign_nas_values_to_dup_cols(df = df, + df <- fg_assign_nas_values_to_dup_cols(df = df, cols = cols) # Ensure that out does not have duplicates diff --git a/R/pip.R b/R/pip.R index dd62cd20..dc180832 100644 --- a/R/pip.R +++ b/R/pip.R @@ -139,12 +139,27 @@ pip <- function(country = "ALL", ) } - cached_data <- out$data_in_cache - main_data <- out$main_data + # Cache new data + #--------------------------------------------- + cached_data <- out$data_in_cache + #print(out) + cached_data <- qDT(cached_data) + main_data <- qDT(out$main_data) + # print(colnames(main_data)) + # print(colnames(cached_data)) + #print(setdiff(colnames(main_data), colnames(cached_data))) if (nrow(main_data) > 0) { - out <- main_data |> - rowbind(cached_data) + #print(is.data.table(cached_data)) + if (is.null(out$data_in_cache)) { + out <- main_data + } else { + if (fill_gaps) { + cached_data <- fg_remove_duplicates(cached_data) + } + out <- main_data |> + rowbind(cached_data) + } update_master_file(main_data, cache_file_path, fill_gaps) @@ -168,7 +183,8 @@ pip <- function(country = "ALL", } } - + # Add out of pipeline variablse + #--------------------------------------------- add_vars_out_of_pipeline(out, fill_gaps = fill_gaps, lkup = lkup) @@ -212,10 +228,11 @@ pip <- function(country = "ALL", names2keep <- lkup$return_cols$pip$cols # all variables out <- add_dist_stats( - df = out, - dist_stats = lkup[["dist_stats"]] - ) - + df = out, + lkup = lkup, + fill_gaps = fill_gaps) + print("fooooooo") + return(out) # Add aggregate medians ---------------- out <- add_agg_medians( df = out, @@ -239,6 +256,7 @@ pip <- function(country = "ALL", } else { out[, estimate_type := NA_character_] } + ## Handle survey coverage ------------ if (reporting_level != "all") { keep <- out$reporting_level == reporting_level diff --git a/R/utils.R b/R/utils.R index ae4fdc89..9b4bd945 100644 --- a/R/utils.R +++ b/R/utils.R @@ -24,11 +24,11 @@ subset_lkup <- function(country, # STEP 2 - Select countries keep <- select_country(lkup, keep, country, valid_regions) # STEP 3 - Select years - keep <- select_years(lkup = lkup, - keep = keep, - year = year, - country = country, - data_dir = data_dir, + keep <- select_years(lkup = lkup, + keep = keep, + year = year, + country = country, + data_dir = data_dir, valid_regions = valid_regions) # # step 4. Select MRV @@ -39,18 +39,18 @@ subset_lkup <- function(country, keep <- keep & lkup$welfare_type == welfare_type } # STEP 5 - Select reporting_level - keep <- select_reporting_level(lkup = lkup, - keep = keep, + keep <- select_reporting_level(lkup = lkup, + keep = keep, reporting_level = reporting_level[1]) lkup <- lkup[keep, ] # Return with grace - return_if_exists(slkup = lkup, - povline = povline, + return_if_exists(slkup = lkup, + povline = povline, cache_file_path = cache_file_path, - fill_gaps = fill_gaps) + fill_gaps = fill_gaps) } #' select_country @@ -290,30 +290,53 @@ get_svy_data <- function(svy_id, #' @return data.table #' @export #' -add_dist_stats <- function(df, dist_stats) { - # Keep only relevant columns - cols <- c( - "cache_id", - # "country_code", - # "reporting_year", - # "welfare_type", - "reporting_level", - "gini", - "polarization", - "mld", - sprintf("decile%s", 1:10) - ) - dist_stats <- dist_stats[, .SD, .SDcols = cols] +add_dist_stats <- function(df, lkup, fill_gaps) { + + if (fill_gaps) { + dist_stats <- lkup[["lineup_dist_stats"]] + } else { + dist_stats <- lkup[["dist_stats"]] + } - # merge dist stats with main table - # data.table::setnames(dist_stats, "survey_median_ppp", "median") + if (fill_gaps) { - df <- dist_stats[df, - on = .(cache_id, reporting_level), #.(country_code, reporting_year, welfare_type, reporting_level), - allow.cartesian = TRUE - ] + df <- df |> + joyn::joyn(y = dist_stats, + by = c("country_code", + "reporting_level", + "reporting_year"), + match_type = "1:1", + keep_common_vars = FALSE, + reportvar = FALSE, + verbose = FALSE, + keep = "left") - return(df) + } else { + # Keep only relevant columns + cols <- c( + "cache_id", + # "country_code", + # "reporting_year", + # "welfare_type", + "reporting_level", + "gini", + "polarization", + "mld", + sprintf("decile%s", 1:10) + ) + dist_stats <- dist_stats[, .SD, .SDcols = cols] + + # merge dist stats with main table + # data.table::setnames(dist_stats, "survey_median_ppp", "median") + + df <- dist_stats[df, + on = .(cache_id, reporting_level), #.(country_code, reporting_year, welfare_type, reporting_level), + allow.cartesian = TRUE + ] + } + + + df } #' Collapse rows From 7cc0295ddf7cd96ddb2126734b9dc816ca12f32b Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Tue, 12 Aug 2025 14:39:15 -0400 Subject: [PATCH 026/203] add to lkup whether to use new lineups --- R/create_lkups.R | 75 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 56 insertions(+), 19 deletions(-) diff --git a/R/create_lkups.R b/R/create_lkups.R index bf6e69e6..fb936bac 100644 --- a/R/create_lkups.R +++ b/R/create_lkups.R @@ -11,7 +11,6 @@ create_versioned_lkups <- vintage_pattern <- create_vintage_pattern_call(vintage_pattern) - data_dirs <- extract_data_dirs(data_dir = data_dir, vintage_pattern = vintage_pattern) @@ -175,6 +174,7 @@ create_lkups <- function(data_dir, versions) { collapse = "|"), by = .(interpolation_id)] + # ZP ADD - CREATE OBJECT: refy_lkup #___________________________________________________________________________ refy_lkup_path <- fs::path(data_dir, @@ -240,6 +240,8 @@ create_lkups <- function(data_dir, versions) { "max")) <- NULL + use_new_lineup_version <- use_new_lineup_version(versions) + #___________________________________________________________________________ @@ -588,24 +590,24 @@ create_lkups <- function(data_dir, versions) { # Create list of lkups lkup <- list( - svy_lkup = svy_lkup, - ref_lkup = ref_lkup, - refy_lkup = refy_lkup, - dist_stats = dist_stats, - pop_region = pop_region, - cp_lkups = cp_lkups, - pl_lkup = pl_lkup, - censored = censored, - aux_files = aux_files, - return_cols = return_cols, - query_controls = query_controls, - data_root = data_dir, - aux_tables = aux_tables, - interpolation_list = interpolation_list, - valid_years = valid_years, - cache_data_id = cache_data_id, - lineup_dist_stats = lineup_dist_stats - ) + svy_lkup = svy_lkup, + ref_lkup = ref_lkup, + refy_lkup = refy_lkup, + dist_stats = dist_stats, + pop_region = pop_region, + cp_lkups = cp_lkups, + pl_lkup = pl_lkup, + censored = censored, + aux_files = aux_files, + return_cols = return_cols, + query_controls = query_controls, + data_root = data_dir, + aux_tables = aux_tables, + interpolation_list = interpolation_list, + valid_years = valid_years, + cache_data_id = cache_data_id, + lineup_dist_stats = lineup_dist_stats, + use_new_lineup_version = use_new_lineup_version) return(lkup) } @@ -805,3 +807,38 @@ available_versions <- function(data_dir) { test_regex = vintage_pattern$test_regex) } + + +#' Should the new lineup approach be used? +#' +#' Check if the date in a string is more recent than May 2025 +#' +#' This function extracts the first 8 characters from an input string, +#' interprets them as a date in the format \code{YYYYMMDD}, and checks +#' whether this date is more recent than May 1st, 2025. +#' +#' @param x A character vector where each element starts with an +#' 8-digit date in the format \code{YYYYMMDD}. +#' +#' @return A logical vector: \code{TRUE} if the extracted date is +#' after May 1st, 2025, otherwise \code{FALSE}. +#' +#' @examples +#' use_new_lineup_version("20250401_2021_01_02_PROD") # FALSE +#' use_new_lineup_version("20250615_2021_01_02_PROD") # TRUE +#' +#' @export +use_new_lineup_version <- function(x) { + # Extract YYYYMMDD + date_str <- substr(x, 1, 8) + + # Convert to Date + date_val <- as.Date(date_str, format = "%Y%m%d") + + # Threshold date + threshold <- as.Date("2025-05-01") + + # Compare + date_val > threshold +} + From 9a9546afd6c4bf3c7637d1260071a53cf67da797 Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Tue, 12 Aug 2025 17:59:09 -0400 Subject: [PATCH 027/203] standardise ref and refy in lkup --- R/create_lkups.R | 159 ++++++++++++++++++++++++++++++----------------- 1 file changed, 102 insertions(+), 57 deletions(-) diff --git a/R/create_lkups.R b/R/create_lkups.R index fb936bac..896bf42b 100644 --- a/R/create_lkups.R +++ b/R/create_lkups.R @@ -16,7 +16,6 @@ create_versioned_lkups <- versions <- names(data_dirs) # versions[1] <- "latest_release" - versions_paths <- lapply(data_dirs, create_lkups, versions = versions) names(versions_paths) <- versions @@ -35,6 +34,9 @@ extract_data_dirs <- function(data_dir, vintage_pattern ) { + + + # List data directories under data_dir data_dirs <- fs::dir_ls(data_dir, type = "directory") @@ -71,6 +73,11 @@ extract_data_dirs <- #' @return list create_lkups <- function(data_dir, versions) { + # Use new lineup approach? ----- + use_new_lineup_version <- use_new_lineup_version(versions) + # ZP temp + use_new_lineup_version <- TRUE + # Get survey paths ---- paths <- list.files(fs::path(data_dir, "survey_data")) paths_ids <- tools::file_path_sans_ext(paths) @@ -175,72 +182,110 @@ create_lkups <- function(data_dir, versions) { by = .(interpolation_id)] + + # ZP ADD - CREATE OBJECT: refy_lkup #___________________________________________________________________________ - refy_lkup_path <- fs::path(data_dir, - "estimations/prod_refy_estimation.fst") - - # NOTE: THIS `prod_refy_estimation.fst` is the refy table but - # unique at the country-year level - refy_lkup <- fst::read_fst(refy_lkup_path, - as.data.table = TRUE) - - refy_lkup[ , - path := { - fs::path(data_dir, - "lineup_data", - paste0(country_code, - "_", - reporting_year), - ext = "qs") |> - as.character() - } - ] - refy_lkup[, - interpolation_id := paste(country_code, - reporting_year, - reporting_level, - sep = "_")] - - if ("region_code" %in% names(refy_lkup)) { - refy_lkup[, - region_code := NULL] - } + if (use_new_lineup_version) { + refy_lkup_path <- fs::path(data_dir, + "estimations/prod_refy_estimation.fst") + + # NOTE: THIS `prod_refy_estimation.fst` is the refy table but + # unique at the country-year level + refy_lkup <- fst::read_fst(refy_lkup_path, + as.data.table = TRUE) + + refy_lkup[ , + path := { + fs::path(data_dir, + "lineup_data", + paste0(country_code, + "_", + reporting_year), + ext = "qs") |> + as.character() + } + ] + refy_lkup[, + interpolation_id := paste(country_code, + reporting_year, + reporting_level, + sep = "_")] + + if ("region_code" %in% names(refy_lkup)) { + refy_lkup[, + region_code := NULL] + } - refy_lkup <- joyn::joyn(x = refy_lkup, - y = countries, - by = 'country_code', - keep = "left", - reportvar = FALSE, - match_type = "m:1") + refy_lkup[, + data_interpolation_id := paste(cache_id, + reporting_level, + sep = "_") + ] + + refy_lkup[, + data_interpolation_id := paste(unique(data_interpolation_id), + collapse = "|"), + by = .(interpolation_id)] + + refy_lkup <- joyn::joyn(x = refy_lkup, + y = countries, + by = 'country_code', + keep = "left", + reportvar = FALSE, + match_type = "m:1") + + + gv(refy_lkup, + c("monotonic", + "same_direction", + "mult_factor", + "nac", + "nac_sy", + "svy_mean", + #"data_interpolation", + "relative_distance")) <- NULL + gv(ref_lkup, + c("monotonic", + "same_direction", + "nac", + "nac_sy", + "svy_mean", + #"data_interpolation", + "relative_distance")) <- NULL + + + # ZP ADD - CREATE OBJECT: lineup years + #___________________________________________________________________________ + lineup_years_path <- + fs::path(data_dir, + "estimations/lineup_years.fst") + + lineup_years <- fst::read_fst(lineup_years_path) |> + as.list() + + # ZP ADD - CREATE OBJECT: lineup dist stats + #___________________________________________________________________________ + lineup_dist_stats <- + fs::path(data_dir, + "estimations/lineup_dist_stats.fst") + + lineup_dist_stats <- fst::read_fst(lineup_dist_stats, + as.data.table = TRUE) |> + fmutate(file = paste(country_code, + reporting_year, + sep = "_")) + gv(lineup_dist_stats, + c("min", + "max")) <- NULL - # ZP ADD - CREATE OBJECT: lineup years - #___________________________________________________________________________ - lineup_years_path <- - fs::path(data_dir, - "estimations/lineup_years.fst") + } - lineup_years <- fst::read_fst(lineup_years_path) |> - as.list() - # ZP ADD - CREATE OBJECT: lineup dist stats - #___________________________________________________________________________ - lineup_dist_stats <- - fs::path(data_dir, - "estimations/lineup_dist_stats.fst") - lineup_dist_stats <- fst::read_fst(lineup_dist_stats, - as.data.table = TRUE) |> - fmutate(file = paste(country_code, - reporting_year, - sep = "_")) - gv(lineup_dist_stats, - c("min", - "max")) <- NULL - use_new_lineup_version <- use_new_lineup_version(versions) #___________________________________________________________________________ From cda895ffccdce2f2557e729ae676d4f8b7a5bff9 Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Tue, 12 Aug 2025 19:15:58 -0400 Subject: [PATCH 028/203] rm nowcast years from refy --- R/create_lkups.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/create_lkups.R b/R/create_lkups.R index 896bf42b..fdfa31ed 100644 --- a/R/create_lkups.R +++ b/R/create_lkups.R @@ -264,6 +264,7 @@ create_lkups <- function(data_dir, versions) { lineup_years <- fst::read_fst(lineup_years_path) |> as.list() + refy_lkup <- refy_lkup[reporting_year %in% lineup_years$lineup_years, ] # ZP ADD - CREATE OBJECT: lineup dist stats #___________________________________________________________________________ From 3aa55355feb2d04847ea611352e3b6feb3c5bcbd Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Wed, 13 Aug 2025 13:44:46 -0400 Subject: [PATCH 029/203] add explanation comments --- R/duckdb_func.R | 10 +++++++++- R/fg_pip.R | 2 +- R/utils.R | 1 - 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/R/duckdb_func.R b/R/duckdb_func.R index fc91b3f1..663056c9 100644 --- a/R/duckdb_func.R +++ b/R/duckdb_func.R @@ -10,19 +10,21 @@ return_if_exists <- function(slkup, fill_gaps, verbose = getOption("pipapi.verbose")) { + # none selected if (fnrow(slkup) == 0 ) { return(list(data_present_in_master = NULL, lkup = slkup, povline = povline)) } - + # don't use cache if (getOption("pipapi.query_live_data")) { return(list(data_present_in_master = NULL, lkup = slkup, povline = povline)) } + # load cache # ZP new temp code to avoid error from load_inter_cache due to dbConnect master_file <- tryCatch( load_inter_cache(cache_file_path = cache_file_path, @@ -39,6 +41,7 @@ return_if_exists <- function(slkup, # master_file <- load_inter_cache(cache_file_path = cache_file_path, # fill_gaps = fill_gaps) + # if no cached files, return selected lkup if (fnrow(master_file) == 0) { return(list(data_present_in_master = NULL, lkup = slkup, @@ -121,8 +124,13 @@ return_if_exists <- function(slkup, verbose = 0, multiple = TRUE) + # now we have two dfs: lk_not_ms and data_present_in_master + # which gives the lkup rows not in cache (master_file), + # and the lkup rows in cache (master_file) + # If no data is present in master + # i.e. if no common rows between if (fnrow(data_present_in_master) == 0) { return(list(data_present_in_master = NULL, lkup = slkup, diff --git a/R/fg_pip.R b/R/fg_pip.R index 1f8c5404..7c5cbc02 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -51,7 +51,7 @@ fg_pip <- function(country, popshare = popshare) setDT(metadata) - # Return empty dataframe if no metadata is found + # Return empty dataframe if no metadata is found (i.e. all in cache) if (nrow(metadata) == 0) { print("ZP: no metadata - i.e. nothing additional to estimate") return(list(main_data = pipapi::empty_response_fg, diff --git a/R/utils.R b/R/utils.R index 9b4bd945..3ff1150d 100644 --- a/R/utils.R +++ b/R/utils.R @@ -289,7 +289,6 @@ get_svy_data <- function(svy_id, #' #' @return data.table #' @export -#' add_dist_stats <- function(df, lkup, fill_gaps) { if (fill_gaps) { From e57badab43867906882c01096dd8dbdbeabcca58 Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Wed, 13 Aug 2025 13:47:17 -0400 Subject: [PATCH 030/203] rm NA dist stats for FG --- R/pip.R | 69 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 37 insertions(+), 32 deletions(-) diff --git a/R/pip.R b/R/pip.R index dc180832..b7f313c6 100644 --- a/R/pip.R +++ b/R/pip.R @@ -139,31 +139,35 @@ pip <- function(country = "ALL", ) } - # Cache new data #--------------------------------------------- - cached_data <- out$data_in_cache - #print(out) - cached_data <- qDT(cached_data) + cached_data <- qDT(out$data_in_cache) main_data <- qDT(out$main_data) - # print(colnames(main_data)) - # print(colnames(cached_data)) - #print(setdiff(colnames(main_data), colnames(cached_data))) + print("A") + if (nrow(main_data) > 0) { - #print(is.data.table(cached_data)) + + print("B") + if (is.null(out$data_in_cache)) { + print("C") out <- main_data } else { + print("D") if (fill_gaps) { + print("E") cached_data <- fg_remove_duplicates(cached_data) } + print("F") out <- main_data |> rowbind(cached_data) } + print("G") update_master_file(main_data, cache_file_path, fill_gaps) } else { + print("H") out <- cached_data } if (!data.table::is.data.table(out)) { @@ -175,9 +179,8 @@ pip <- function(country = "ALL", # aggregate distributions ------------------ if (reporting_level %in% c("national", "all")) { out <- add_agg_stats( - df = out, - return_cols = lkup$return_cols$ag_average_poverty_stats - ) + df = out, + return_cols = lkup$return_cols$ag_average_poverty_stats) if (reporting_level == "national") { out <- out[reporting_level == "national"] } @@ -185,8 +188,9 @@ pip <- function(country = "ALL", # Add out of pipeline variablse #--------------------------------------------- - - add_vars_out_of_pipeline(out, fill_gaps = fill_gaps, lkup = lkup) + add_vars_out_of_pipeline(out, + fill_gaps = fill_gaps, + lkup = lkup) # **** TO BE REMOVED **** REMOVAL STARTS HERE # Once `pip-grp` has been integrated in ingestion pipeline @@ -199,11 +203,13 @@ pip <- function(country = "ALL", out <- pip_aggregate_by( df = out, group_lkup = lkup[["pop_region"]], - return_cols = lkup$return_cols$pip_grp - ) + return_cols = lkup$return_cols$pip_grp) + # Censor regional values if (censor) { - out <- censor_rows(out, lkup[["censored"]], type = "regions") + out <- censor_rows(out, + lkup[["censored"]], + type = "regions") } out <- out[, c("region_name", @@ -231,8 +237,7 @@ pip <- function(country = "ALL", df = out, lkup = lkup, fill_gaps = fill_gaps) - print("fooooooo") - return(out) + # Add aggregate medians ---------------- out <- add_agg_medians( df = out, @@ -242,20 +247,20 @@ pip <- function(country = "ALL", # format ---------------- - - if (fill_gaps) { - - ## Inequality indicators to NA for lineup years ---- - dist_vars <- names2keep[!(names2keep %in% crr_names)] - out[, - (dist_vars) := NA_real_] - - ## estimate_var ----- - out <- estimate_type_ctr_lnp(out, lkup) - - } else { - out[, estimate_type := NA_character_] - } +# ZP temp NA lineups + # if (fill_gaps) { + # + # ## Inequality indicators to NA for lineup years ---- + # dist_vars <- names2keep[!(names2keep %in% crr_names)] + # out[, + # (dist_vars) := NA_real_] + # + # ## estimate_var ----- + # out <- estimate_type_ctr_lnp(out, lkup) + # + # } else { + out[, estimate_type := NA_character_] + # } ## Handle survey coverage ------------ if (reporting_level != "all") { From 842d57f8254c99707d6d6280b788d0c3fb43417a Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Wed, 13 Aug 2025 13:47:48 -0400 Subject: [PATCH 031/203] fg_pip_old in pip_old --- R/pip_old.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pip_old.R b/R/pip_old.R index a6ed2c2b..52eaf0fb 100644 --- a/R/pip_old.R +++ b/R/pip_old.R @@ -113,7 +113,7 @@ pip_old <- function(country = "ALL", # mains estimates --------------- if (fill_gaps) { ## lineup years----------------- - out <- fg_pip( + out <- fg_pip_old( country = lcv$est_ctrs, year = year, povline = povline, From 1d0d4459ec6f25dc350ef08d52d741cc64ac2bca Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Wed, 13 Aug 2025 14:01:17 -0400 Subject: [PATCH 032/203] dist_stats_old fn for old pip --- R/pip_old.R | 2 +- R/pip_zp_old.R | 2 +- R/utils.R | 37 +++++++++++++++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 2 deletions(-) diff --git a/R/pip_old.R b/R/pip_old.R index 52eaf0fb..096b90ed 100644 --- a/R/pip_old.R +++ b/R/pip_old.R @@ -213,7 +213,7 @@ pip_old <- function(country = "ALL", crr_names <- names(out) # current variables names2keep <- lkup$return_cols$pip$cols # all variables - out <- add_dist_stats( + out <- add_dist_stats_old( df = out, dist_stats = lkup[["dist_stats"]] ) diff --git a/R/pip_zp_old.R b/R/pip_zp_old.R index 2278ccce..ee22541b 100644 --- a/R/pip_zp_old.R +++ b/R/pip_zp_old.R @@ -211,7 +211,7 @@ pip_zp_old <- function(country = "ALL", crr_names <- names(out) # current variables names2keep <- lkup$return_cols$pip$cols # all variables - out <- add_dist_stats( + out <- add_dist_stats_old( df = out, dist_stats = lkup[["dist_stats"]] ) diff --git a/R/utils.R b/R/utils.R index 3ff1150d..2be6a305 100644 --- a/R/utils.R +++ b/R/utils.R @@ -338,6 +338,43 @@ add_dist_stats <- function(df, lkup, fill_gaps) { df } + + +#' Add pre-computed distributional stats +#' +#' @param df data.table: Data frame of poverty statistics +#' @param dist_stats data.table: Distributional stats lookup +#' +#' @return data.table +#' @export +#' +add_dist_stats_old <- function(df, dist_stats) { + # Keep only relevant columns + cols <- c( + "cache_id", + # "country_code", + # "reporting_year", + # "welfare_type", + "reporting_level", + "gini", + "polarization", + "mld", + sprintf("decile%s", 1:10) + ) + dist_stats <- dist_stats[, .SD, .SDcols = cols] + + # merge dist stats with main table + # data.table::setnames(dist_stats, "survey_median_ppp", "median") + + df <- dist_stats[df, + on = .(cache_id, reporting_level), #.(country_code, reporting_year, welfare_type, reporting_level), + allow.cartesian = TRUE + ] + + return(df) +} + + #' Collapse rows #' @return data.table #' @noRd From b6b94427bd82ea754b67e0bfe908ea390153a9b6 Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Thu, 14 Aug 2025 09:39:43 -0400 Subject: [PATCH 033/203] add estimate type to fg --- R/pip.R | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/R/pip.R b/R/pip.R index b7f313c6..44ee4184 100644 --- a/R/pip.R +++ b/R/pip.R @@ -247,20 +247,21 @@ pip <- function(country = "ALL", # format ---------------- -# ZP temp NA lineups - # if (fill_gaps) { - # + + if (fill_gaps) { + # ZP temp NA lineups: + #--------------------- # ## Inequality indicators to NA for lineup years ---- # dist_vars <- names2keep[!(names2keep %in% crr_names)] # out[, # (dist_vars) := NA_real_] # - # ## estimate_var ----- - # out <- estimate_type_ctr_lnp(out, lkup) - # - # } else { - out[, estimate_type := NA_character_] - # } + ## estimate_var ----- + out <- estimate_type_ctr_lnp(out, lkup) + + } else { + out[, estimate_type := NA_character_] + } ## Handle survey coverage ------------ if (reporting_level != "all") { From fe4dc9530fb6556c8bd302e933ab52f5c91426f1 Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Thu, 14 Aug 2025 09:53:41 -0400 Subject: [PATCH 034/203] fix non-numeric surveyid_year for fg --- R/utils.R | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/R/utils.R b/R/utils.R index 2be6a305..65f2df74 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1203,8 +1203,8 @@ add_distribution_type <- function(df, lkup, fill_gaps) { dt[, # distribution type by year distribution_type := fcase(use_groupdata == 1, "group", - use_imputed == 1, "imputed", - default = "micro") + use_imputed == 1, "imputed", + default = "micro") ] dt <- dt[, # collapse by reporting_year and keep relevant variables @@ -1213,9 +1213,12 @@ add_distribution_type <- function(df, lkup, fill_gaps) { } - df[, - surveyid_year := as.numeric(surveyid_year) - ][dt, + if (!fill_gaps) { + df <- df[, + surveyid_year := as.numeric(surveyid_year) + ] + } + df[dt, on = by_vars, distribution_type := i.distribution_type ][, @@ -1404,6 +1407,7 @@ get_caller_names <- function() { #' @keywords internal #' @return data.table from pip or pip_grp functions. add_vars_out_of_pipeline <- function(out, fill_gaps, lkup) { + ## Add SPL and SPR --------------- out <- add_spl(df = out, fill_gaps = fill_gaps, @@ -1417,9 +1421,9 @@ add_vars_out_of_pipeline <- function(out, fill_gaps, lkup) { ## add distribution type ------------- # based on info in framework data, rather than welfare data - out <- add_distribution_type(df = out, - lkup = lkup, - fill_gaps = fill_gaps) + out <- add_distribution_type(df = out, + lkup = lkup, + fill_gaps = fill_gaps) invisible(out) } From 81e4816697fd9493583518d78568bc5af419dde8 Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Thu, 14 Aug 2025 10:59:12 -0400 Subject: [PATCH 035/203] add pip wrapper for new and old lineups --- R/pip.R | 141 ++++++++++++++++++++++---- R/pip_old.R | 32 +++--- R/{pip_zp_old.R => pip_old_lineups.R} | 12 +-- 3 files changed, 146 insertions(+), 39 deletions(-) rename R/{pip_zp_old.R => pip_old_lineups.R} (96%) diff --git a/R/pip.R b/R/pip.R index 44ee4184..7fe60b2b 100644 --- a/R/pip.R +++ b/R/pip.R @@ -1,24 +1,13 @@ + + #' Compute PIP statistics #' #' Compute the main PIP poverty and inequality statistics. #' -#' @param country character: Country ISO 3 codes -#' @param year integer: Reporting year -#' @param povline numeric: Poverty line -#' @param popshare numeric: Proportion of the population living below the -#' poverty line -#' @param fill_gaps logical: If set to TRUE, will interpolate / extrapolate -#' values for missing years -#' @param group_by character: Will return aggregated values for predefined -#' sub-groups -#' @param welfare_type character: Welfare type -#' @param reporting_level character: Geographical reporting level -#' @param ppp numeric: Custom Purchase Power Parity value -#' @param lkup list: A list of lkup tables -#' @param censor logical: Triggers censoring of country/year statistics -#' @param lkup_hash character: hash of pip -#' @param additional_ind logical: If TRUE add new set of indicators. Default if -#' FALSE +#' This function is a wrapper around the [pip_new_lineups] and [pip_old_lineups] +#' functions. +#' +#' @inheritParams pip_new_lineups #' #' @return data.table #' @examples @@ -67,6 +56,124 @@ pip <- function(country = "ALL", lkup_hash = lkup$cache_data_id$hash_pip, additional_ind = FALSE) { + # Should pip_old or pip_new be used? + #------------------------------------- + use_new <- lkup$use_new_lineup_version + + # Run correct function + #------------------------------------- + out <- if (use_new) { + pip_new_lineups(country = country, + year = year, + povline = povline, + popshare = popshare, + fill_gaps = fill_gaps, + group_by = group_by, + welfare_type = welfare_type, + reporting_level = reporting_level, + ppp = ppp, + lkup = lkup, + censor = censor, + lkup_hash = lkup_hash, + additional_ind = additional_ind) + } else { + pip_old_lineups(country = country, + year = year, + povline = povline, + popshare = popshare, + fill_gaps = fill_gaps, + group_by = group_by, + welfare_type = welfare_type, + reporting_level = reporting_level, + ppp = ppp, + lkup = lkup, + censor = censor, + lkup_hash = lkup_hash, + additional_ind = additional_ind) + } + + # Return + #------------------------------------- + out + +} + + + + + + + + +#' Compute PIP statistics +#' +#' Compute the main PIP poverty and inequality statistics. +#' +#' @param country character: Country ISO 3 codes +#' @param year integer: Reporting year +#' @param povline numeric: Poverty line +#' @param popshare numeric: Proportion of the population living below the +#' poverty line +#' @param fill_gaps logical: If set to TRUE, will interpolate / extrapolate +#' values for missing years +#' @param group_by character: Will return aggregated values for predefined +#' sub-groups +#' @param welfare_type character: Welfare type +#' @param reporting_level character: Geographical reporting level +#' @param ppp numeric: Custom Purchase Power Parity value +#' @param lkup list: A list of lkup tables +#' @param censor logical: Triggers censoring of country/year statistics +#' @param lkup_hash character: hash of pip +#' @param additional_ind logical: If TRUE add new set of indicators. Default if +#' FALSE +#' +#' @return data.table +#' @examples +#' \dontrun{ +#' # Create lkups +#' lkups <- create_lkups("") +#' +#' # A single country and year +#' pip_new_lineups(country = "AGO", +#' year = 2000, +#' povline = 1.9, +#' lkup = lkups) +#' +#' # All years for a single country +#' pip_new_lineups(country = "AGO", +#' year = "all", +#' povline = 1.9, +#' lkup = lkups) +#' +#' # Fill gaps +#' pip_new_lineups(country = "AGO", +#' year = "all", +#' povline = 1.9, +#' fill_gaps = TRUE, +#' lkup = lkups) +#' +#' # Group by regions +#' pip_new_lineups(country = "all", +#' year = "all", +#' povline = 1.9, +#' group_by = "wb", +#' lkup = lkups) +#' } +#' @export +pip_new_lineups <- function(country = "ALL", + year = "ALL", + povline = 1.9, + popshare = NULL, + fill_gaps = FALSE, + group_by = c("none", "wb"), + welfare_type = c("all", "consumption", "income"), + reporting_level = c("all", "national", "rural", "urban"), + ppp = NULL, + lkup, + censor = TRUE, + lkup_hash = lkup$cache_data_id$hash_pip, + additional_ind = FALSE) { + # set up ------------- welfare_type <- match.arg(welfare_type) diff --git a/R/pip_old.R b/R/pip_old.R index 096b90ed..3fc5cee7 100644 --- a/R/pip_old.R +++ b/R/pip_old.R @@ -27,26 +27,26 @@ #' lkups <- create_lkups("") #' #' # A single country and year -#' pip(country = "AGO", +#' pip_old(country = "AGO", #' year = 2000, #' povline = 1.9, #' lkup = lkups) #' #' # All years for a single country -#' pip(country = "AGO", +#' pip_old(country = "AGO", #' year = "all", #' povline = 1.9, #' lkup = lkups) #' #' # Fill gaps -#' pip(country = "AGO", +#' pip_old(country = "AGO", #' year = "all", #' povline = 1.9, #' fill_gaps = TRUE, #' lkup = lkups) #' #' # Group by regions -#' pip(country = "all", +#' pip_old(country = "all", #' year = "all", #' povline = 1.9, #' group_by = "wb", @@ -55,18 +55,18 @@ #' @export #' pip_old <- function(country = "ALL", - year = "ALL", - povline = 1.9, - popshare = NULL, - fill_gaps = FALSE, - group_by = c("none", "wb"), - welfare_type = c("all", "consumption", "income"), - reporting_level = c("all", "national", "rural", "urban"), - ppp = NULL, - lkup, - censor = TRUE, - lkup_hash = lkup$cache_data_id$hash_pip, - additional_ind = FALSE) { + year = "ALL", + povline = 1.9, + popshare = NULL, + fill_gaps = FALSE, + group_by = c("none", "wb"), + welfare_type = c("all", "consumption", "income"), + reporting_level = c("all", "national", "rural", "urban"), + ppp = NULL, + lkup, + censor = TRUE, + lkup_hash = lkup$cache_data_id$hash_pip, + additional_ind = FALSE) { # set up ------------- diff --git a/R/pip_zp_old.R b/R/pip_old_lineups.R similarity index 96% rename from R/pip_zp_old.R rename to R/pip_old_lineups.R index ee22541b..8994a3ba 100644 --- a/R/pip_zp_old.R +++ b/R/pip_old_lineups.R @@ -1,4 +1,4 @@ -#' Compute PIP statistics +#' Compute PIP statistics - Old lineups function #' #' Compute the main PIP poverty and inequality statistics. #' @@ -27,33 +27,33 @@ #' lkups <- create_lkups("") #' #' # A single country and year -#' pip(country = "AGO", +#' pip_old_lineups(country = "AGO", #' year = 2000, #' povline = 1.9, #' lkup = lkups) #' #' # All years for a single country -#' pip(country = "AGO", +#' pip_old_lineups(country = "AGO", #' year = "all", #' povline = 1.9, #' lkup = lkups) #' #' # Fill gaps -#' pip(country = "AGO", +#' pip_old_lineups(country = "AGO", #' year = "all", #' povline = 1.9, #' fill_gaps = TRUE, #' lkup = lkups) #' #' # Group by regions -#' pip(country = "all", +#' pip_old_lineups(country = "all", #' year = "all", #' povline = 1.9, #' group_by = "wb", #' lkup = lkups) #' } #' @export -pip_zp_old <- function(country = "ALL", +pip_old_lineups <- function(country = "ALL", year = "ALL", povline = 1.9, popshare = NULL, From 1081c77886d19c40e75b500671179a2c2c771c30 Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Thu, 14 Aug 2025 11:02:24 -0400 Subject: [PATCH 036/203] document lineups functions --- NAMESPACE | 4 + man/add_dist_stats.Rd | 2 +- man/add_dist_stats_old.Rd | 19 +++++ man/compute_fgt_dt.Rd | 2 +- man/compute_fgt_dt_old.Rd | 24 ++++++ man/fg_assign_nas_values_to_dup_cols_old.Rd | 19 +++++ man/fg_pip.Rd | 6 ++ man/fg_pip_old.Rd | 42 ++++++++++ man/fg_remove_duplicates_old.Rd | 25 ++++++ man/fg_standardize_cache_id_old.Rd | 21 +++++ man/load_data_list_old.Rd | 18 ++++ man/pip.Rd | 4 + man/pip_new_lineups.Rd | 91 +++++++++++++++++++++ man/pip_old.Rd | 8 +- man/pip_old_lineups.Rd | 91 +++++++++++++++++++++ man/rg_pip_old.Rd | 17 +++- man/use_new_lineup_version.Rd | 29 +++++++ 17 files changed, 415 insertions(+), 7 deletions(-) create mode 100644 man/add_dist_stats_old.Rd create mode 100644 man/compute_fgt_dt_old.Rd create mode 100644 man/fg_assign_nas_values_to_dup_cols_old.Rd create mode 100644 man/fg_pip_old.Rd create mode 100644 man/fg_remove_duplicates_old.Rd create mode 100644 man/fg_standardize_cache_id_old.Rd create mode 100644 man/load_data_list_old.Rd create mode 100644 man/pip_new_lineups.Rd create mode 100644 man/pip_old_lineups.Rd create mode 100644 man/use_new_lineup_version.Rd diff --git a/NAMESPACE b/NAMESPACE index 207921ed..5f8d1ce5 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,7 @@ # Generated by roxygen2: do not edit by hand export(add_dist_stats) +export(add_dist_stats_old) export(assign_serializer) export(change_grouped_stats_to_csv) export(citation_from_version) @@ -26,7 +27,9 @@ export(load_inter_cache) export(pip) export(pip_grp) export(pip_grp_logic) +export(pip_new_lineups) export(pip_old) +export(pip_old_lineups) export(pipgd_lorenz_curve) export(return_correct_version) export(return_if_exists) @@ -44,6 +47,7 @@ export(ui_pc_regional) export(ui_svy_meta) export(unnest_dt_longer) export(update_master_file) +export(use_new_lineup_version) export(valid_years) export(validate_input_grouped_stats) export(version_dataframe) diff --git a/man/add_dist_stats.Rd b/man/add_dist_stats.Rd index 8b9ba3bf..5ccd113c 100644 --- a/man/add_dist_stats.Rd +++ b/man/add_dist_stats.Rd @@ -4,7 +4,7 @@ \alias{add_dist_stats} \title{Add pre-computed distributional stats} \usage{ -add_dist_stats(df, dist_stats) +add_dist_stats(df, lkup, fill_gaps) } \arguments{ \item{df}{data.table: Data frame of poverty statistics} diff --git a/man/add_dist_stats_old.Rd b/man/add_dist_stats_old.Rd new file mode 100644 index 00000000..e59eb531 --- /dev/null +++ b/man/add_dist_stats_old.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{add_dist_stats_old} +\alias{add_dist_stats_old} +\title{Add pre-computed distributional stats} +\usage{ +add_dist_stats_old(df, dist_stats) +} +\arguments{ +\item{df}{data.table: Data frame of poverty statistics} + +\item{dist_stats}{data.table: Distributional stats lookup} +} +\value{ +data.table +} +\description{ +Add pre-computed distributional stats +} diff --git a/man/compute_fgt_dt.Rd b/man/compute_fgt_dt.Rd index 80504ccb..a57b0748 100644 --- a/man/compute_fgt_dt.Rd +++ b/man/compute_fgt_dt.Rd @@ -4,7 +4,7 @@ \alias{compute_fgt_dt} \title{Title} \usage{ -compute_fgt_dt(dt, welfare, weight, povlines) +compute_fgt_dt(dt, welfare, weight, povlines, mean_and_med = FALSE) } \arguments{ \item{dt}{data frame with \code{welfare} and \code{weight} columns} diff --git a/man/compute_fgt_dt_old.Rd b/man/compute_fgt_dt_old.Rd new file mode 100644 index 00000000..82c9babd --- /dev/null +++ b/man/compute_fgt_dt_old.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/rg_pip_old.R +\name{compute_fgt_dt_old} +\alias{compute_fgt_dt_old} +\title{Title} +\usage{ +compute_fgt_dt_old(dt, welfare, weight, povlines) +} +\arguments{ +\item{dt}{data frame with \code{welfare} and \code{weight} columns} + +\item{welfare}{character: welfare variable name} + +\item{weight}{character: weight variable name} + +\item{povlines}{double: vector with poveryt lines} +} +\value{ +data.table with estimates poverty estimates +} +\description{ +Title +} +\keyword{internal} diff --git a/man/fg_assign_nas_values_to_dup_cols_old.Rd b/man/fg_assign_nas_values_to_dup_cols_old.Rd new file mode 100644 index 00000000..da765eea --- /dev/null +++ b/man/fg_assign_nas_values_to_dup_cols_old.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fg_pip_old.R +\name{fg_assign_nas_values_to_dup_cols_old} +\alias{fg_assign_nas_values_to_dup_cols_old} +\title{OLD: Coerce variable causing potential duplicates to NAs} +\usage{ +fg_assign_nas_values_to_dup_cols_old(df, cols) +} +\arguments{ +\item{df}{data.table: Table of results created in \code{fg_pip()}} + +\item{cols}{character: Columns with potential duplicate values} +} +\value{ +data.table +} +\description{ +OLD: Coerce variable causing potential duplicates to NAs +} diff --git a/man/fg_pip.Rd b/man/fg_pip.Rd index 9012b01b..f83e836f 100644 --- a/man/fg_pip.Rd +++ b/man/fg_pip.Rd @@ -39,4 +39,10 @@ data.frame \description{ Compute the main PIP poverty and inequality statistics for imputed years. } +\keyword{#} +\keyword{Add:} +\keyword{ZP} +\keyword{data} \keyword{internal} +\keyword{load} +\keyword{refy} diff --git a/man/fg_pip_old.Rd b/man/fg_pip_old.Rd new file mode 100644 index 00000000..456989ea --- /dev/null +++ b/man/fg_pip_old.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fg_pip_old.R +\name{fg_pip_old} +\alias{fg_pip_old} +\title{Compute imputed year stats} +\usage{ +fg_pip_old( + country, + year, + povline, + popshare, + welfare_type, + reporting_level, + ppp, + lkup +) +} +\arguments{ +\item{country}{character: Country ISO 3 codes} + +\item{year}{integer: Reporting year} + +\item{povline}{numeric: Poverty line} + +\item{popshare}{numeric: Proportion of the population living below the +poverty line} + +\item{welfare_type}{character: Welfare type} + +\item{reporting_level}{character: Geographical reporting level} + +\item{ppp}{numeric: Custom Purchase Power Parity value} + +\item{lkup}{list: A list of lkup tables} +} +\value{ +data.frame +} +\description{ +Compute the main PIP poverty and inequality statistics for imputed years. +} +\keyword{internal} diff --git a/man/fg_remove_duplicates_old.Rd b/man/fg_remove_duplicates_old.Rd new file mode 100644 index 00000000..2190c4f1 --- /dev/null +++ b/man/fg_remove_duplicates_old.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fg_pip_old.R +\name{fg_remove_duplicates_old} +\alias{fg_remove_duplicates_old} +\title{OLD: Remove duplicated rows created during the interpolation process} +\usage{ +fg_remove_duplicates_old( + df, + cols = c("comparable_spell", "cpi", "display_cp", "gd_type", "path", + "predicted_mean_ppp", "survey_acronym", "survey_comparability", "survey_coverage", + "survey_id", "survey_mean_lcu", "survey_mean_ppp", "survey_median_lcu", + "survey_median_ppp", "survey_time", "survey_year", "surveyid_year") +) +} +\arguments{ +\item{df}{data.table: Table of results created in \code{fg_pip()}} + +\item{cols}{character: Columns with potential duplicate values} +} +\value{ +data.table +} +\description{ +OLD: Remove duplicated rows created during the interpolation process +} diff --git a/man/fg_standardize_cache_id_old.Rd b/man/fg_standardize_cache_id_old.Rd new file mode 100644 index 00000000..fd56a1b4 --- /dev/null +++ b/man/fg_standardize_cache_id_old.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fg_pip_old.R +\name{fg_standardize_cache_id_old} +\alias{fg_standardize_cache_id_old} +\title{OLD: Standardize cache_id format to avoid duplication of rows} +\usage{ +fg_standardize_cache_id_old(cache_id, interpolation_id, reporting_level) +} +\arguments{ +\item{cache_id}{character} + +\item{interpolation_id}{character} + +\item{reporting_level}{character} +} +\value{ +character +} +\description{ +OLD: Standardize cache_id format to avoid duplication of rows +} diff --git a/man/load_data_list_old.Rd b/man/load_data_list_old.Rd new file mode 100644 index 00000000..b6ea172f --- /dev/null +++ b/man/load_data_list_old.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/rg_pip_old.R +\name{load_data_list_old} +\alias{load_data_list_old} +\title{OLD: load survey year files and store them in a list} +\usage{ +load_data_list_old(metadata) +} +\arguments{ +\item{metadata}{data frame from \code{subset_lkup()}} +} +\value{ +list with survey years data +} +\description{ +OLD: load survey year files and store them in a list +} +\keyword{internal} diff --git a/man/pip.Rd b/man/pip.Rd index 3e31f031..0161eeef 100644 --- a/man/pip.Rd +++ b/man/pip.Rd @@ -57,6 +57,10 @@ data.table \description{ Compute the main PIP poverty and inequality statistics. } +\details{ +This function is a wrapper around the \link{pip_new_lineups} and \link{pip_old_lineups} +functions. +} \examples{ \dontrun{ # Create lkups diff --git a/man/pip_new_lineups.Rd b/man/pip_new_lineups.Rd new file mode 100644 index 00000000..eded364f --- /dev/null +++ b/man/pip_new_lineups.Rd @@ -0,0 +1,91 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pip.R +\name{pip_new_lineups} +\alias{pip_new_lineups} +\title{Compute PIP statistics} +\usage{ +pip_new_lineups( + country = "ALL", + year = "ALL", + povline = 1.9, + popshare = NULL, + fill_gaps = FALSE, + group_by = c("none", "wb"), + welfare_type = c("all", "consumption", "income"), + reporting_level = c("all", "national", "rural", "urban"), + ppp = NULL, + lkup, + censor = TRUE, + lkup_hash = lkup$cache_data_id$hash_pip, + additional_ind = FALSE +) +} +\arguments{ +\item{country}{character: Country ISO 3 codes} + +\item{year}{integer: Reporting year} + +\item{povline}{numeric: Poverty line} + +\item{popshare}{numeric: Proportion of the population living below the +poverty line} + +\item{fill_gaps}{logical: If set to TRUE, will interpolate / extrapolate +values for missing years} + +\item{group_by}{character: Will return aggregated values for predefined +sub-groups} + +\item{welfare_type}{character: Welfare type} + +\item{reporting_level}{character: Geographical reporting level} + +\item{ppp}{numeric: Custom Purchase Power Parity value} + +\item{lkup}{list: A list of lkup tables} + +\item{censor}{logical: Triggers censoring of country/year statistics} + +\item{lkup_hash}{character: hash of pip} + +\item{additional_ind}{logical: If TRUE add new set of indicators. Default if +FALSE} +} +\value{ +data.table +} +\description{ +Compute the main PIP poverty and inequality statistics. +} +\examples{ +\dontrun{ +# Create lkups +lkups <- create_lkups("") + +# A single country and year +pip_new_lineups(country = "AGO", + year = 2000, + povline = 1.9, + lkup = lkups) + +# All years for a single country +pip_new_lineups(country = "AGO", + year = "all", + povline = 1.9, + lkup = lkups) + +# Fill gaps +pip_new_lineups(country = "AGO", + year = "all", + povline = 1.9, + fill_gaps = TRUE, + lkup = lkups) + +# Group by regions +pip_new_lineups(country = "all", + year = "all", + povline = 1.9, + group_by = "wb", + lkup = lkups) +} +} diff --git a/man/pip_old.Rd b/man/pip_old.Rd index 2c1ff5a2..1a725396 100644 --- a/man/pip_old.Rd +++ b/man/pip_old.Rd @@ -63,26 +63,26 @@ Compute the main PIP poverty and inequality statistics. lkups <- create_lkups("") # A single country and year -pip(country = "AGO", +pip_old(country = "AGO", year = 2000, povline = 1.9, lkup = lkups) # All years for a single country -pip(country = "AGO", +pip_old(country = "AGO", year = "all", povline = 1.9, lkup = lkups) # Fill gaps -pip(country = "AGO", +pip_old(country = "AGO", year = "all", povline = 1.9, fill_gaps = TRUE, lkup = lkups) # Group by regions -pip(country = "all", +pip_old(country = "all", year = "all", povline = 1.9, group_by = "wb", diff --git a/man/pip_old_lineups.Rd b/man/pip_old_lineups.Rd new file mode 100644 index 00000000..1a4dbb71 --- /dev/null +++ b/man/pip_old_lineups.Rd @@ -0,0 +1,91 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pip_old_lineups.R +\name{pip_old_lineups} +\alias{pip_old_lineups} +\title{Compute PIP statistics - Old lineups function} +\usage{ +pip_old_lineups( + country = "ALL", + year = "ALL", + povline = 1.9, + popshare = NULL, + fill_gaps = FALSE, + group_by = c("none", "wb"), + welfare_type = c("all", "consumption", "income"), + reporting_level = c("all", "national", "rural", "urban"), + ppp = NULL, + lkup, + censor = TRUE, + lkup_hash = lkup$cache_data_id$hash_pip, + additional_ind = FALSE +) +} +\arguments{ +\item{country}{character: Country ISO 3 codes} + +\item{year}{integer: Reporting year} + +\item{povline}{numeric: Poverty line} + +\item{popshare}{numeric: Proportion of the population living below the +poverty line} + +\item{fill_gaps}{logical: If set to TRUE, will interpolate / extrapolate +values for missing years} + +\item{group_by}{character: Will return aggregated values for predefined +sub-groups} + +\item{welfare_type}{character: Welfare type} + +\item{reporting_level}{character: Geographical reporting level} + +\item{ppp}{numeric: Custom Purchase Power Parity value} + +\item{lkup}{list: A list of lkup tables} + +\item{censor}{logical: Triggers censoring of country/year statistics} + +\item{lkup_hash}{character: hash of pip} + +\item{additional_ind}{logical: If TRUE add new set of indicators. Default if +FALSE} +} +\value{ +data.table +} +\description{ +Compute the main PIP poverty and inequality statistics. +} +\examples{ +\dontrun{ +# Create lkups +lkups <- create_lkups("") + +# A single country and year +pip_old_lineups(country = "AGO", + year = 2000, + povline = 1.9, + lkup = lkups) + +# All years for a single country +pip_old_lineups(country = "AGO", + year = "all", + povline = 1.9, + lkup = lkups) + +# Fill gaps +pip_old_lineups(country = "AGO", + year = "all", + povline = 1.9, + fill_gaps = TRUE, + lkup = lkups) + +# Group by regions +pip_old_lineups(country = "all", + year = "all", + povline = 1.9, + group_by = "wb", + lkup = lkups) +} +} diff --git a/man/rg_pip_old.Rd b/man/rg_pip_old.Rd index bee24977..d5cea86f 100644 --- a/man/rg_pip_old.Rd +++ b/man/rg_pip_old.Rd @@ -1,9 +1,20 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pip_old.R +% Please edit documentation in R/pip_old.R, R/rg_pip_old.R \name{rg_pip_old} \alias{rg_pip_old} \title{Compute survey year stats} \usage{ +rg_pip_old( + country, + year, + povline, + popshare, + welfare_type, + reporting_level, + ppp, + lkup +) + rg_pip_old( country, year, @@ -34,9 +45,13 @@ poverty line} \item{lkup}{list: A list of lkup tables} } \value{ +data.frame + data.frame } \description{ +Compute the main PIP poverty and inequality statistics for survey years. + Compute the main PIP poverty and inequality statistics for survey years. } \keyword{internal} diff --git a/man/use_new_lineup_version.Rd b/man/use_new_lineup_version.Rd new file mode 100644 index 00000000..59360d6c --- /dev/null +++ b/man/use_new_lineup_version.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_lkups.R +\name{use_new_lineup_version} +\alias{use_new_lineup_version} +\title{Should the new lineup approach be used?} +\usage{ +use_new_lineup_version(x) +} +\arguments{ +\item{x}{A character vector where each element starts with an +8-digit date in the format \code{YYYYMMDD}.} +} +\value{ +A logical vector: \code{TRUE} if the extracted date is +after May 1st, 2025, otherwise \code{FALSE}. +} +\description{ +Check if the date in a string is more recent than May 2025 +} +\details{ +This function extracts the first 8 characters from an input string, +interprets them as a date in the format \code{YYYYMMDD}, and checks +whether this date is more recent than May 1st, 2025. +} +\examples{ +use_new_lineup_version("20250401_2021_01_02_PROD") # FALSE +use_new_lineup_version("20250615_2021_01_02_PROD") # TRUE + +} From 68a4684c76de62ffb283c7dfb9b069a2210fc77c Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Thu, 14 Aug 2025 12:11:37 -0400 Subject: [PATCH 037/203] document --- man/ui_cp_charts.Rd | 2 +- man/ui_cp_poverty_charts.Rd | 2 +- man/ui_hp_countries.Rd | 2 +- man/ui_pc_charts.Rd | 2 +- man/ui_pc_regional.Rd | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/man/ui_cp_charts.Rd b/man/ui_cp_charts.Rd index 010a71b1..03ea2a00 100644 --- a/man/ui_cp_charts.Rd +++ b/man/ui_cp_charts.Rd @@ -7,7 +7,7 @@ ui_cp_charts( country = "AGO", povline = 1.9, - pop_units = 1e+06, + pop_units = 1000000, lkup, lkup_hash = lkup$cache_data_id$hash_ui_cp ) diff --git a/man/ui_cp_poverty_charts.Rd b/man/ui_cp_poverty_charts.Rd index 3b211f31..dcfbb943 100644 --- a/man/ui_cp_poverty_charts.Rd +++ b/man/ui_cp_poverty_charts.Rd @@ -4,7 +4,7 @@ \alias{ui_cp_poverty_charts} \title{CP Poverty Charts} \usage{ -ui_cp_poverty_charts(country, povline, pop_units, lkup) +ui_cp_poverty_charts(country, povline, pop_units = 1000000, lkup) } \arguments{ \item{country}{character: Country ISO 3 codes} diff --git a/man/ui_hp_countries.Rd b/man/ui_hp_countries.Rd index 2ffcf443..760f458e 100644 --- a/man/ui_hp_countries.Rd +++ b/man/ui_hp_countries.Rd @@ -7,7 +7,7 @@ ui_hp_countries( country = c("IDN", "CIV"), povline = 1.9, - pop_units = 1e+06, + pop_units = 1000000, lkup ) } diff --git a/man/ui_pc_charts.Rd b/man/ui_pc_charts.Rd index 49715b48..438dfca9 100644 --- a/man/ui_pc_charts.Rd +++ b/man/ui_pc_charts.Rd @@ -12,7 +12,7 @@ ui_pc_charts( group_by = "none", welfare_type = c("all", "consumption", "income"), reporting_level = c("all", "national", "rural", "urban"), - pop_units = 1e+06, + pop_units = 1000000, lkup ) } diff --git a/man/ui_pc_regional.Rd b/man/ui_pc_regional.Rd index 083e418b..3cbde341 100644 --- a/man/ui_pc_regional.Rd +++ b/man/ui_pc_regional.Rd @@ -8,7 +8,7 @@ ui_pc_regional( country = "ALL", year = "ALL", povline = 1.9, - pop_units = 1e+06, + pop_units = 1000000, lkup ) } From b69b1c62a8a9190977ad65423ba01cd864b5fe4b Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Thu, 14 Aug 2025 12:12:09 -0400 Subject: [PATCH 038/203] Increment version number to 1.3.19.9000 --- DESCRIPTION | 2 +- NEWS.md | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 9f6f1693..ceb175a6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: pipapi Title: API for the Poverty and Inequality Platform -Version: 1.3.19 +Version: 1.3.19.9000 Authors@R: c(person(given = "Tony", family = "Fujs", diff --git a/NEWS.md b/NEWS.md index 55099b39..50441fb5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,5 @@ +# pipapi (development version) + # pipapi 1.3.19 * fix issue with comparability From d325cec19e0d53fd38b9f7e2d587f9ac9359d402 Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Thu, 14 Aug 2025 12:17:01 -0400 Subject: [PATCH 039/203] rm setting use_new_lneups to TRUE --- R/create_lkups.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/R/create_lkups.R b/R/create_lkups.R index fdfa31ed..09da3a0c 100644 --- a/R/create_lkups.R +++ b/R/create_lkups.R @@ -75,8 +75,6 @@ create_lkups <- function(data_dir, versions) { # Use new lineup approach? ----- use_new_lineup_version <- use_new_lineup_version(versions) - # ZP temp - use_new_lineup_version <- TRUE # Get survey paths ---- paths <- list.files(fs::path(data_dir, "survey_data")) From f17a5cc1179fb74440127851743f8148a7d77fa8 Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Thu, 14 Aug 2025 15:34:08 -0400 Subject: [PATCH 040/203] rm ref_lkup from new fg --- R/fg_pip.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/fg_pip.R b/R/fg_pip.R index 7c5cbc02..d1df4c90 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -17,7 +17,6 @@ fg_pip <- function(country, valid_regions <- lkup$query_controls$region$values interpolation_list <- lkup$interpolation_list data_dir <- lkup$data_root - ref_lkup <- lkup$ref_lkup # the normal refy table, some country-years have two rows (interpolation) refy_lkup <- lkup$refy_lkup # cleaned refy table, unique by country-years but some columns removed in order to do that cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb") @@ -79,7 +78,7 @@ fg_pip <- function(country, funique() } dtemp <- - ref_lkup |> + refy_lkup |> fsubset(country_code %in% cntry & reporting_year %in% yr) |> fsubset(reporting_year %in% lkup$valid_years$lineup_years) |> From ed57649d9f216b9c5fa958c1b2e23de6270e140e Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Thu, 14 Aug 2025 15:36:57 -0400 Subject: [PATCH 041/203] fix lkups for old fn --- R/create_lkups.R | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/R/create_lkups.R b/R/create_lkups.R index 09da3a0c..531f7933 100644 --- a/R/create_lkups.R +++ b/R/create_lkups.R @@ -73,6 +73,7 @@ extract_data_dirs <- #' @return list create_lkups <- function(data_dir, versions) { + # Use new lineup approach? ----- use_new_lineup_version <- use_new_lineup_version(versions) @@ -527,8 +528,11 @@ create_lkups <- function(data_dir, versions) { # CREATE OBJECT: valid_years ---- valid_years <- valid_years(data_dir) - valid_years <- c(valid_years, - lineup_years) # add lineup years + if (use_new_lineup_version) { + valid_years <- c(valid_years, + lineup_years) # add lineup years + + } # CREATE OBJECT: query_controls ---- # Create list of query controls @@ -636,7 +640,6 @@ create_lkups <- function(data_dir, versions) { lkup <- list( svy_lkup = svy_lkup, ref_lkup = ref_lkup, - refy_lkup = refy_lkup, dist_stats = dist_stats, pop_region = pop_region, cp_lkups = cp_lkups, @@ -650,9 +653,13 @@ create_lkups <- function(data_dir, versions) { interpolation_list = interpolation_list, valid_years = valid_years, cache_data_id = cache_data_id, - lineup_dist_stats = lineup_dist_stats, use_new_lineup_version = use_new_lineup_version) + if (use_new_lineup_version) { + lkup$refy_lkup <- refy_lkup + lkup$lineup_dist_stats <- lineup_dist_stats + } + return(lkup) } From f561be821c825e51065a7b95f2e07bc9384aa73e Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Mon, 18 Aug 2025 15:53:27 -0400 Subject: [PATCH 042/203] full list function, incl rm cache country-year --- R/fg_pip.R | 135 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 91 insertions(+), 44 deletions(-) diff --git a/R/fg_pip.R b/R/fg_pip.R index d1df4c90..a53f2478 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -57,47 +57,11 @@ fg_pip <- function(country, data_in_cache = data_present_in_master)) } + full_list <- create_full_list(country = country, + year = year, + refy_lkup = refy_lkup, + data_present_in_master = data_present_in_master) - #' # ZP Add: load refy data - #------------------------- - # Extract unique combinations of country-year - if (any(c("ALL", "WLD") %in% country)) { - cntry <- refy_lkup$country_code |> - funique() - } else { - cntry <- refy_lkup[country_code %in% country, - ]$country_code |> - funique() - } - if (any(c("ALL") %in% year)) { - yr <- refy_lkup$reporting_year |> - unique() - } else { - yr <- refy_lkup[reporting_year %in% year, - ]$reporting_year |> - funique() - } - dtemp <- - refy_lkup |> - fsubset(country_code %in% cntry & - reporting_year %in% yr) |> - fsubset(reporting_year %in% lkup$valid_years$lineup_years) |> - fselect(country_code, - year = reporting_year) |> - funique() - - # Split years by country - full_list <- dtemp[, - .(year = list(year)), - by = country_code][ - , .(country_code, year = year) - ] - - # Convert to desired structure - full_list <- list( - country_code = full_list$country_code, - year = lapply(full_list$year, - as.numeric)) lt <- pipdata::load_list_refy(input_list = full_list, path = fs::path(data_dir, @@ -155,10 +119,14 @@ fg_pip <- function(country, #------------------------- metadata[, file := basename(path)] + print(metadata) + # TO BE REMOVED, ONLY FOR TESTING!!! rlang::env_poke(env = globalenv(), - nm = "metadata_check", + nm = "metadata_check2", value = metadata) + metadata <- copy(metadata_check2) + #stop("---------------") # try metadata unique code tmp_metadata <- metadata # Handle multiple distribution types (for aggregated distributions) @@ -178,17 +146,21 @@ fg_pip <- function(country, } else { NA }}), - by = reporting_year, .SDcols = meta_vars] + by = c("reporting_year", "country_code", "reporting_level", "welfare_type"), + .SDcols = meta_vars] # Remove duplicate rows by reporting_year (keep only one row per # reporting_year) - tmp_metadata_unique <- unique(tmp_metadata, by = "reporting_year") + tmp_metadata_unique <- funique(tmp_metadata) tmp_metadata_unique[, file := paste0(country_code, "_", reporting_year)] + # rlang::env_poke(env = globalenv(), + # nm = "tmp_metadata_unique_check", + # value = tmp_metadata_unique) rlang::env_poke(env = globalenv(), - nm = "tmp_metadata_unique_check", + nm = "tmp_metadata_unique_check2", value = tmp_metadata_unique) rlang::env_poke(env = globalenv(), nm = "res_final", @@ -236,6 +208,13 @@ fg_pip <- function(country, out[, max_year := NULL] } + # in_cache <- data_present_in_master |> + # fmutate(file = paste(country_code, reporting_year, sep = "_")) |> + # fselect(file) |> + # reg_elem() + # # rm rows in out that are in cache + # out <- out[!(file %in% in_cache)] + return(list(main_data = out, data_in_cache = data_present_in_master)) @@ -319,3 +298,71 @@ fg_assign_nas_values_to_dup_cols <- function(df, df[, (cols) := NA] return(df) } + + + + + + +#' Create full list for fg data load, not including country-years in cache +#' +#' @param country Country selected in [fg_pip] function +#' @param year Year/s selected in [fg_pip] function +#' @param refy_lkup reference year lkup table with full lineups and new method +#' @param data_present_in_master cache data +#' +#' @return list +create_full_list <- function(country, year, refy_lkup, data_present_in_master) { + + if (!is.null(data_present_in_master)) { + data_not_in_cache <- + joyn::anti_join(x = refy_lkup, + y = data_present_in_master, + by = c("country_code", "reporting_year")) + } else { + data_not_in_cache <- refy_lkup + } + + # Extract unique combinations of country-year + if (any(c("ALL", "WLD") %in% country)) { + cntry <- data_not_in_cache$country_code |> + funique() + } else { + cntry <- data_not_in_cache[country_code %in% country, + ]$country_code |> + funique() + } + if (any(c("ALL") %in% year)) { + yr <- data_not_in_cache$reporting_year |> + unique() + } else { + yr <- data_not_in_cache[reporting_year %in% year, + ]$reporting_year |> + funique() + } + dtemp <- + data_not_in_cache |> + fsubset(country_code %in% cntry & + reporting_year %in% yr) |> + fsubset(reporting_year %in% lkup$valid_years$lineup_years) |> + fselect(country_code, + year = reporting_year) |> + funique() + + # Split years by country + full_list <- dtemp[, + .(year = list(year)), + by = country_code][ + , .(country_code, year = year) + ] + + # Convert to desired structure + full_list <- list( + country_code = full_list$country_code, + year = lapply(full_list$year, + as.numeric)) + + + full_list + +} From 5394ac494801a7f5b64a9b5b18551250c0615306 Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Mon, 18 Aug 2025 16:22:38 -0400 Subject: [PATCH 043/203] comment out env save checks --- R/fg_pip.R | 60 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/R/fg_pip.R b/R/fg_pip.R index a53f2478..58993206 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -82,9 +82,9 @@ fg_pip <- function(country, x }) - rlang::env_poke(env = globalenv(), - nm = "pipdata_list", - value = lt) + # rlang::env_poke(env = globalenv(), + # nm = "pipdata_list", + # value = lt) # ZP Add: do fgt estimations using `res <- lapply(lt, process_dt, povline = povline)` #------------------------- @@ -95,10 +95,10 @@ fg_pip <- function(country, res <- rbindlist(res, fill = TRUE) - # TO BE REMOVED, ONLY FOR TESTING!!! - rlang::env_poke(env = globalenv(), - nm = "res_povest", - value = res) + # # TO BE REMOVED, ONLY FOR TESTING!!! + # rlang::env_poke(env = globalenv(), + # nm = "res_povest", + # value = res) # ZP Add: join to dist_stats #------------------------- @@ -122,10 +122,10 @@ fg_pip <- function(country, print(metadata) # TO BE REMOVED, ONLY FOR TESTING!!! - rlang::env_poke(env = globalenv(), - nm = "metadata_check2", - value = metadata) - metadata <- copy(metadata_check2) + # rlang::env_poke(env = globalenv(), + # nm = "metadata_check2", + # value = metadata) + # metadata <- copy(metadata_check2) #stop("---------------") # try metadata unique code tmp_metadata <- metadata @@ -159,12 +159,12 @@ fg_pip <- function(country, # rlang::env_poke(env = globalenv(), # nm = "tmp_metadata_unique_check", # value = tmp_metadata_unique) - rlang::env_poke(env = globalenv(), - nm = "tmp_metadata_unique_check2", - value = tmp_metadata_unique) - rlang::env_poke(env = globalenv(), - nm = "res_final", - value = res) + # rlang::env_poke(env = globalenv(), + # nm = "tmp_metadata_unique_check2", + # value = tmp_metadata_unique) + # rlang::env_poke(env = globalenv(), + # nm = "res_final", + # value = res) out <- join(res, tmp_metadata_unique, @@ -177,9 +177,9 @@ fg_pip <- function(country, drop.dup.cols = TRUE, verbose = 0) - rlang::env_poke(env = globalenv(), - nm = "out_check", - value = out) + # rlang::env_poke(env = globalenv(), + # nm = "out_check", + # value = out) out[, `:=`( file = NULL )] @@ -188,14 +188,14 @@ fg_pip <- function(country, "povline", "poverty_line") - rlang::env_poke(env = globalenv(), - nm = "out1", - value = out) + # rlang::env_poke(env = globalenv(), + # nm = "out1", + # value = out) # Ensure that out does not have duplicates out <- fg_remove_duplicates(out) - rlang::env_poke(env = globalenv(), - nm = "out2", - value = out) + # rlang::env_poke(env = globalenv(), + # nm = "out2", + # value = out) # Fix issue with rounding of poverty lines out[, @@ -316,9 +316,11 @@ create_full_list <- function(country, year, refy_lkup, data_present_in_master) { if (!is.null(data_present_in_master)) { data_not_in_cache <- - joyn::anti_join(x = refy_lkup, - y = data_present_in_master, - by = c("country_code", "reporting_year")) + joyn::anti_join(x = refy_lkup, + y = data_present_in_master, + by = c("country_code", "reporting_year"), + reportvar = FALSE, + verbose = FALSE) } else { data_not_in_cache <- refy_lkup } From 7a1ae757cd0b0558c0f9e9fb454e13eebd17a853 Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Mon, 18 Aug 2025 16:23:20 -0400 Subject: [PATCH 044/203] Increment version number to 1.3.19.9001 --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index ceb175a6..a758dd4f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: pipapi Title: API for the Poverty and Inequality Platform -Version: 1.3.19.9000 +Version: 1.3.19.9001 Authors@R: c(person(given = "Tony", family = "Fujs", From 129aa4142e0a60aca2db309c2fc5a5929a576367 Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Wed, 20 Aug 2025 15:39:12 -0400 Subject: [PATCH 045/203] make dist stats NA for fg pip --- R/pip.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/pip.R b/R/pip.R index 7fe60b2b..bbd9e04f 100644 --- a/R/pip.R +++ b/R/pip.R @@ -359,10 +359,10 @@ pip_new_lineups <- function(country = "ALL", # ZP temp NA lineups: #--------------------- # ## Inequality indicators to NA for lineup years ---- - # dist_vars <- names2keep[!(names2keep %in% crr_names)] - # out[, - # (dist_vars) := NA_real_] - # + dist_vars <- names2keep[!(names2keep %in% crr_names)] + out[, + (dist_vars) := NA_real_] + ## estimate_var ----- out <- estimate_type_ctr_lnp(out, lkup) From fa602a0589d4050085a407bf7456367b78828158 Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Thu, 21 Aug 2025 23:12:55 -0400 Subject: [PATCH 046/203] new attr to cols functions --- R/add_attributes_as_columns.R | 229 ++++++++++++++++++++++++++++++++++ R/fg_pip.R | 55 ++++---- 2 files changed, 255 insertions(+), 29 deletions(-) create mode 100644 R/add_attributes_as_columns.R diff --git a/R/add_attributes_as_columns.R b/R/add_attributes_as_columns.R new file mode 100644 index 00000000..2617f1d1 --- /dev/null +++ b/R/add_attributes_as_columns.R @@ -0,0 +1,229 @@ +#' Add attributes as columns (vectorized, in-place) +#' +#' @description +#' Converts survey attributes on a `data.table`—including +#' `reporting_level_rows`, `country_code`, `reporting_year`, and `dist_stats`— +#' into columns using a **loop-free, segment-replication** strategy. Designed +#' for very large tables and objects loaded via `readRDS()`/`load()`: +#' uses `setDT()` and `alloc.col()` to ensure in-place assignment. +#' +#' @details +#' The function expects an attribute `reporting_level_rows`, a list with: +#' - `reporting_level`: character vector of the level label for each segment +#' (e.g., `c("rural","urban","rural", ...)`). +#' - `rows`: integer vector of **cumulative** row-ends (e.g., +#' `c(100000, 200000, 300000, ...)`). +#' +#' Segment lengths are computed as `diff(c(0L, rows))`, and `reporting_level` +#' is replicated with `rep.int(lev, counts)`. Constants `country_code`, +#' `reporting_year`, and `file` (`paste0(country_code, "_", reporting_year)`) +#' are added to all rows. If `dist_stats$mean` / `dist_stats$median` are +#' provided (as named vectors/lists keyed by level), they are mapped by level +#' name and replicated per segment. If a level is missing from the names, +#' `NA` values may result for that segment. +#' +#' This implementation avoids loops and `findInterval()` edge cases, and +#' modifies `dt` by reference. +#' +#' @param dt A `data.table` carrying the attributes described above. +#' +#' @return The same `data.table`, modified by reference, with added columns: +#' `reporting_level`, `country_code`, `reporting_year`, `file`, and (if +#' present) `mean`, `median`. +#' +#' @section Assumptions: +#' * `length(reporting_level_rows$reporting_level) == length(reporting_level_rows$rows)`. +#' * `rows` are cumulative and non-decreasing, and their segment lengths sum to `nrow(dt)`. +#' * If `dist_stats$mean` / `dist_stats$median` have multiple values, their names +#' align with the level labels. +#' +#' @note For objects loaded from disk (e.g., via `readRDS()`), `alloc.col(dt)` +#' ensures there is spare column capacity for by-reference assignment. +#' +#' @seealso [add_attributes_as_columns_multi()], [assign_stat()] +#' +#' @examples +#' \dontrun{ +#' library(data.table) +#' dt <- data.table(weight = 1:6, welfare = runif(6)) +#' attr(dt, "reporting_level_rows") <- list( +#' reporting_level = c("rural","urban","rural"), +#' rows = c(2L, 4L, 6L) +#' ) +#' attr(dt, "country_code") <- "XXY" +#' attr(dt, "reporting_year") <- 2000L +#' attr(dt, "dist_stats") <- list( +#' mean = list(rural = 2.5, urban = 5.0), +#' median = list(rural = 2.0, urban = 4.5) +#' ) +#' +#' add_attributes_as_columns_vectorized(dt) +#' head(dt) +#' } +#' +#' @import data.table +#' @export +add_attributes_as_columns_vectorized <- function(dt) { + # Ensure proper internal state & spare column capacity (handles readRDS/load cases) + setDT(dt) # harmless if already a data.table + alloc.col(dt) # pre-allocate room for new columns + + rl <- attr(dt, "reporting_level_rows") + lev <- rl$reporting_level + rows <- as.integer(rl$rows) + n <- nrow(dt) + + counts <- diff(c(0L, rows)) + if (sum(counts) != n) stop("Sum of 'rows' in attribute does not equal nrow(dt).") + + # reporting_level: vectorized, no loops, no findInterval edge cases + dt[, reporting_level := rep.int(lev, counts)] + + # constants + cc <- attr(dt, "country_code") + ry <- attr(dt, "reporting_year") + dt[, `:=`( + country_code = cc, + reporting_year = ry, + file = paste0(cc, "_", ry) + )] + + # dist_stats per reporting_level (align by names, then replicate by counts) + ds <- attr(dt, "dist_stats") + if (length(ds)) { + if (!is.null(ds$mean)) { + m <- unlist(ds$mean, use.names = TRUE) + dt[, mean := rep.int(unname(m[match(lev, names(m))]), counts)] + } + if (!is.null(ds$median)) { + md <- unlist(ds$median, use.names = TRUE) + dt[, median := rep.int(unname(md[match(lev, names(md))]), counts)] + } + } + + dt +} + + + +#' Add attributes as columns for multi-segment reporting levels +#' +#' @description +#' Converts attributes on a survey `data.table` (e.g., `reporting_level_rows`, +#' `country_code`, `reporting_year`, and `dist_stats`) into columns, handling +#' **multiple alternating segments** (e.g., CHN rural/urban/rural/urban) or +#' single-segment cases (e.g., ZAF). +#' +#' @param dt A `data.table` with attributes: +#' - `reporting_level_rows`: list with `reporting_level` (character) and +#' `rows` (integer cumulative row ends). +#' - `country_code` (character). +#' - `reporting_year` (integer/numeric). +#' - `dist_stats` (list) optionally containing `mean` and/or `median`, each as +#' a named list/vector keyed by reporting level, or a single scalar. +#' +#' @return The same `data.table`, modified by reference, with new columns: +#' `reporting_level`, `country_code`, `reporting_year`, `file`, and +#' optionally `mean`, `median`. +#' +#' @examples +#' # chn2000_cols <- add_attributes_as_columns_multi(chn2000) +#' # zaf2000_cols <- add_attributes_as_columns_multi(zaf2000) +#' @import data.table +#' @export +add_attributes_as_columns_multi <- function(dt) { + # Ensure DT internals and spare capacity for new columns + setDT(dt) + alloc.col(dt) + + # --- Pull + validate segment metadata --- + rl <- attr(dt, "reporting_level_rows") + if (is.null(rl) || is.null(rl$reporting_level) || is.null(rl$rows)) { + stop("Missing 'reporting_level_rows' attribute with $reporting_level and $rows.") + } + lev <- as.character(rl$reporting_level) + rows <- as.integer(rl$rows) + n <- nrow(dt) + + if (length(lev) != length(rows)) stop("'reporting_level' and 'rows' lengths differ.") + if (length(rows) == 0L) stop("'rows' is empty.") + if (any(diff(rows) < 0L)) stop("'rows' must be non-decreasing.") + if (rows[length(rows)] != n) stop("Last element of 'rows' must equal nrow(dt).") + + counts <- diff(c(0L, rows)) + if (any(counts <= 0L)) stop("Computed non-positive segment length(s).") + + # --- reporting_level: vectorized per-segment replication --- + dt[, reporting_level := rep.int(lev, counts)] + + # --- constants --- + cc <- attr(dt, "country_code") + ry <- attr(dt, "reporting_year") + dt[, `:=`( + country_code = cc, + reporting_year = ry, + file = paste0(cc, "_", ry) + )] + + # --- distribution stats --- + ds <- attr(dt, "dist_stats") + if (length(ds)) { + assign_stat(dt, lev, counts, ds$mean, "mean") + assign_stat(dt, lev, counts, ds$median, "median") + } + + dt +} + + + +#' Assign a per-level statistic to a data.table column (by reference) +#' +#' @description +#' Replicates a statistic per reporting-level segment and assigns it to a new +#' column in `dt`, **in place**. `stat` can be a scalar (broadcast), a named +#' vector, or a named list (one value per level). +#' +#' @param dt A `data.table`. Modified by reference. +#' @param lev Character vector of reporting-level labels per segment +#' (e.g., `c("rural","urban","rural", ...)`). +#' @param counts Integer vector of segment lengths matching `lev` +#' (e.g., `c(100000, 100000, 100000, ...)`). +#' @param stat A numeric scalar, named vector, or named list with one value per +#' level (names must match `lev` values). +#' @param colname Name of the column to create/overwrite. +#' +#' @return Invisibly returns `dt` (modified by reference). +#' @examples +#' # assign_stat(dt, lev, counts, list(rural = 2.6, urban = 5.5), "mean") +#' @import data.table +#' @export +assign_stat <- function(dt, lev, counts, stat, colname) { + if (is.null(stat)) return(invisible(dt)) + n <- nrow(dt) + + v <- if (is.list(stat)) unlist(stat, use.names = TRUE) else stat + + # Single scalar: broadcast + if (length(v) == 1L && is.null(names(v))) { + dt[, (colname) := rep.int(unname(v), n)] + return(invisible(dt)) + } + + # Need names to map values to levels + if (is.null(names(v))) { + stop("`stat` has length > 1 but no names; cannot map to levels.") + } + + map_idx <- match(lev, names(v)) + if (anyNA(map_idx)) { + missing_levels <- unique(lev[is.na(map_idx)]) + stop( + sprintf("`stat` missing value(s) for level(s): %s", + paste(missing_levels, collapse = ", ")) + ) + } + + dt[, (colname) := rep.int(unname(v[map_idx]), counts)] + invisible(dt) +} diff --git a/R/fg_pip.R b/R/fg_pip.R index 58993206..3a075d67 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -52,7 +52,7 @@ fg_pip <- function(country, # Return empty dataframe if no metadata is found (i.e. all in cache) if (nrow(metadata) == 0) { - print("ZP: no metadata - i.e. nothing additional to estimate") + #print("ZP: no metadata - i.e. nothing additional to estimate") return(list(main_data = pipapi::empty_response_fg, data_in_cache = data_present_in_master)) } @@ -69,17 +69,8 @@ fg_pip <- function(country, lt <- lapply(lt, FUN = \(x) { - x <- x |> - pipdata::attr_to_column("reporting_level_rows") |> # only rep level???? - pipdata::attr_to_column("country_code") |> - pipdata::attr_to_column("reporting_year") |> - pipdata::attr_to_column("mean", dist_stats = TRUE) |> - pipdata::attr_to_column("median", dist_stats = TRUE) |> - fmutate(file = paste0(country_code, - "_", - reporting_year)) - - x + x |> + add_attributes_as_columns_vectorized() }) # rlang::env_poke(env = globalenv(), @@ -119,7 +110,6 @@ fg_pip <- function(country, #------------------------- metadata[, file := basename(path)] - print(metadata) # TO BE REMOVED, ONLY FOR TESTING!!! # rlang::env_poke(env = globalenv(), @@ -192,7 +182,8 @@ fg_pip <- function(country, # nm = "out1", # value = out) # Ensure that out does not have duplicates - out <- fg_remove_duplicates(out) + out <- fg_remove_duplicates(out, + use_new_lineup_version = lkup$use_new_lineup_version) # rlang::env_poke(env = globalenv(), # nm = "out2", # value = out) @@ -246,22 +237,28 @@ fg_remove_duplicates <- function(df, "survey_median_ppp", "survey_time", "survey_year", - "surveyid_year")) { - # not all cols need to be changes - cols <- setdiff(cols, - colnames(df)) - # Modify cache_id - # * Ensures that cache_id is unique for both extrapolated and interpolated surveys - # * Ensures that cache_id can be kept as an output of fg_pip() while still removing duplicated rows - # df$cache_id <- fg_standardize_cache_id(cache_id = df$cache_id, - # interpolation_id = df$data_interpolation_id, - # reporting_level = df$reporting_level) - # Set collapse vars to NA (by type) - df <- fg_assign_nas_values_to_dup_cols(df = df, - cols = cols) + "surveyid_year"), + use_new_lineup_version = FALSE) { + + if (isFALSE(use_new_lineup_version)) { + print("here") + # not all cols need to be changes + cols <- setdiff(cols, + colnames(df)) + # Modify cache_id + # * Ensures that cache_id is unique for both extrapolated and interpolated surveys + # * Ensures that cache_id can be kept as an output of fg_pip() while still removing duplicated rows + # df$cache_id <- fg_standardize_cache_id(cache_id = df$cache_id, + # interpolation_id = df$data_interpolation_id, + # reporting_level = df$reporting_level) + # Set collapse vars to NA (by type) + df <- fg_assign_nas_values_to_dup_cols(df = df, + cols = cols) + + # Ensure that out does not have duplicates + df <- unique(df) + } - # Ensure that out does not have duplicates - df <- unique(df) return(df) } From 1ca018f76179427aa9cb3c69294cf511565eb08b Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Thu, 21 Aug 2025 23:15:06 -0400 Subject: [PATCH 047/203] rm comments & print checks --- R/fg_pip.R | 56 +----------------------------------------------------- R/pip.R | 18 +++++++----------- 2 files changed, 8 insertions(+), 66 deletions(-) diff --git a/R/fg_pip.R b/R/fg_pip.R index 3a075d67..7e568ab5 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -73,10 +73,6 @@ fg_pip <- function(country, add_attributes_as_columns_vectorized() }) - # rlang::env_poke(env = globalenv(), - # nm = "pipdata_list", - # value = lt) - # ZP Add: do fgt estimations using `res <- lapply(lt, process_dt, povline = povline)` #------------------------- res <- lapply(lt, @@ -86,37 +82,11 @@ fg_pip <- function(country, res <- rbindlist(res, fill = TRUE) - # # TO BE REMOVED, ONLY FOR TESTING!!! - # rlang::env_poke(env = globalenv(), - # nm = "res_povest", - # value = res) - - # ZP Add: join to dist_stats - #------------------------- - # ly_dist <- lkup$lineup_dist_stats - # res <- res |> - # joyn::left_join(y = ly_dist, - # by = c("file", - # "reporting_level", - # "reporting_year"), - # relationship = "one-to-one", - # reportvar = FALSE, - # verbose = FALSE) - # rlang::env_poke(env = globalenv(), - # nm = "res_dist", - # value = res) - # ZP Add: join to metadata #------------------------- metadata[, file := basename(path)] - # TO BE REMOVED, ONLY FOR TESTING!!! - # rlang::env_poke(env = globalenv(), - # nm = "metadata_check2", - # value = metadata) - # metadata <- copy(metadata_check2) - #stop("---------------") # try metadata unique code tmp_metadata <- metadata # Handle multiple distribution types (for aggregated distributions) @@ -146,15 +116,6 @@ fg_pip <- function(country, file := paste0(country_code, "_", reporting_year)] - # rlang::env_poke(env = globalenv(), - # nm = "tmp_metadata_unique_check", - # value = tmp_metadata_unique) - # rlang::env_poke(env = globalenv(), - # nm = "tmp_metadata_unique_check2", - # value = tmp_metadata_unique) - # rlang::env_poke(env = globalenv(), - # nm = "res_final", - # value = res) out <- join(res, tmp_metadata_unique, @@ -167,9 +128,6 @@ fg_pip <- function(country, drop.dup.cols = TRUE, verbose = 0) - # rlang::env_poke(env = globalenv(), - # nm = "out_check", - # value = out) out[, `:=`( file = NULL )] @@ -178,15 +136,10 @@ fg_pip <- function(country, "povline", "poverty_line") - # rlang::env_poke(env = globalenv(), - # nm = "out1", - # value = out) # Ensure that out does not have duplicates out <- fg_remove_duplicates(out, use_new_lineup_version = lkup$use_new_lineup_version) - # rlang::env_poke(env = globalenv(), - # nm = "out2", - # value = out) + # Fix issue with rounding of poverty lines out[, @@ -199,13 +152,6 @@ fg_pip <- function(country, out[, max_year := NULL] } - # in_cache <- data_present_in_master |> - # fmutate(file = paste(country_code, reporting_year, sep = "_")) |> - # fselect(file) |> - # reg_elem() - # # rm rows in out that are in cache - # out <- out[!(file %in% in_cache)] - return(list(main_data = out, data_in_cache = data_present_in_master)) diff --git a/R/pip.R b/R/pip.R index bbd9e04f..0f24ebb7 100644 --- a/R/pip.R +++ b/R/pip.R @@ -250,31 +250,27 @@ pip_new_lineups <- function(country = "ALL", #--------------------------------------------- cached_data <- qDT(out$data_in_cache) main_data <- qDT(out$main_data) - print("A") if (nrow(main_data) > 0) { - print("B") - if (is.null(out$data_in_cache)) { - print("C") + out <- main_data } else { - print("D") + if (fill_gaps) { - print("E") - cached_data <- fg_remove_duplicates(cached_data) + + cached_data <- fg_remove_duplicates(cached_data, + use_new_lineup_version = lkup$use_new_lineup_version) } - print("F") + out <- main_data |> rowbind(cached_data) } - print("G") update_master_file(main_data, cache_file_path, fill_gaps) } else { - print("H") out <- cached_data } if (!data.table::is.data.table(out)) { @@ -293,7 +289,7 @@ pip_new_lineups <- function(country = "ALL", } } - # Add out of pipeline variablse + # Add out of pipeline variable #--------------------------------------------- add_vars_out_of_pipeline(out, fill_gaps = fill_gaps, From 7da94f68c0951eca3bf8127ddf1e484824529302 Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Thu, 21 Aug 2025 23:17:10 -0400 Subject: [PATCH 048/203] change checks to cli --- R/add_attributes_as_columns.R | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/R/add_attributes_as_columns.R b/R/add_attributes_as_columns.R index 2617f1d1..f1cd461f 100644 --- a/R/add_attributes_as_columns.R +++ b/R/add_attributes_as_columns.R @@ -64,6 +64,7 @@ #' @import data.table #' @export add_attributes_as_columns_vectorized <- function(dt) { + # Ensure proper internal state & spare column capacity (handles readRDS/load cases) setDT(dt) # harmless if already a data.table alloc.col(dt) # pre-allocate room for new columns @@ -74,10 +75,11 @@ add_attributes_as_columns_vectorized <- function(dt) { n <- nrow(dt) counts <- diff(c(0L, rows)) - if (sum(counts) != n) stop("Sum of 'rows' in attribute does not equal nrow(dt).") + if (sum(counts) != n) cli::cli_abort("Sum of 'rows' in attribute does not equal nrow(dt).") # reporting_level: vectorized, no loops, no findInterval edge cases - dt[, reporting_level := rep.int(lev, counts)] + dt[, + reporting_level := rep.int(lev, counts)] # constants cc <- attr(dt, "country_code") @@ -93,11 +95,17 @@ add_attributes_as_columns_vectorized <- function(dt) { if (length(ds)) { if (!is.null(ds$mean)) { m <- unlist(ds$mean, use.names = TRUE) - dt[, mean := rep.int(unname(m[match(lev, names(m))]), counts)] + dt[, + mean := rep.int(unname(m[match(lev, + names(m))]), + counts)] } if (!is.null(ds$median)) { md <- unlist(ds$median, use.names = TRUE) - dt[, median := rep.int(unname(md[match(lev, names(md))]), counts)] + dt[, + median := rep.int(unname(md[match(lev, + names(md))]), + counts)] } } @@ -139,19 +147,19 @@ add_attributes_as_columns_multi <- function(dt) { # --- Pull + validate segment metadata --- rl <- attr(dt, "reporting_level_rows") if (is.null(rl) || is.null(rl$reporting_level) || is.null(rl$rows)) { - stop("Missing 'reporting_level_rows' attribute with $reporting_level and $rows.") + cli::cli_abort("Missing 'reporting_level_rows' attribute with $reporting_level and $rows.") } lev <- as.character(rl$reporting_level) rows <- as.integer(rl$rows) n <- nrow(dt) - if (length(lev) != length(rows)) stop("'reporting_level' and 'rows' lengths differ.") - if (length(rows) == 0L) stop("'rows' is empty.") - if (any(diff(rows) < 0L)) stop("'rows' must be non-decreasing.") - if (rows[length(rows)] != n) stop("Last element of 'rows' must equal nrow(dt).") + if (length(lev) != length(rows)) cli::cli_abort("'reporting_level' and 'rows' lengths differ.") + if (length(rows) == 0L) cli::cli_abort("'rows' is empty.") + if (any(diff(rows) < 0L)) cli::cli_abort("'rows' must be non-decreasing.") + if (rows[length(rows)] != n) cli::cli_abort("Last element of 'rows' must equal nrow(dt).") counts <- diff(c(0L, rows)) - if (any(counts <= 0L)) stop("Computed non-positive segment length(s).") + if (any(counts <= 0L)) cli::cli_abort("Computed non-positive segment length(s).") # --- reporting_level: vectorized per-segment replication --- dt[, reporting_level := rep.int(lev, counts)] From bf68fccb26c400e2aa9e5625fd6c85a73f498afa Mon Sep 17 00:00:00 2001 From: zander-prinsloo Date: Thu, 21 Aug 2025 23:17:42 -0400 Subject: [PATCH 049/203] Increment version number to 1.3.19.9002 --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index a758dd4f..fa2ae7c1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: pipapi Title: API for the Poverty and Inequality Platform -Version: 1.3.19.9001 +Version: 1.3.19.9002 Authors@R: c(person(given = "Tony", family = "Fujs", From 4a25d5b54f194b74b2c8f8a8c0057246151eead4 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Fri, 22 Aug 2025 11:29:47 -0400 Subject: [PATCH 050/203] documnet and add pipdata functions into utils-pipdata --- NAMESPACE | 4 ++ R/fg_pip.R | 18 +++-- R/utils-pipdata.R | 49 +++++++++++++ man/add_attributes_as_columns_multi.Rd | 34 +++++++++ man/add_attributes_as_columns_vectorized.Rd | 80 +++++++++++++++++++++ man/assign_stat.Rd | 33 +++++++++ man/create_full_list.Rd | 23 ++++++ man/fg_pip.Rd | 6 -- man/fg_remove_duplicates.Rd | 3 +- man/load_list_refy.Rd | 20 ++++++ man/transform_input.Rd | 18 +++++ man/ui_cp_charts.Rd | 2 +- man/ui_cp_poverty_charts.Rd | 2 +- man/ui_hp_countries.Rd | 2 +- man/ui_pc_charts.Rd | 2 +- man/ui_pc_regional.Rd | 2 +- 16 files changed, 276 insertions(+), 22 deletions(-) create mode 100644 R/utils-pipdata.R create mode 100644 man/add_attributes_as_columns_multi.Rd create mode 100644 man/add_attributes_as_columns_vectorized.Rd create mode 100644 man/assign_stat.Rd create mode 100644 man/create_full_list.Rd create mode 100644 man/load_list_refy.Rd create mode 100644 man/transform_input.Rd diff --git a/NAMESPACE b/NAMESPACE index 5f8d1ce5..2ee13c9d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,8 +1,11 @@ # Generated by roxygen2: do not edit by hand +export(add_attributes_as_columns_multi) +export(add_attributes_as_columns_vectorized) export(add_dist_stats) export(add_dist_stats_old) export(assign_serializer) +export(assign_stat) export(change_grouped_stats_to_csv) export(citation_from_version) export(create_etag_header) @@ -53,6 +56,7 @@ export(validate_input_grouped_stats) export(version_dataframe) export(wld_lineup_year) import(collapse, except = fdroplevels) +import(data.table) import(data.table, except = fdroplevels) importFrom(glue,glue) importFrom(glue,glue_collapse) diff --git a/R/fg_pip.R b/R/fg_pip.R index 7e568ab5..e8c3edb2 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -63,14 +63,11 @@ fg_pip <- function(country, data_present_in_master = data_present_in_master) lt <- - pipdata::load_list_refy(input_list = full_list, - path = fs::path(data_dir, - "lineup_data")) - - lt <- lapply(lt, - FUN = \(x) { - x |> - add_attributes_as_columns_vectorized() + load_list_refy(input_list = full_list, + path = fs::path(data_dir, "lineup_data")) + + lt <- lapply(lt, \(x) { + add_attributes_as_columns_vectorized(x) }) # ZP Add: do fgt estimations using `res <- lapply(lt, process_dt, povline = povline)` @@ -297,8 +294,9 @@ create_full_list <- function(country, year, refy_lkup, data_present_in_master) { # Split years by country full_list <- dtemp[, .(year = list(year)), - by = country_code][ - , .(country_code, year = year) + by = country_code + ][, + .(country_code, year = year) ] # Convert to desired structure diff --git a/R/utils-pipdata.R b/R/utils-pipdata.R new file mode 100644 index 00000000..16b996ce --- /dev/null +++ b/R/utils-pipdata.R @@ -0,0 +1,49 @@ +#' load refy list +#' +#' @param input_list list. output from [create_full_list] +#' @param path character: directory path +#' +#' @return character vector +#' @keywords internal +load_list_refy <- \(input_list, path){ + input_list <- transform_input(input_list) + + dl <- lapply(input_list, FUN = function(x) { + qs::qread(file = fs::path(path, paste0(x$country_code, "_", + x$year), + ext = "qs")) + }) + + names(dl) <- vapply(input_list, \(x) { + paste0(x$country_code, x$year) + }, + FUN.VALUE = character(1)) + dl +} + + +#' transform input list +#' +#' @inheritParams load_list_refy +#' +#' @return formated list +#' @keywords internal +transform_input <- function(input_list){ + country_codes <- input_list$country_code + years <- input_list$year + if (!is.list(years)) { + years <- lapply(country_codes, function(x) years) + } + else { + if (length(years) != length(country_codes)) { + stop("The length of the 'year' list must match the length of the 'country_code' vector.") + } + } + output_list <- lapply(seq_along(country_codes), function(i) { + lapply(years[[i]], function(y) { + list(country_code = country_codes[i], year = y) + }) + }) + output_list <- unlist(output_list, recursive = FALSE) + return(output_list) +} diff --git a/man/add_attributes_as_columns_multi.Rd b/man/add_attributes_as_columns_multi.Rd new file mode 100644 index 00000000..7df2c2ed --- /dev/null +++ b/man/add_attributes_as_columns_multi.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/add_attributes_as_columns.R +\name{add_attributes_as_columns_multi} +\alias{add_attributes_as_columns_multi} +\title{Add attributes as columns for multi-segment reporting levels} +\usage{ +add_attributes_as_columns_multi(dt) +} +\arguments{ +\item{dt}{A \code{data.table} with attributes: +\itemize{ +\item \code{reporting_level_rows}: list with \code{reporting_level} (character) and +\code{rows} (integer cumulative row ends). +\item \code{country_code} (character). +\item \code{reporting_year} (integer/numeric). +\item \code{dist_stats} (list) optionally containing \code{mean} and/or \code{median}, each as +a named list/vector keyed by reporting level, or a single scalar. +}} +} +\value{ +The same \code{data.table}, modified by reference, with new columns: +\code{reporting_level}, \code{country_code}, \code{reporting_year}, \code{file}, and +optionally \code{mean}, \code{median}. +} +\description{ +Converts attributes on a survey \code{data.table} (e.g., \code{reporting_level_rows}, +\code{country_code}, \code{reporting_year}, and \code{dist_stats}) into columns, handling +\strong{multiple alternating segments} (e.g., CHN rural/urban/rural/urban) or +single-segment cases (e.g., ZAF). +} +\examples{ +# chn2000_cols <- add_attributes_as_columns_multi(chn2000) +# zaf2000_cols <- add_attributes_as_columns_multi(zaf2000) +} diff --git a/man/add_attributes_as_columns_vectorized.Rd b/man/add_attributes_as_columns_vectorized.Rd new file mode 100644 index 00000000..66c51a00 --- /dev/null +++ b/man/add_attributes_as_columns_vectorized.Rd @@ -0,0 +1,80 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/add_attributes_as_columns.R +\name{add_attributes_as_columns_vectorized} +\alias{add_attributes_as_columns_vectorized} +\title{Add attributes as columns (vectorized, in-place)} +\usage{ +add_attributes_as_columns_vectorized(dt) +} +\arguments{ +\item{dt}{A \code{data.table} carrying the attributes described above.} +} +\value{ +The same \code{data.table}, modified by reference, with added columns: +\code{reporting_level}, \code{country_code}, \code{reporting_year}, \code{file}, and (if +present) \code{mean}, \code{median}. +} +\description{ +Converts survey attributes on a \code{data.table}—including +\code{reporting_level_rows}, \code{country_code}, \code{reporting_year}, and \code{dist_stats}— +into columns using a \strong{loop-free, segment-replication} strategy. Designed +for very large tables and objects loaded via \code{readRDS()}/\code{load()}: +uses \code{setDT()} and \code{alloc.col()} to ensure in-place assignment. +} +\details{ +The function expects an attribute \code{reporting_level_rows}, a list with: +\itemize{ +\item \code{reporting_level}: character vector of the level label for each segment +(e.g., \code{c("rural","urban","rural", ...)}). +\item \code{rows}: integer vector of \strong{cumulative} row-ends (e.g., +\code{c(100000, 200000, 300000, ...)}). +} + +Segment lengths are computed as \code{diff(c(0L, rows))}, and \code{reporting_level} +is replicated with \code{rep.int(lev, counts)}. Constants \code{country_code}, +\code{reporting_year}, and \code{file} (\code{paste0(country_code, "_", reporting_year)}) +are added to all rows. If \code{dist_stats$mean} / \code{dist_stats$median} are +provided (as named vectors/lists keyed by level), they are mapped by level +name and replicated per segment. If a level is missing from the names, +\code{NA} values may result for that segment. + +This implementation avoids loops and \code{findInterval()} edge cases, and +modifies \code{dt} by reference. +} +\note{ +For objects loaded from disk (e.g., via \code{readRDS()}), \code{alloc.col(dt)} +ensures there is spare column capacity for by-reference assignment. +} +\section{Assumptions}{ + +\itemize{ +\item \code{length(reporting_level_rows$reporting_level) == length(reporting_level_rows$rows)}. +\item \code{rows} are cumulative and non-decreasing, and their segment lengths sum to \code{nrow(dt)}. +\item If \code{dist_stats$mean} / \code{dist_stats$median} have multiple values, their names +align with the level labels. +} +} + +\examples{ +\dontrun{ +library(data.table) +dt <- data.table(weight = 1:6, welfare = runif(6)) +attr(dt, "reporting_level_rows") <- list( + reporting_level = c("rural","urban","rural"), + rows = c(2L, 4L, 6L) +) +attr(dt, "country_code") <- "XXY" +attr(dt, "reporting_year") <- 2000L +attr(dt, "dist_stats") <- list( + mean = list(rural = 2.5, urban = 5.0), + median = list(rural = 2.0, urban = 4.5) +) + +add_attributes_as_columns_vectorized(dt) +head(dt) +} + +} +\seealso{ +\code{\link[=add_attributes_as_columns_multi]{add_attributes_as_columns_multi()}}, \code{\link[=assign_stat]{assign_stat()}} +} diff --git a/man/assign_stat.Rd b/man/assign_stat.Rd new file mode 100644 index 00000000..987b08f4 --- /dev/null +++ b/man/assign_stat.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/add_attributes_as_columns.R +\name{assign_stat} +\alias{assign_stat} +\title{Assign a per-level statistic to a data.table column (by reference)} +\usage{ +assign_stat(dt, lev, counts, stat, colname) +} +\arguments{ +\item{dt}{A \code{data.table}. Modified by reference.} + +\item{lev}{Character vector of reporting-level labels per segment +(e.g., \code{c("rural","urban","rural", ...)}).} + +\item{counts}{Integer vector of segment lengths matching \code{lev} +(e.g., \code{c(100000, 100000, 100000, ...)}).} + +\item{stat}{A numeric scalar, named vector, or named list with one value per +level (names must match \code{lev} values).} + +\item{colname}{Name of the column to create/overwrite.} +} +\value{ +Invisibly returns \code{dt} (modified by reference). +} +\description{ +Replicates a statistic per reporting-level segment and assigns it to a new +column in \code{dt}, \strong{in place}. \code{stat} can be a scalar (broadcast), a named +vector, or a named list (one value per level). +} +\examples{ +# assign_stat(dt, lev, counts, list(rural = 2.6, urban = 5.5), "mean") +} diff --git a/man/create_full_list.Rd b/man/create_full_list.Rd new file mode 100644 index 00000000..249ae8b6 --- /dev/null +++ b/man/create_full_list.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fg_pip.R +\name{create_full_list} +\alias{create_full_list} +\title{Create full list for fg data load, not including country-years in cache} +\usage{ +create_full_list(country, year, refy_lkup, data_present_in_master) +} +\arguments{ +\item{country}{Country selected in \link{fg_pip} function} + +\item{year}{Year/s selected in \link{fg_pip} function} + +\item{refy_lkup}{reference year lkup table with full lineups and new method} + +\item{data_present_in_master}{cache data} +} +\value{ +list +} +\description{ +Create full list for fg data load, not including country-years in cache +} diff --git a/man/fg_pip.Rd b/man/fg_pip.Rd index f83e836f..9012b01b 100644 --- a/man/fg_pip.Rd +++ b/man/fg_pip.Rd @@ -39,10 +39,4 @@ data.frame \description{ Compute the main PIP poverty and inequality statistics for imputed years. } -\keyword{#} -\keyword{Add:} -\keyword{ZP} -\keyword{data} \keyword{internal} -\keyword{load} -\keyword{refy} diff --git a/man/fg_remove_duplicates.Rd b/man/fg_remove_duplicates.Rd index b9d9c87b..9b6b79dc 100644 --- a/man/fg_remove_duplicates.Rd +++ b/man/fg_remove_duplicates.Rd @@ -9,7 +9,8 @@ fg_remove_duplicates( cols = c("comparable_spell", "cpi", "display_cp", "gd_type", "path", "predicted_mean_ppp", "survey_acronym", "survey_comparability", "survey_coverage", "survey_id", "survey_mean_lcu", "survey_mean_ppp", "survey_median_lcu", - "survey_median_ppp", "survey_time", "survey_year", "surveyid_year") + "survey_median_ppp", "survey_time", "survey_year", "surveyid_year"), + use_new_lineup_version = FALSE ) } \arguments{ diff --git a/man/load_list_refy.Rd b/man/load_list_refy.Rd new file mode 100644 index 00000000..59a37d7c --- /dev/null +++ b/man/load_list_refy.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-pipdata.R +\name{load_list_refy} +\alias{load_list_refy} +\title{load refy list} +\usage{ +load_list_refy(input_list, path) +} +\arguments{ +\item{input_list}{list. output from \link{create_full_list}} + +\item{path}{character: directory path} +} +\value{ +character vector +} +\description{ +load refy list +} +\keyword{internal} diff --git a/man/transform_input.Rd b/man/transform_input.Rd new file mode 100644 index 00000000..e460cf0f --- /dev/null +++ b/man/transform_input.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-pipdata.R +\name{transform_input} +\alias{transform_input} +\title{transform input list} +\usage{ +transform_input(input_list) +} +\arguments{ +\item{input_list}{list. output from \link{create_full_list}} +} +\value{ +formated list +} +\description{ +transform input list +} +\keyword{internal} diff --git a/man/ui_cp_charts.Rd b/man/ui_cp_charts.Rd index 03ea2a00..010a71b1 100644 --- a/man/ui_cp_charts.Rd +++ b/man/ui_cp_charts.Rd @@ -7,7 +7,7 @@ ui_cp_charts( country = "AGO", povline = 1.9, - pop_units = 1000000, + pop_units = 1e+06, lkup, lkup_hash = lkup$cache_data_id$hash_ui_cp ) diff --git a/man/ui_cp_poverty_charts.Rd b/man/ui_cp_poverty_charts.Rd index dcfbb943..e2356d1e 100644 --- a/man/ui_cp_poverty_charts.Rd +++ b/man/ui_cp_poverty_charts.Rd @@ -4,7 +4,7 @@ \alias{ui_cp_poverty_charts} \title{CP Poverty Charts} \usage{ -ui_cp_poverty_charts(country, povline, pop_units = 1000000, lkup) +ui_cp_poverty_charts(country, povline, pop_units = 1e+06, lkup) } \arguments{ \item{country}{character: Country ISO 3 codes} diff --git a/man/ui_hp_countries.Rd b/man/ui_hp_countries.Rd index 760f458e..2ffcf443 100644 --- a/man/ui_hp_countries.Rd +++ b/man/ui_hp_countries.Rd @@ -7,7 +7,7 @@ ui_hp_countries( country = c("IDN", "CIV"), povline = 1.9, - pop_units = 1000000, + pop_units = 1e+06, lkup ) } diff --git a/man/ui_pc_charts.Rd b/man/ui_pc_charts.Rd index 438dfca9..49715b48 100644 --- a/man/ui_pc_charts.Rd +++ b/man/ui_pc_charts.Rd @@ -12,7 +12,7 @@ ui_pc_charts( group_by = "none", welfare_type = c("all", "consumption", "income"), reporting_level = c("all", "national", "rural", "urban"), - pop_units = 1000000, + pop_units = 1e+06, lkup ) } diff --git a/man/ui_pc_regional.Rd b/man/ui_pc_regional.Rd index 3cbde341..083e418b 100644 --- a/man/ui_pc_regional.Rd +++ b/man/ui_pc_regional.Rd @@ -8,7 +8,7 @@ ui_pc_regional( country = "ALL", year = "ALL", povline = 1.9, - pop_units = 1000000, + pop_units = 1e+06, lkup ) } From 3521a1415e8fed38e8ca3da7f935442211d2d2df Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Fri, 22 Aug 2025 12:28:59 -0400 Subject: [PATCH 051/203] send pip_new_lineups to new file --- R/pip.R | 302 ----------------------------------------- R/pip_new_lineups.R | 302 +++++++++++++++++++++++++++++++++++++++++ man/pip_new_lineups.Rd | 2 +- 3 files changed, 303 insertions(+), 303 deletions(-) create mode 100644 R/pip_new_lineups.R diff --git a/R/pip.R b/R/pip.R index 0f24ebb7..0689ef26 100644 --- a/R/pip.R +++ b/R/pip.R @@ -104,305 +104,3 @@ pip <- function(country = "ALL", - -#' Compute PIP statistics -#' -#' Compute the main PIP poverty and inequality statistics. -#' -#' @param country character: Country ISO 3 codes -#' @param year integer: Reporting year -#' @param povline numeric: Poverty line -#' @param popshare numeric: Proportion of the population living below the -#' poverty line -#' @param fill_gaps logical: If set to TRUE, will interpolate / extrapolate -#' values for missing years -#' @param group_by character: Will return aggregated values for predefined -#' sub-groups -#' @param welfare_type character: Welfare type -#' @param reporting_level character: Geographical reporting level -#' @param ppp numeric: Custom Purchase Power Parity value -#' @param lkup list: A list of lkup tables -#' @param censor logical: Triggers censoring of country/year statistics -#' @param lkup_hash character: hash of pip -#' @param additional_ind logical: If TRUE add new set of indicators. Default if -#' FALSE -#' -#' @return data.table -#' @examples -#' \dontrun{ -#' # Create lkups -#' lkups <- create_lkups("") -#' -#' # A single country and year -#' pip_new_lineups(country = "AGO", -#' year = 2000, -#' povline = 1.9, -#' lkup = lkups) -#' -#' # All years for a single country -#' pip_new_lineups(country = "AGO", -#' year = "all", -#' povline = 1.9, -#' lkup = lkups) -#' -#' # Fill gaps -#' pip_new_lineups(country = "AGO", -#' year = "all", -#' povline = 1.9, -#' fill_gaps = TRUE, -#' lkup = lkups) -#' -#' # Group by regions -#' pip_new_lineups(country = "all", -#' year = "all", -#' povline = 1.9, -#' group_by = "wb", -#' lkup = lkups) -#' } -#' @export -pip_new_lineups <- function(country = "ALL", - year = "ALL", - povline = 1.9, - popshare = NULL, - fill_gaps = FALSE, - group_by = c("none", "wb"), - welfare_type = c("all", "consumption", "income"), - reporting_level = c("all", "national", "rural", "urban"), - ppp = NULL, - lkup, - censor = TRUE, - lkup_hash = lkup$cache_data_id$hash_pip, - additional_ind = FALSE) { - - - # set up ------------- - welfare_type <- match.arg(welfare_type) - reporting_level <- match.arg(reporting_level) - group_by <- match.arg(group_by) - povline <- round(povline, digits = 3) - - - - # TEMPORARY UNTIL SELECTION MECHANISM IS BEING IMPROVED - country <- toupper(country) - if (is.character(year)) { - year <- toupper(year) - } - - # If svy_lkup is not part of lkup throw an error. - if (!all(c('svy_lkup') %in% names(lkup))) - stop("You are probably passing more than one dataset as lkup argument. - Try passing a single one by subsetting it lkup <- lkups$versions_paths$dataset_name_PROD") - - - # **** TO BE REMOVED **** REMOVAL STARTS HERE - # Once `pip-grp` has been integrated in ingestion pipeline - # Forces fill_gaps to TRUE when using group_by option - if (group_by != "none") { - fill_gaps <- TRUE - message("Info: argument group_by in pip() is deprecated; please use pip_grp() instead.") - } - # **** TO BE REMOVED **** REMOVAL ENDS HERE - - # Countries vector ------------ - lcv <- # List with countries vectors - create_countries_vctr( - country = country, - year = year, - valid_years = lkup$valid_years, - aux_files = lkup$aux_files - ) - # lcv$est_ctrs has all the country_code that we are interested in - - cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb") - if (!file.exists(cache_file_path)) { - # Create an empty duckdb file - create_duckdb_file(cache_file_path) - } - # mains estimates --------------- - if (fill_gaps) { - ## lineup years----------------- - out <- fg_pip( - country = lcv$est_ctrs, - year = year, - povline = povline, - popshare = popshare, - welfare_type = welfare_type, - reporting_level = reporting_level, - ppp = ppp, - lkup = lkup - ) - } else { - ## survey years ------------------ - out <- rg_pip( - country = lcv$est_ctrs, - year = year, - povline = povline, - popshare = popshare, - welfare_type = welfare_type, - reporting_level = reporting_level, - ppp = ppp, - lkup = lkup - ) - } - - # Cache new data - #--------------------------------------------- - cached_data <- qDT(out$data_in_cache) - main_data <- qDT(out$main_data) - - if (nrow(main_data) > 0) { - - if (is.null(out$data_in_cache)) { - - out <- main_data - } else { - - if (fill_gaps) { - - cached_data <- fg_remove_duplicates(cached_data, - use_new_lineup_version = lkup$use_new_lineup_version) - } - - out <- main_data |> - rowbind(cached_data) - } - - update_master_file(main_data, cache_file_path, fill_gaps) - - } else { - out <- cached_data - } - if (!data.table::is.data.table(out)) { - setDT(out) - } - # Early return for empty table--------------- - if (nrow(out) == 0) return(pipapi::empty_response) - - # aggregate distributions ------------------ - if (reporting_level %in% c("national", "all")) { - out <- add_agg_stats( - df = out, - return_cols = lkup$return_cols$ag_average_poverty_stats) - if (reporting_level == "national") { - out <- out[reporting_level == "national"] - } - } - - # Add out of pipeline variable - #--------------------------------------------- - add_vars_out_of_pipeline(out, - fill_gaps = fill_gaps, - lkup = lkup) - - # **** TO BE REMOVED **** REMOVAL STARTS HERE - # Once `pip-grp` has been integrated in ingestion pipeline - # Handles grouped aggregations - if (group_by != "none") { - # Handle potential (insignificant) difference in poverty_line values that - # may mess-up the grouping - out$poverty_line <- povline - - out <- pip_aggregate_by( - df = out, - group_lkup = lkup[["pop_region"]], - return_cols = lkup$return_cols$pip_grp) - - # Censor regional values - if (censor) { - out <- censor_rows(out, - lkup[["censored"]], - type = "regions") - } - - out <- out[, c("region_name", - "region_code", - "reporting_year", - "reporting_pop", - "poverty_line", - "headcount", - "poverty_gap", - "poverty_severity", - "watts", - "mean", - "pop_in_poverty")] - - return(out) - } - # **** TO BE REMOVED **** REMOVAL ENDS HERE - - - # pre-computed distributional stats --------------- - crr_names <- names(out) # current variables - names2keep <- lkup$return_cols$pip$cols # all variables - - out <- add_dist_stats( - df = out, - lkup = lkup, - fill_gaps = fill_gaps) - - # Add aggregate medians ---------------- - out <- add_agg_medians( - df = out, - fill_gaps = fill_gaps, - data_dir = lkup$data_root - ) - - # format ---------------- - - - if (fill_gaps) { - # ZP temp NA lineups: - #--------------------- - # ## Inequality indicators to NA for lineup years ---- - dist_vars <- names2keep[!(names2keep %in% crr_names)] - out[, - (dist_vars) := NA_real_] - - ## estimate_var ----- - out <- estimate_type_ctr_lnp(out, lkup) - - } else { - out[, estimate_type := NA_character_] - } - - ## Handle survey coverage ------------ - if (reporting_level != "all") { - keep <- out$reporting_level == reporting_level - out <- out[keep, ] - } - - # Censor country values - if (censor) { - out <- censor_rows(out, lkup[["censored"]], type = "countries") - } - - - # Select columns - if (additional_ind) { - get_additional_indicators(out) - added_names <- attr(out, "new_indicators_names") - names2keep <- c(names2keep, added_names) - - } - # Keep relevant variables - out <- out[, .SD, .SDcols = names2keep] - - - # make sure we always report the same precision in all numeric variables - doub_vars <- - names(out)[unlist(lapply(out, is.double))] |> - data.table::copy() - - out[, (doub_vars) := lapply(.SD, round, digits = 12), - .SDcols = doub_vars] - - # Order rows by country code and reporting year - data.table::setorder(out, country_code, reporting_year, reporting_level, welfare_type) - #} - - # Make sure no duplicate remains - out <- out |> collapse::funique() - # return ------------- - return(out) -} diff --git a/R/pip_new_lineups.R b/R/pip_new_lineups.R new file mode 100644 index 00000000..6b0c6139 --- /dev/null +++ b/R/pip_new_lineups.R @@ -0,0 +1,302 @@ + +#' Compute PIP statistics +#' +#' Compute the main PIP poverty and inequality statistics. +#' +#' @param country character: Country ISO 3 codes +#' @param year integer: Reporting year +#' @param povline numeric: Poverty line +#' @param popshare numeric: Proportion of the population living below the +#' poverty line +#' @param fill_gaps logical: If set to TRUE, will interpolate / extrapolate +#' values for missing years +#' @param group_by character: Will return aggregated values for predefined +#' sub-groups +#' @param welfare_type character: Welfare type +#' @param reporting_level character: Geographical reporting level +#' @param ppp numeric: Custom Purchase Power Parity value +#' @param lkup list: A list of lkup tables +#' @param censor logical: Triggers censoring of country/year statistics +#' @param lkup_hash character: hash of pip +#' @param additional_ind logical: If TRUE add new set of indicators. Default if +#' FALSE +#' +#' @return data.table +#' @examples +#' \dontrun{ +#' # Create lkups +#' lkups <- create_lkups("") +#' +#' # A single country and year +#' pip_new_lineups(country = "AGO", +#' year = 2000, +#' povline = 1.9, +#' lkup = lkups) +#' +#' # All years for a single country +#' pip_new_lineups(country = "AGO", +#' year = "all", +#' povline = 1.9, +#' lkup = lkups) +#' +#' # Fill gaps +#' pip_new_lineups(country = "AGO", +#' year = "all", +#' povline = 1.9, +#' fill_gaps = TRUE, +#' lkup = lkups) +#' +#' # Group by regions +#' pip_new_lineups(country = "all", +#' year = "all", +#' povline = 1.9, +#' group_by = "wb", +#' lkup = lkups) +#' } +#' @export +pip_new_lineups <- function(country = "ALL", + year = "ALL", + povline = 1.9, + popshare = NULL, + fill_gaps = FALSE, + group_by = c("none", "wb"), + welfare_type = c("all", "consumption", "income"), + reporting_level = c("all", "national", "rural", "urban"), + ppp = NULL, + lkup, + censor = TRUE, + lkup_hash = lkup$cache_data_id$hash_pip, + additional_ind = FALSE) { + + + # set up ------------- + welfare_type <- match.arg(welfare_type) + reporting_level <- match.arg(reporting_level) + group_by <- match.arg(group_by) + povline <- round(povline, digits = 3) + + + + # TEMPORARY UNTIL SELECTION MECHANISM IS BEING IMPROVED + country <- toupper(country) + if (is.character(year)) { + year <- toupper(year) + } + + # If svy_lkup is not part of lkup throw an error. + if (!all(c('svy_lkup') %in% names(lkup))) + stop("You are probably passing more than one dataset as lkup argument. + Try passing a single one by subsetting it lkup <- lkups$versions_paths$dataset_name_PROD") + + + # **** TO BE REMOVED **** REMOVAL STARTS HERE + # Once `pip-grp` has been integrated in ingestion pipeline + # Forces fill_gaps to TRUE when using group_by option + if (group_by != "none") { + fill_gaps <- TRUE + message("Info: argument group_by in pip() is deprecated; please use pip_grp() instead.") + } + # **** TO BE REMOVED **** REMOVAL ENDS HERE + + # Countries vector ------------ + lcv <- # List with countries vectors + create_countries_vctr( + country = country, + year = year, + valid_years = lkup$valid_years, + aux_files = lkup$aux_files + ) + # lcv$est_ctrs has all the country_code that we are interested in + + cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb") + if (!file.exists(cache_file_path)) { + # Create an empty duckdb file + create_duckdb_file(cache_file_path) + } + # mains estimates --------------- + if (fill_gaps) { + ## lineup years----------------- + out <- fg_pip( + country = lcv$est_ctrs, + year = year, + povline = povline, + popshare = popshare, + welfare_type = welfare_type, + reporting_level = reporting_level, + ppp = ppp, + lkup = lkup + ) + } else { + ## survey years ------------------ + out <- rg_pip( + country = lcv$est_ctrs, + year = year, + povline = povline, + popshare = popshare, + welfare_type = welfare_type, + reporting_level = reporting_level, + ppp = ppp, + lkup = lkup + ) + } + + # Cache new data + #--------------------------------------------- + cached_data <- qDT(out$data_in_cache) + main_data <- qDT(out$main_data) + + if (nrow(main_data) > 0) { + + if (is.null(out$data_in_cache)) { + + out <- main_data + } else { + + if (fill_gaps) { + + cached_data <- fg_remove_duplicates(cached_data, + use_new_lineup_version = lkup$use_new_lineup_version) + } + + out <- main_data |> + rowbind(cached_data) + } + + update_master_file(main_data, cache_file_path, fill_gaps) + + } else { + out <- cached_data + } + if (!is.data.table(out)) { + setDT(out) + } + # Early return for empty table--------------- + if (nrow(out) == 0) return(pipapi::empty_response) + + # aggregate distributions ------------------ + if (reporting_level %in% c("national", "all")) { + out <- add_agg_stats( + df = out, + return_cols = lkup$return_cols$ag_average_poverty_stats) + if (reporting_level == "national") { + out <- out[reporting_level == "national"] + } + } + + # Add out of pipeline variable + #--------------------------------------------- + add_vars_out_of_pipeline(out, + fill_gaps = fill_gaps, + lkup = lkup) + + # **** TO BE REMOVED **** REMOVAL STARTS HERE + # Once `pip-grp` has been integrated in ingestion pipeline + # Handles grouped aggregations + if (group_by != "none") { + # Handle potential (insignificant) difference in poverty_line values that + # may mess-up the grouping + out$poverty_line <- povline + + out <- pip_aggregate_by( + df = out, + group_lkup = lkup[["pop_region"]], + return_cols = lkup$return_cols$pip_grp) + + # Censor regional values + if (censor) { + out <- censor_rows(out, + lkup[["censored"]], + type = "regions") + } + + out <- out[, c("region_name", + "region_code", + "reporting_year", + "reporting_pop", + "poverty_line", + "headcount", + "poverty_gap", + "poverty_severity", + "watts", + "mean", + "pop_in_poverty")] + + return(out) + } + # **** TO BE REMOVED **** REMOVAL ENDS HERE + + + # pre-computed distributional stats --------------- + crr_names <- names(out) # current variables + names2keep <- lkup$return_cols$pip$cols # all variables + + out <- add_dist_stats( + df = out, + lkup = lkup, + fill_gaps = fill_gaps) + + # Add aggregate medians ---------------- + out <- add_agg_medians( + df = out, + fill_gaps = fill_gaps, + data_dir = lkup$data_root + ) + + # format ---------------- + + + if (fill_gaps) { + # ZP temp NA lineups: + #--------------------- + # ## Inequality indicators to NA for lineup years ---- + dist_vars <- names2keep[!(names2keep %in% crr_names)] + out[, + (dist_vars) := NA_real_] + + ## estimate_var ----- + out <- estimate_type_ctr_lnp(out, lkup) + + } else { + out[, estimate_type := NA_character_] + } + + ## Handle survey coverage ------------ + if (reporting_level != "all") { + keep <- out$reporting_level == reporting_level + out <- out[keep, ] + } + + # Censor country values + if (censor) { + out <- censor_rows(out, lkup[["censored"]], type = "countries") + } + + + # Select columns + if (additional_ind) { + get_additional_indicators(out) + added_names <- attr(out, "new_indicators_names") + names2keep <- c(names2keep, added_names) + + } + # Keep relevant variables + out <- out[, .SD, .SDcols = names2keep] + + + # make sure we always report the same precision in all numeric variables + doub_vars <- + names(out)[unlist(lapply(out, is.double))] |> + data.table::copy() + + out[, (doub_vars) := lapply(.SD, round, digits = 12), + .SDcols = doub_vars] + + # Order rows by country code and reporting year + data.table::setorder(out, country_code, reporting_year, reporting_level, welfare_type) + #} + + # Make sure no duplicate remains + out <- out |> collapse::funique() + # return ------------- + return(out) +} diff --git a/man/pip_new_lineups.Rd b/man/pip_new_lineups.Rd index eded364f..db43427e 100644 --- a/man/pip_new_lineups.Rd +++ b/man/pip_new_lineups.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pip.R +% Please edit documentation in R/pip_new_lineups.R \name{pip_new_lineups} \alias{pip_new_lineups} \title{Compute PIP statistics} From d6105fecf59393fb07cdaa1e338206de4351e621 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Fri, 22 Aug 2025 13:38:30 -0400 Subject: [PATCH 052/203] organize functions --- R/compute_fgt_new.R | 133 ++++++++++++++++++++++++++++++++++++++ R/compute_fgt_old.R | 112 ++++++++++++++++++++++++++++++++ R/pip_new_lineups.R | 13 +++- R/rg_pip.R | 133 -------------------------------------- R/rg_pip_old.R | 112 -------------------------------- man/compute_fgt_dt.Rd | 2 +- man/compute_fgt_dt_old.Rd | 2 +- man/load_data_list.Rd | 2 +- man/load_data_list_old.Rd | 2 +- 9 files changed, 261 insertions(+), 250 deletions(-) create mode 100644 R/compute_fgt_new.R create mode 100644 R/compute_fgt_old.R diff --git a/R/compute_fgt_new.R b/R/compute_fgt_new.R new file mode 100644 index 00000000..f4651f21 --- /dev/null +++ b/R/compute_fgt_new.R @@ -0,0 +1,133 @@ +# Efficient FGT calculation for a data.table and vector of poverty lines +#' Title +#' +#' @param dt data frame with `welfare` and `weight` columns +#' @param welfare character: welfare variable name +#' @param weight character: weight variable name +#' @param povlines double: vector with poveryt lines +#' +#' @return data.table with estimates poverty estimates +#' @keywords internal +compute_fgt_dt <- function(dt, welfare, weight, povlines, mean_and_med = FALSE) { + w <- dt[[welfare]] + wt <- dt[[weight]] + n <- length(w) + m <- length(povlines) + + # Pre-allocate result matrix + res <- matrix(NA_real_, nrow = m, ncol = 3) + colnames(res) <- c("FGT0", "FGT1", "FGT2") + watts_vec <- numeric(m) + + # Precompute log(w) for efficiency + logw <- rep(NA_real_, n) + pos <- w > 0 + logw[pos] <- log(w[pos]) + + for (i in seq_along(povlines)) { + pov <- povlines[i] + poor <- w < pov + rel_dist <- 1 - (w / pov) + rel_dist[!poor] <- 0 + res[i, 1] <- fmean(poor, w = wt) # FGT0 + res[i, 2] <- fmean(rel_dist, w = wt) # FGT1 + res[i, 3] <- fmean(rel_dist^2, w = wt) # FGT2 + + # Optimized Watts index calculation + keep <- poor & pos + if (any(keep, na.rm = TRUE)) { + watts_vec[i] <- (fsum((log(pov) - logw[keep]) * wt[keep])) / fsum(wt) + } else { + watts_vec[i] <- 0 + } + } + + if (mean_and_med) { + mn <- funique(dt$mean) + med <- funique(dt$median) + cy <- funique(dt$coutnry_code) + ry <- funique(dt$reporting_year) + out <- data.table( + povline = povlines, + headcount = res[, 1], + poverty_gap = res[, 2], + poverty_severity = res[, 3], + watts = watts_vec, + mean = mn, + median = med, + country_code = cy, + reporting_year = ry) + } else { + out <- data.table( + povline = povlines, + headcount = res[, 1], + poverty_gap = res[, 2], + poverty_severity = res[, 3], + watts = watts_vec) + } + + out + +} + + +process_dt <- function(dt, povline, mean_and_med = FALSE) { + dt[, compute_fgt_dt(.SD, "welfare", "weight", povline, mean_and_med), + by = .(file, reporting_level)] +} + +#' load survey year files and store them in a list +#' +#' @param metadata data frame from `subset_lkup()` +#' +#' @return list with survey years data +#' @keywords internal +load_data_list <- \(metadata) { + + # unique values + mdout <- metadata[, lapply(.SD, list), by = path] + upaths <- mdout$path + urep_level <- mdout$reporting_level + uppp <- mdout$ppp + ucpi <- mdout$cpi + + seq_along(upaths) |> + lapply(\(f) { + path <- upaths[f] + rep_level <- urep_level[f][[1]] + ppp <- uppp[f][[1]] + cpi <- ucpi[f][[1]] + + # Build a data.table to merge cpi and ppp + fdt <- data.table(reporting_level = as.character(rep_level), + ppp = ppp, + cpi = cpi) + + # load data and format + dt <- fst::read_fst(path, as.data.table = TRUE) + + if (length(rep_level) == 1) { + if (rep_level == "national") dt[, area := "national"] + } + setnames(dt, "area", "reporting_level") + dt[, + `:=`( + file = basename(path), + reporting_level = as.character(reporting_level) + ) + ] + + dt <- join(dt, fdt, + on = "reporting_level", + validate = "m:1", + how = "left", + verbose = 0) + + dt[, welfare := welfare/(cpi * ppp) + ][, + c("cpi", "ppp") := NULL] + + }) + +} + diff --git a/R/compute_fgt_old.R b/R/compute_fgt_old.R new file mode 100644 index 00000000..cf999b08 --- /dev/null +++ b/R/compute_fgt_old.R @@ -0,0 +1,112 @@ +# OLD: Efficient FGT calculation for a data.table and vector of poverty lines +#' Title +#' +#' @param dt data frame with `welfare` and `weight` columns +#' @param welfare character: welfare variable name +#' @param weight character: weight variable name +#' @param povlines double: vector with poveryt lines +#' +#' @return data.table with estimates poverty estimates +#' @keywords internal +compute_fgt_dt_old <- function(dt, welfare, weight, povlines) { + w <- dt[[welfare]] + wt <- dt[[weight]] + n <- length(w) + m <- length(povlines) + + # Pre-allocate result matrix + res <- matrix(NA_real_, nrow = m, ncol = 3) + colnames(res) <- c("FGT0", "FGT1", "FGT2") + watts_vec <- numeric(m) + + # Precompute log(w) for efficiency + logw <- rep(NA_real_, n) + pos <- w > 0 + logw[pos] <- log(w[pos]) + + for (i in seq_along(povlines)) { + pov <- povlines[i] + poor <- w < pov + rel_dist <- 1 - (w / pov) + rel_dist[!poor] <- 0 + res[i, 1] <- fmean(poor, w = wt) # FGT0 + res[i, 2] <- fmean(rel_dist, w = wt) # FGT1 + res[i, 3] <- fmean(rel_dist^2, w = wt) # FGT2 + + # Optimized Watts index calculation + keep <- poor & pos + if (any(keep, na.rm = TRUE)) { + watts_vec[i] <- (fsum((log(pov) - logw[keep]) * wt[keep])) / fsum(wt) + } else { + watts_vec[i] <- 0 + } + } + data.table( + povline = povlines, + headcount = res[, 1], + poverty_gap = res[, 2], + poverty_severity = res[, 3], + watts = watts_vec + ) +} + + +process_dt_old <- function(dt, povline) { + dt[, compute_fgt_dt_old(.SD, "welfare", "weight", povline), + by = .(file, reporting_level)] +} + +#' OLD: load survey year files and store them in a list +#' +#' @param metadata data frame from `subset_lkup()` +#' +#' @return list with survey years data +#' @keywords internal +load_data_list_old <- \(metadata) { + + # unique values + mdout <- metadata[, lapply(.SD, list), by = path] + upaths <- mdout$path + urep_level <- mdout$reporting_level + uppp <- mdout$ppp + ucpi <- mdout$cpi + + seq_along(upaths) |> + lapply(\(f) { + path <- upaths[f] + rep_level <- urep_level[f][[1]] + ppp <- uppp[f][[1]] + cpi <- ucpi[f][[1]] + + # Build a data.table to merge cpi and ppp + fdt <- data.table(reporting_level = as.character(rep_level), + ppp = ppp, + cpi = cpi) + + # load data and format + dt <- fst::read_fst(path, as.data.table = TRUE) + + if (length(rep_level) == 1) { + if (rep_level == "national") dt[, area := "national"] + } + setnames(dt, "area", "reporting_level") + dt[, + `:=`( + file = basename(path), + reporting_level = as.character(reporting_level) + ) + ] + + dt <- join(dt, fdt, + on = "reporting_level", + validate = "m:1", + how = "left", + verbose = 0) + + dt[, welfare := welfare/(cpi * ppp) + ][, + c("cpi", "ppp") := NULL] + + }) + +} diff --git a/R/pip_new_lineups.R b/R/pip_new_lineups.R index 6b0c6139..aea166e5 100644 --- a/R/pip_new_lineups.R +++ b/R/pip_new_lineups.R @@ -142,7 +142,18 @@ pip_new_lineups <- function(country = "ALL", # Cache new data #--------------------------------------------- - cached_data <- qDT(out$data_in_cache) + cached_data <- if (is.null(out$data_in_cache)) { + NULL + } else if (is.data.frame(out$data_in_cache)) { + if (fnrow(out$data_in_cache) == 0) { + NULL + } else { + qDT(out$data_in_cache) + } + } else { + cli::cli_abort("{.code out$data_in_cache} must be NULL or data.frame not {.field {class(out$data_in_cache)}}") + } + main_data <- qDT(out$main_data) if (nrow(main_data) > 0) { diff --git a/R/rg_pip.R b/R/rg_pip.R index 8b555753..769878ae 100644 --- a/R/rg_pip.R +++ b/R/rg_pip.R @@ -85,136 +85,3 @@ rg_pip <- function(country, -# Efficient FGT calculation for a data.table and vector of poverty lines -#' Title -#' -#' @param dt data frame with `welfare` and `weight` columns -#' @param welfare character: welfare variable name -#' @param weight character: weight variable name -#' @param povlines double: vector with poveryt lines -#' -#' @return data.table with estimates poverty estimates -#' @keywords internal -compute_fgt_dt <- function(dt, welfare, weight, povlines, mean_and_med = FALSE) { - w <- dt[[welfare]] - wt <- dt[[weight]] - n <- length(w) - m <- length(povlines) - - # Pre-allocate result matrix - res <- matrix(NA_real_, nrow = m, ncol = 3) - colnames(res) <- c("FGT0", "FGT1", "FGT2") - watts_vec <- numeric(m) - - # Precompute log(w) for efficiency - logw <- rep(NA_real_, n) - pos <- w > 0 - logw[pos] <- log(w[pos]) - - for (i in seq_along(povlines)) { - pov <- povlines[i] - poor <- w < pov - rel_dist <- 1 - (w / pov) - rel_dist[!poor] <- 0 - res[i, 1] <- fmean(poor, w = wt) # FGT0 - res[i, 2] <- fmean(rel_dist, w = wt) # FGT1 - res[i, 3] <- fmean(rel_dist^2, w = wt) # FGT2 - - # Optimized Watts index calculation - keep <- poor & pos - if (any(keep, na.rm = TRUE)) { - watts_vec[i] <- (fsum((log(pov) - logw[keep]) * wt[keep])) / fsum(wt) - } else { - watts_vec[i] <- 0 - } - } - - if (mean_and_med) { - mn <- funique(dt$mean) - med <- funique(dt$median) - cy <- funique(dt$coutnry_code) - ry <- funique(dt$reporting_year) - out <- data.table( - povline = povlines, - headcount = res[, 1], - poverty_gap = res[, 2], - poverty_severity = res[, 3], - watts = watts_vec, - mean = mn, - median = med, - country_code = cy, - reporting_year = ry) - } else { - out <- data.table( - povline = povlines, - headcount = res[, 1], - poverty_gap = res[, 2], - poverty_severity = res[, 3], - watts = watts_vec) - } - - out - -} - - -process_dt <- function(dt, povline, mean_and_med = FALSE) { - dt[, compute_fgt_dt(.SD, "welfare", "weight", povline, mean_and_med), - by = .(file, reporting_level)] -} - -#' load survey year files and store them in a list -#' -#' @param metadata data frame from `subset_lkup()` -#' -#' @return list with survey years data -#' @keywords internal -load_data_list <- \(metadata) { - - # unique values - mdout <- metadata[, lapply(.SD, list), by = path] - upaths <- mdout$path - urep_level <- mdout$reporting_level - uppp <- mdout$ppp - ucpi <- mdout$cpi - - seq_along(upaths) |> - lapply(\(f) { - path <- upaths[f] - rep_level <- urep_level[f][[1]] - ppp <- uppp[f][[1]] - cpi <- ucpi[f][[1]] - - # Build a data.table to merge cpi and ppp - fdt <- data.table(reporting_level = as.character(rep_level), - ppp = ppp, - cpi = cpi) - - # load data and format - dt <- fst::read_fst(path, as.data.table = TRUE) - - if (length(rep_level) == 1) { - if (rep_level == "national") dt[, area := "national"] - } - setnames(dt, "area", "reporting_level") - dt[, - `:=`( - file = basename(path), - reporting_level = as.character(reporting_level) - ) - ] - - dt <- join(dt, fdt, - on = "reporting_level", - validate = "m:1", - how = "left", - verbose = 0) - - dt[, welfare := welfare/(cpi * ppp) - ][, - c("cpi", "ppp") := NULL] - - }) - -} - diff --git a/R/rg_pip_old.R b/R/rg_pip_old.R index e5f71be3..da17b015 100644 --- a/R/rg_pip_old.R +++ b/R/rg_pip_old.R @@ -85,115 +85,3 @@ rg_pip_old <- function(country, -# OLD: Efficient FGT calculation for a data.table and vector of poverty lines -#' Title -#' -#' @param dt data frame with `welfare` and `weight` columns -#' @param welfare character: welfare variable name -#' @param weight character: weight variable name -#' @param povlines double: vector with poveryt lines -#' -#' @return data.table with estimates poverty estimates -#' @keywords internal -compute_fgt_dt_old <- function(dt, welfare, weight, povlines) { - w <- dt[[welfare]] - wt <- dt[[weight]] - n <- length(w) - m <- length(povlines) - - # Pre-allocate result matrix - res <- matrix(NA_real_, nrow = m, ncol = 3) - colnames(res) <- c("FGT0", "FGT1", "FGT2") - watts_vec <- numeric(m) - - # Precompute log(w) for efficiency - logw <- rep(NA_real_, n) - pos <- w > 0 - logw[pos] <- log(w[pos]) - - for (i in seq_along(povlines)) { - pov <- povlines[i] - poor <- w < pov - rel_dist <- 1 - (w / pov) - rel_dist[!poor] <- 0 - res[i, 1] <- fmean(poor, w = wt) # FGT0 - res[i, 2] <- fmean(rel_dist, w = wt) # FGT1 - res[i, 3] <- fmean(rel_dist^2, w = wt) # FGT2 - - # Optimized Watts index calculation - keep <- poor & pos - if (any(keep, na.rm = TRUE)) { - watts_vec[i] <- (fsum((log(pov) - logw[keep]) * wt[keep])) / fsum(wt) - } else { - watts_vec[i] <- 0 - } - } - data.table( - povline = povlines, - headcount = res[, 1], - poverty_gap = res[, 2], - poverty_severity = res[, 3], - watts = watts_vec - ) -} - - -process_dt_old <- function(dt, povline) { - dt[, compute_fgt_dt_old(.SD, "welfare", "weight", povline), - by = .(file, reporting_level)] -} - -#' OLD: load survey year files and store them in a list -#' -#' @param metadata data frame from `subset_lkup()` -#' -#' @return list with survey years data -#' @keywords internal -load_data_list_old <- \(metadata) { - - # unique values - mdout <- metadata[, lapply(.SD, list), by = path] - upaths <- mdout$path - urep_level <- mdout$reporting_level - uppp <- mdout$ppp - ucpi <- mdout$cpi - - seq_along(upaths) |> - lapply(\(f) { - path <- upaths[f] - rep_level <- urep_level[f][[1]] - ppp <- uppp[f][[1]] - cpi <- ucpi[f][[1]] - - # Build a data.table to merge cpi and ppp - fdt <- data.table(reporting_level = as.character(rep_level), - ppp = ppp, - cpi = cpi) - - # load data and format - dt <- fst::read_fst(path, as.data.table = TRUE) - - if (length(rep_level) == 1) { - if (rep_level == "national") dt[, area := "national"] - } - setnames(dt, "area", "reporting_level") - dt[, - `:=`( - file = basename(path), - reporting_level = as.character(reporting_level) - ) - ] - - dt <- join(dt, fdt, - on = "reporting_level", - validate = "m:1", - how = "left", - verbose = 0) - - dt[, welfare := welfare/(cpi * ppp) - ][, - c("cpi", "ppp") := NULL] - - }) - -} diff --git a/man/compute_fgt_dt.Rd b/man/compute_fgt_dt.Rd index a57b0748..ac27c0f1 100644 --- a/man/compute_fgt_dt.Rd +++ b/man/compute_fgt_dt.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/rg_pip.R +% Please edit documentation in R/compute_fgt_new.R \name{compute_fgt_dt} \alias{compute_fgt_dt} \title{Title} diff --git a/man/compute_fgt_dt_old.Rd b/man/compute_fgt_dt_old.Rd index 82c9babd..58f12e5e 100644 --- a/man/compute_fgt_dt_old.Rd +++ b/man/compute_fgt_dt_old.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/rg_pip_old.R +% Please edit documentation in R/compute_fgt_old.R \name{compute_fgt_dt_old} \alias{compute_fgt_dt_old} \title{Title} diff --git a/man/load_data_list.Rd b/man/load_data_list.Rd index 26a142d2..8098daa9 100644 --- a/man/load_data_list.Rd +++ b/man/load_data_list.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/rg_pip.R +% Please edit documentation in R/compute_fgt_new.R \name{load_data_list} \alias{load_data_list} \title{load survey year files and store them in a list} diff --git a/man/load_data_list_old.Rd b/man/load_data_list_old.Rd index b6ea172f..23dc3862 100644 --- a/man/load_data_list_old.Rd +++ b/man/load_data_list_old.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/rg_pip_old.R +% Please edit documentation in R/compute_fgt_old.R \name{load_data_list_old} \alias{load_data_list_old} \title{OLD: load survey year files and store them in a list} From d4dea9f6130df6799fb02e54e8c8fce238c46686 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Fri, 22 Aug 2025 16:01:57 -0400 Subject: [PATCH 053/203] improve efficiency a little --- R/compute_fgt_new.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/R/compute_fgt_new.R b/R/compute_fgt_new.R index f4651f21..f79cf6cf 100644 --- a/R/compute_fgt_new.R +++ b/R/compute_fgt_new.R @@ -19,10 +19,9 @@ compute_fgt_dt <- function(dt, welfare, weight, povlines, mean_and_med = FALSE) colnames(res) <- c("FGT0", "FGT1", "FGT2") watts_vec <- numeric(m) - # Precompute log(w) for efficiency - logw <- rep(NA_real_, n) + # Precompute log(w) for efficiency (vectorized) + logw <- fifelse(w > 0, log(w), NA_real_) pos <- w > 0 - logw[pos] <- log(w[pos]) for (i in seq_along(povlines)) { pov <- povlines[i] From 9b688bb1cc8d4589b608fed1347206b18cdb519c Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Fri, 22 Aug 2025 16:30:56 -0400 Subject: [PATCH 054/203] more optimization --- R/compute_fgt_new.R | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/R/compute_fgt_new.R b/R/compute_fgt_new.R index f79cf6cf..f0ce396d 100644 --- a/R/compute_fgt_new.R +++ b/R/compute_fgt_new.R @@ -20,8 +20,12 @@ compute_fgt_dt <- function(dt, welfare, weight, povlines, mean_and_med = FALSE) watts_vec <- numeric(m) # Precompute log(w) for efficiency (vectorized) - logw <- fifelse(w > 0, log(w), NA_real_) - pos <- w > 0 + + pos <- w > 0 + # logw <- log(w) + logw <- copyv(log(w), pos, NA_real_, invert = TRUE) |> + suppressWarnings() + # logw <- fifelse(w > 0, log(w), NA_real_) for (i in seq_along(povlines)) { pov <- povlines[i] From b194914ef5ca750bbebec6683be56c47d9b1367d Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Fri, 22 Aug 2025 17:28:44 -0400 Subject: [PATCH 055/203] remove rep.int for efficiency --- R/add_attributes_as_columns.R | 237 ------------------- R/compute_fgt_new.R | 11 +- R/fg_pip.R | 5 +- R/utils-pipdata.R | 245 ++++++++++++++++++++ man/add_attributes_as_columns_multi.Rd | 2 +- man/add_attributes_as_columns_vectorized.Rd | 2 +- man/assign_stat.Rd | 2 +- 7 files changed, 256 insertions(+), 248 deletions(-) delete mode 100644 R/add_attributes_as_columns.R diff --git a/R/add_attributes_as_columns.R b/R/add_attributes_as_columns.R deleted file mode 100644 index f1cd461f..00000000 --- a/R/add_attributes_as_columns.R +++ /dev/null @@ -1,237 +0,0 @@ -#' Add attributes as columns (vectorized, in-place) -#' -#' @description -#' Converts survey attributes on a `data.table`—including -#' `reporting_level_rows`, `country_code`, `reporting_year`, and `dist_stats`— -#' into columns using a **loop-free, segment-replication** strategy. Designed -#' for very large tables and objects loaded via `readRDS()`/`load()`: -#' uses `setDT()` and `alloc.col()` to ensure in-place assignment. -#' -#' @details -#' The function expects an attribute `reporting_level_rows`, a list with: -#' - `reporting_level`: character vector of the level label for each segment -#' (e.g., `c("rural","urban","rural", ...)`). -#' - `rows`: integer vector of **cumulative** row-ends (e.g., -#' `c(100000, 200000, 300000, ...)`). -#' -#' Segment lengths are computed as `diff(c(0L, rows))`, and `reporting_level` -#' is replicated with `rep.int(lev, counts)`. Constants `country_code`, -#' `reporting_year`, and `file` (`paste0(country_code, "_", reporting_year)`) -#' are added to all rows. If `dist_stats$mean` / `dist_stats$median` are -#' provided (as named vectors/lists keyed by level), they are mapped by level -#' name and replicated per segment. If a level is missing from the names, -#' `NA` values may result for that segment. -#' -#' This implementation avoids loops and `findInterval()` edge cases, and -#' modifies `dt` by reference. -#' -#' @param dt A `data.table` carrying the attributes described above. -#' -#' @return The same `data.table`, modified by reference, with added columns: -#' `reporting_level`, `country_code`, `reporting_year`, `file`, and (if -#' present) `mean`, `median`. -#' -#' @section Assumptions: -#' * `length(reporting_level_rows$reporting_level) == length(reporting_level_rows$rows)`. -#' * `rows` are cumulative and non-decreasing, and their segment lengths sum to `nrow(dt)`. -#' * If `dist_stats$mean` / `dist_stats$median` have multiple values, their names -#' align with the level labels. -#' -#' @note For objects loaded from disk (e.g., via `readRDS()`), `alloc.col(dt)` -#' ensures there is spare column capacity for by-reference assignment. -#' -#' @seealso [add_attributes_as_columns_multi()], [assign_stat()] -#' -#' @examples -#' \dontrun{ -#' library(data.table) -#' dt <- data.table(weight = 1:6, welfare = runif(6)) -#' attr(dt, "reporting_level_rows") <- list( -#' reporting_level = c("rural","urban","rural"), -#' rows = c(2L, 4L, 6L) -#' ) -#' attr(dt, "country_code") <- "XXY" -#' attr(dt, "reporting_year") <- 2000L -#' attr(dt, "dist_stats") <- list( -#' mean = list(rural = 2.5, urban = 5.0), -#' median = list(rural = 2.0, urban = 4.5) -#' ) -#' -#' add_attributes_as_columns_vectorized(dt) -#' head(dt) -#' } -#' -#' @import data.table -#' @export -add_attributes_as_columns_vectorized <- function(dt) { - - # Ensure proper internal state & spare column capacity (handles readRDS/load cases) - setDT(dt) # harmless if already a data.table - alloc.col(dt) # pre-allocate room for new columns - - rl <- attr(dt, "reporting_level_rows") - lev <- rl$reporting_level - rows <- as.integer(rl$rows) - n <- nrow(dt) - - counts <- diff(c(0L, rows)) - if (sum(counts) != n) cli::cli_abort("Sum of 'rows' in attribute does not equal nrow(dt).") - - # reporting_level: vectorized, no loops, no findInterval edge cases - dt[, - reporting_level := rep.int(lev, counts)] - - # constants - cc <- attr(dt, "country_code") - ry <- attr(dt, "reporting_year") - dt[, `:=`( - country_code = cc, - reporting_year = ry, - file = paste0(cc, "_", ry) - )] - - # dist_stats per reporting_level (align by names, then replicate by counts) - ds <- attr(dt, "dist_stats") - if (length(ds)) { - if (!is.null(ds$mean)) { - m <- unlist(ds$mean, use.names = TRUE) - dt[, - mean := rep.int(unname(m[match(lev, - names(m))]), - counts)] - } - if (!is.null(ds$median)) { - md <- unlist(ds$median, use.names = TRUE) - dt[, - median := rep.int(unname(md[match(lev, - names(md))]), - counts)] - } - } - - dt -} - - - -#' Add attributes as columns for multi-segment reporting levels -#' -#' @description -#' Converts attributes on a survey `data.table` (e.g., `reporting_level_rows`, -#' `country_code`, `reporting_year`, and `dist_stats`) into columns, handling -#' **multiple alternating segments** (e.g., CHN rural/urban/rural/urban) or -#' single-segment cases (e.g., ZAF). -#' -#' @param dt A `data.table` with attributes: -#' - `reporting_level_rows`: list with `reporting_level` (character) and -#' `rows` (integer cumulative row ends). -#' - `country_code` (character). -#' - `reporting_year` (integer/numeric). -#' - `dist_stats` (list) optionally containing `mean` and/or `median`, each as -#' a named list/vector keyed by reporting level, or a single scalar. -#' -#' @return The same `data.table`, modified by reference, with new columns: -#' `reporting_level`, `country_code`, `reporting_year`, `file`, and -#' optionally `mean`, `median`. -#' -#' @examples -#' # chn2000_cols <- add_attributes_as_columns_multi(chn2000) -#' # zaf2000_cols <- add_attributes_as_columns_multi(zaf2000) -#' @import data.table -#' @export -add_attributes_as_columns_multi <- function(dt) { - # Ensure DT internals and spare capacity for new columns - setDT(dt) - alloc.col(dt) - - # --- Pull + validate segment metadata --- - rl <- attr(dt, "reporting_level_rows") - if (is.null(rl) || is.null(rl$reporting_level) || is.null(rl$rows)) { - cli::cli_abort("Missing 'reporting_level_rows' attribute with $reporting_level and $rows.") - } - lev <- as.character(rl$reporting_level) - rows <- as.integer(rl$rows) - n <- nrow(dt) - - if (length(lev) != length(rows)) cli::cli_abort("'reporting_level' and 'rows' lengths differ.") - if (length(rows) == 0L) cli::cli_abort("'rows' is empty.") - if (any(diff(rows) < 0L)) cli::cli_abort("'rows' must be non-decreasing.") - if (rows[length(rows)] != n) cli::cli_abort("Last element of 'rows' must equal nrow(dt).") - - counts <- diff(c(0L, rows)) - if (any(counts <= 0L)) cli::cli_abort("Computed non-positive segment length(s).") - - # --- reporting_level: vectorized per-segment replication --- - dt[, reporting_level := rep.int(lev, counts)] - - # --- constants --- - cc <- attr(dt, "country_code") - ry <- attr(dt, "reporting_year") - dt[, `:=`( - country_code = cc, - reporting_year = ry, - file = paste0(cc, "_", ry) - )] - - # --- distribution stats --- - ds <- attr(dt, "dist_stats") - if (length(ds)) { - assign_stat(dt, lev, counts, ds$mean, "mean") - assign_stat(dt, lev, counts, ds$median, "median") - } - - dt -} - - - -#' Assign a per-level statistic to a data.table column (by reference) -#' -#' @description -#' Replicates a statistic per reporting-level segment and assigns it to a new -#' column in `dt`, **in place**. `stat` can be a scalar (broadcast), a named -#' vector, or a named list (one value per level). -#' -#' @param dt A `data.table`. Modified by reference. -#' @param lev Character vector of reporting-level labels per segment -#' (e.g., `c("rural","urban","rural", ...)`). -#' @param counts Integer vector of segment lengths matching `lev` -#' (e.g., `c(100000, 100000, 100000, ...)`). -#' @param stat A numeric scalar, named vector, or named list with one value per -#' level (names must match `lev` values). -#' @param colname Name of the column to create/overwrite. -#' -#' @return Invisibly returns `dt` (modified by reference). -#' @examples -#' # assign_stat(dt, lev, counts, list(rural = 2.6, urban = 5.5), "mean") -#' @import data.table -#' @export -assign_stat <- function(dt, lev, counts, stat, colname) { - if (is.null(stat)) return(invisible(dt)) - n <- nrow(dt) - - v <- if (is.list(stat)) unlist(stat, use.names = TRUE) else stat - - # Single scalar: broadcast - if (length(v) == 1L && is.null(names(v))) { - dt[, (colname) := rep.int(unname(v), n)] - return(invisible(dt)) - } - - # Need names to map values to levels - if (is.null(names(v))) { - stop("`stat` has length > 1 but no names; cannot map to levels.") - } - - map_idx <- match(lev, names(v)) - if (anyNA(map_idx)) { - missing_levels <- unique(lev[is.na(map_idx)]) - stop( - sprintf("`stat` missing value(s) for level(s): %s", - paste(missing_levels, collapse = ", ")) - ) - } - - dt[, (colname) := rep.int(unname(v[map_idx]), counts)] - invisible(dt) -} diff --git a/R/compute_fgt_new.R b/R/compute_fgt_new.R index f0ce396d..0e066e85 100644 --- a/R/compute_fgt_new.R +++ b/R/compute_fgt_new.R @@ -31,7 +31,8 @@ compute_fgt_dt <- function(dt, welfare, weight, povlines, mean_and_med = FALSE) pov <- povlines[i] poor <- w < pov rel_dist <- 1 - (w / pov) - rel_dist[!poor] <- 0 + setv(rel_dist, poor, 0, invert = TRUE) + # rel_dist[!poor] <- 0 res[i, 1] <- fmean(poor, w = wt) # FGT0 res[i, 2] <- fmean(rel_dist, w = wt) # FGT1 res[i, 3] <- fmean(rel_dist^2, w = wt) # FGT2 @@ -46,10 +47,10 @@ compute_fgt_dt <- function(dt, welfare, weight, povlines, mean_and_med = FALSE) } if (mean_and_med) { - mn <- funique(dt$mean) - med <- funique(dt$median) - cy <- funique(dt$coutnry_code) - ry <- funique(dt$reporting_year) + mn <- ffirst(dt$mean) + med <- ffirst(dt$median) + cy <- ffirst(dt$coutnry_code) + ry <- ffirst(dt$reporting_year) out <- data.table( povline = povlines, headcount = res[, 1], diff --git a/R/fg_pip.R b/R/fg_pip.R index e8c3edb2..f177a995 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -75,9 +75,8 @@ fg_pip <- function(country, res <- lapply(lt, process_dt, povline = povline, - mean_and_med = TRUE) - res <- rbindlist(res, - fill = TRUE) + mean_and_med = TRUE) |> + rbindlist(fill = TRUE) # ZP Add: join to metadata #------------------------- diff --git a/R/utils-pipdata.R b/R/utils-pipdata.R index 16b996ce..6c961cd3 100644 --- a/R/utils-pipdata.R +++ b/R/utils-pipdata.R @@ -47,3 +47,248 @@ transform_input <- function(input_list){ output_list <- unlist(output_list, recursive = FALSE) return(output_list) } + + +#' Add attributes as columns (vectorized, in-place) +#' +#' @description +#' Converts survey attributes on a `data.table`—including +#' `reporting_level_rows`, `country_code`, `reporting_year`, and `dist_stats`— +#' into columns using a **loop-free, segment-replication** strategy. Designed +#' for very large tables and objects loaded via `readRDS()`/`load()`: +#' uses `setDT()` and `alloc.col()` to ensure in-place assignment. +#' +#' @details +#' The function expects an attribute `reporting_level_rows`, a list with: +#' - `reporting_level`: character vector of the level label for each segment +#' (e.g., `c("rural","urban","rural", ...)`). +#' - `rows`: integer vector of **cumulative** row-ends (e.g., +#' `c(100000, 200000, 300000, ...)`). +#' +#' Segment lengths are computed as `diff(c(0L, rows))`, and `reporting_level` +#' is replicated with `rep.int(lev, counts)`. Constants `country_code`, +#' `reporting_year`, and `file` (`paste0(country_code, "_", reporting_year)`) +#' are added to all rows. If `dist_stats$mean` / `dist_stats$median` are +#' provided (as named vectors/lists keyed by level), they are mapped by level +#' name and replicated per segment. If a level is missing from the names, +#' `NA` values may result for that segment. +#' +#' This implementation avoids loops and `findInterval()` edge cases, and +#' modifies `dt` by reference. +#' +#' @param dt A `data.table` carrying the attributes described above. +#' +#' @return The same `data.table`, modified by reference, with added columns: +#' `reporting_level`, `country_code`, `reporting_year`, `file`, and (if +#' present) `mean`, `median`. +#' +#' @section Assumptions: +#' * `length(reporting_level_rows$reporting_level) == length(reporting_level_rows$rows)`. +#' * `rows` are cumulative and non-decreasing, and their segment lengths sum to `nrow(dt)`. +#' * If `dist_stats$mean` / `dist_stats$median` have multiple values, their names +#' align with the level labels. +#' +#' @note For objects loaded from disk (e.g., via `readRDS()`), `alloc.col(dt)` +#' ensures there is spare column capacity for by-reference assignment. +#' +#' @seealso [add_attributes_as_columns_multi()], [assign_stat()] +#' +#' @examples +#' \dontrun{ +#' library(data.table) +#' dt <- data.table(weight = 1:6, welfare = runif(6)) +#' attr(dt, "reporting_level_rows") <- list( +#' reporting_level = c("rural","urban","rural"), +#' rows = c(2L, 4L, 6L) +#' ) +#' attr(dt, "country_code") <- "XXY" +#' attr(dt, "reporting_year") <- 2000L +#' attr(dt, "dist_stats") <- list( +#' mean = list(rural = 2.5, urban = 5.0), +#' median = list(rural = 2.0, urban = 4.5) +#' ) +#' +#' add_attributes_as_columns_vectorized(dt) +#' head(dt) +#' } +#' +#' @import data.table +#' @export +add_attributes_as_columns_vectorized <- function(dt) { + + # Ensure proper internal state & spare column capacity (handles readRDS/load cases) + setDT(dt) # harmless if already a data.table + alloc.col(dt) # pre-allocate room for new columns + + rl <- attr(dt, "reporting_level_rows") + lev <- rl$reporting_level + rows <- as.integer(rl$rows) + n <- fnrow(dt) + + counts <- diff(c(0L, rows)) + if (sum(counts) != n) cli::cli_abort("Sum of 'rows' in attribute does not equal nrow(dt).") + + # reporting_level: optimized assignment by range + reporting_level_vec <- character(n) + start <- 1L + for (i in seq_along(lev)) { + end <- rows[i] + reporting_level_vec[start:end] <- lev[i] + start <- end + 1L + } + dt[, reporting_level := reporting_level_vec] + + # constants + cc <- attr(dt, "country_code") + ry <- attr(dt, "reporting_year") + dt[, `:=`( + country_code = cc, + reporting_year = ry, + file = paste0(cc, "_", ry) + )] + + # dist_stats per reporting_level (align by names, then replicate by counts) + ds <- attr(dt, "dist_stats") + if (length(ds)) { + if (!is.null(ds$mean)) { + m <- unlist(ds$mean, use.names = TRUE) + dt[, + mean := rep.int(unname(m[match(lev, + names(m))]), + counts)] + } + if (!is.null(ds$median)) { + md <- unlist(ds$median, use.names = TRUE) + dt[, + median := rep.int(unname(md[match(lev, + names(md))]), + counts)] + } + } + + dt +} + + + +#' Add attributes as columns for multi-segment reporting levels +#' +#' @description +#' Converts attributes on a survey `data.table` (e.g., `reporting_level_rows`, +#' `country_code`, `reporting_year`, and `dist_stats`) into columns, handling +#' **multiple alternating segments** (e.g., CHN rural/urban/rural/urban) or +#' single-segment cases (e.g., ZAF). +#' +#' @param dt A `data.table` with attributes: +#' - `reporting_level_rows`: list with `reporting_level` (character) and +#' `rows` (integer cumulative row ends). +#' - `country_code` (character). +#' - `reporting_year` (integer/numeric). +#' - `dist_stats` (list) optionally containing `mean` and/or `median`, each as +#' a named list/vector keyed by reporting level, or a single scalar. +#' +#' @return The same `data.table`, modified by reference, with new columns: +#' `reporting_level`, `country_code`, `reporting_year`, `file`, and +#' optionally `mean`, `median`. +#' +#' @examples +#' # chn2000_cols <- add_attributes_as_columns_multi(chn2000) +#' # zaf2000_cols <- add_attributes_as_columns_multi(zaf2000) +#' @import data.table +#' @export +add_attributes_as_columns_multi <- function(dt) { + # Ensure DT internals and spare capacity for new columns + setDT(dt) + alloc.col(dt) + + # --- Pull + validate segment metadata --- + rl <- attr(dt, "reporting_level_rows") + if (is.null(rl) || is.null(rl$reporting_level) || is.null(rl$rows)) { + cli::cli_abort("Missing 'reporting_level_rows' attribute with $reporting_level and $rows.") + } + lev <- as.character(rl$reporting_level) + rows <- as.integer(rl$rows) + n <- nrow(dt) + + if (length(lev) != length(rows)) cli::cli_abort("'reporting_level' and 'rows' lengths differ.") + if (length(rows) == 0L) cli::cli_abort("'rows' is empty.") + if (any(diff(rows) < 0L)) cli::cli_abort("'rows' must be non-decreasing.") + if (rows[length(rows)] != n) cli::cli_abort("Last element of 'rows' must equal nrow(dt).") + + counts <- diff(c(0L, rows)) + if (any(counts <= 0L)) cli::cli_abort("Computed non-positive segment length(s).") + + # --- reporting_level: vectorized per-segment replication --- + dt[, reporting_level := rep.int(lev, counts)] + + # --- constants --- + cc <- attr(dt, "country_code") + ry <- attr(dt, "reporting_year") + dt[, `:=`( + country_code = cc, + reporting_year = ry, + file = paste0(cc, "_", ry) + )] + + # --- distribution stats --- + ds <- attr(dt, "dist_stats") + if (length(ds)) { + assign_stat(dt, lev, counts, ds$mean, "mean") + assign_stat(dt, lev, counts, ds$median, "median") + } + + dt +} + + + +#' Assign a per-level statistic to a data.table column (by reference) +#' +#' @description +#' Replicates a statistic per reporting-level segment and assigns it to a new +#' column in `dt`, **in place**. `stat` can be a scalar (broadcast), a named +#' vector, or a named list (one value per level). +#' +#' @param dt A `data.table`. Modified by reference. +#' @param lev Character vector of reporting-level labels per segment +#' (e.g., `c("rural","urban","rural", ...)`). +#' @param counts Integer vector of segment lengths matching `lev` +#' (e.g., `c(100000, 100000, 100000, ...)`). +#' @param stat A numeric scalar, named vector, or named list with one value per +#' level (names must match `lev` values). +#' @param colname Name of the column to create/overwrite. +#' +#' @return Invisibly returns `dt` (modified by reference). +#' @examples +#' # assign_stat(dt, lev, counts, list(rural = 2.6, urban = 5.5), "mean") +#' @import data.table +#' @export +assign_stat <- function(dt, lev, counts, stat, colname) { + if (is.null(stat)) return(invisible(dt)) + n <- nrow(dt) + + v <- if (is.list(stat)) unlist(stat, use.names = TRUE) else stat + + # Single scalar: broadcast + if (length(v) == 1L && is.null(names(v))) { + dt[, (colname) := rep.int(unname(v), n)] + return(invisible(dt)) + } + + # Need names to map values to levels + if (is.null(names(v))) { + stop("`stat` has length > 1 but no names; cannot map to levels.") + } + + map_idx <- match(lev, names(v)) + if (anyNA(map_idx)) { + missing_levels <- unique(lev[is.na(map_idx)]) + stop( + sprintf("`stat` missing value(s) for level(s): %s", + paste(missing_levels, collapse = ", ")) + ) + } + + dt[, (colname) := rep.int(unname(v[map_idx]), counts)] + invisible(dt) +} diff --git a/man/add_attributes_as_columns_multi.Rd b/man/add_attributes_as_columns_multi.Rd index 7df2c2ed..27a0d5a7 100644 --- a/man/add_attributes_as_columns_multi.Rd +++ b/man/add_attributes_as_columns_multi.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/add_attributes_as_columns.R +% Please edit documentation in R/utils-pipdata.R \name{add_attributes_as_columns_multi} \alias{add_attributes_as_columns_multi} \title{Add attributes as columns for multi-segment reporting levels} diff --git a/man/add_attributes_as_columns_vectorized.Rd b/man/add_attributes_as_columns_vectorized.Rd index 66c51a00..40e8008d 100644 --- a/man/add_attributes_as_columns_vectorized.Rd +++ b/man/add_attributes_as_columns_vectorized.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/add_attributes_as_columns.R +% Please edit documentation in R/utils-pipdata.R \name{add_attributes_as_columns_vectorized} \alias{add_attributes_as_columns_vectorized} \title{Add attributes as columns (vectorized, in-place)} diff --git a/man/assign_stat.Rd b/man/assign_stat.Rd index 987b08f4..36cb81b9 100644 --- a/man/assign_stat.Rd +++ b/man/assign_stat.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/add_attributes_as_columns.R +% Please edit documentation in R/utils-pipdata.R \name{assign_stat} \alias{assign_stat} \title{Assign a per-level statistic to a data.table column (by reference)} From ed78664e3e34574321719d5d9fc6d6d0b89638b7 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Fri, 22 Aug 2025 18:00:49 -0400 Subject: [PATCH 056/203] slight optimization --- R/fg_pip.R | 9 +++++---- R/utils-pipdata.R | 20 +++++++++----------- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/R/fg_pip.R b/R/fg_pip.R index f177a995..1d824164 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -72,10 +72,11 @@ fg_pip <- function(country, # ZP Add: do fgt estimations using `res <- lapply(lt, process_dt, povline = povline)` #------------------------- - res <- lapply(lt, - process_dt, - povline = povline, - mean_and_med = TRUE) |> + res <- lapply(lt, \(x) { + process_dt(x, + povline = povline, + mean_and_med = TRUE) + }) |> rbindlist(fill = TRUE) # ZP Add: join to metadata diff --git a/R/utils-pipdata.R b/R/utils-pipdata.R index 6c961cd3..5e76f2c0 100644 --- a/R/utils-pipdata.R +++ b/R/utils-pipdata.R @@ -118,7 +118,7 @@ add_attributes_as_columns_vectorized <- function(dt) { # Ensure proper internal state & spare column capacity (handles readRDS/load cases) setDT(dt) # harmless if already a data.table - alloc.col(dt) # pre-allocate room for new columns + setalloccol(dt) # pre-allocate room for new columns... #AC, I am still not sure about this. rl <- attr(dt, "reporting_level_rows") lev <- rl$reporting_level @@ -151,18 +151,16 @@ add_attributes_as_columns_vectorized <- function(dt) { ds <- attr(dt, "dist_stats") if (length(ds)) { if (!is.null(ds$mean)) { - m <- unlist(ds$mean, use.names = TRUE) - dt[, - mean := rep.int(unname(m[match(lev, - names(m))]), - counts)] + for (l in lev) { + dt[reporting_level == l, + mean := ds$mean[[l]]] + } } if (!is.null(ds$median)) { - md <- unlist(ds$median, use.names = TRUE) - dt[, - median := rep.int(unname(md[match(lev, - names(md))]), - counts)] + for (l in lev) { + dt[reporting_level == l, + median := ds$median[[l]]] + } } } From 563cc34cef7ebc840a2cbe3b563bcb24a158ffb8 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Fri, 22 Aug 2025 18:24:40 -0400 Subject: [PATCH 057/203] more optimization --- R/utils-pipdata.R | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/R/utils-pipdata.R b/R/utils-pipdata.R index 5e76f2c0..57f01c96 100644 --- a/R/utils-pipdata.R +++ b/R/utils-pipdata.R @@ -149,18 +149,22 @@ add_attributes_as_columns_vectorized <- function(dt) { # dist_stats per reporting_level (align by names, then replicate by counts) ds <- attr(dt, "dist_stats") + + + if (length(ds)) { - if (!is.null(ds$mean)) { - for (l in lev) { - dt[reporting_level == l, - mean := ds$mean[[l]]] - } - } - if (!is.null(ds$median)) { - for (l in lev) { - dt[reporting_level == l, - median := ds$median[[l]]] - } + dstats <- c("mean", "median") + for (l in lev) { + wrl <- whichv(dt$reporting_level, l) + ld <- lapply(dstats, \(d) { + if (is.null(ds[[d]][[l]])) { + NA + } else { + ds[[d]][[l]] + } + }) + + dt[wrl, (dstats) := ld] } } From 95e9f21689837b9d2682ae71df4e5e6dd4815e7f Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Fri, 22 Aug 2025 19:08:06 -0400 Subject: [PATCH 058/203] init reformat --- R/compute_fgt_new.R | 57 +++++++++++++++++++++++++++++++++++++++++++++ R/fg_pip.R | 14 ++++++++--- R/utils-pipdata.R | 19 +++++++-------- 3 files changed, 77 insertions(+), 13 deletions(-) diff --git a/R/compute_fgt_new.R b/R/compute_fgt_new.R index 0e066e85..f4d0135c 100644 --- a/R/compute_fgt_new.R +++ b/R/compute_fgt_new.R @@ -75,6 +75,63 @@ compute_fgt_dt <- function(dt, welfare, weight, povlines, mean_and_med = FALSE) } + +# Efficient FGT calculation for vectors (No data.table) +#' +#' @param w character: welfare variable name +#' @param wt character: weight variable name +#' @param povlines double: vector with poverty lines +#' +#' @return data.table with estimates poverty estimates +#' @keywords internal +compute_fgt <- function(w, wt, povlines) { + n <- length(w) + m <- length(povlines) + + # Pre-allocate result matrix + res <- matrix(NA_real_, nrow = m, ncol = 3) + colnames(res) <- c("FGT0", "FGT1", "FGT2") + watts_vec <- numeric(m) + + # Precompute log(w) for efficiency (vectorized) + + pos <- w > 0 + # logw <- log(w) + logw <- copyv(log(w), pos, NA_real_, invert = TRUE) |> + suppressWarnings() + # logw <- fifelse(w > 0, log(w), NA_real_) + + for (i in seq_along(povlines)) { + pov <- povlines[i] + poor <- w < pov + rel_dist <- 1 - (w / pov) + setv(rel_dist, poor, 0, invert = TRUE) + # rel_dist[!poor] <- 0 + res[i, 1] <- fmean(poor, w = wt) # FGT0 + res[i, 2] <- fmean(rel_dist, w = wt) # FGT1 + res[i, 3] <- fmean(rel_dist^2, w = wt) # FGT2 + + # Optimized Watts index calculation + keep <- poor & pos + if (any(keep, na.rm = TRUE)) { + watts_vec[i] <- (fsum((log(pov) - logw[keep]) * wt[keep])) / fsum(wt) + } else { + watts_vec[i] <- 0 + } + } + + + data.table( + povline = povlines, + headcount = res[, 1], + poverty_gap = res[, 2], + poverty_severity = res[, 3], + watts = watts_vec) + +} + + + process_dt <- function(dt, povline, mean_and_med = FALSE) { dt[, compute_fgt_dt(.SD, "welfare", "weight", povline, mean_and_med), by = .(file, reporting_level)] diff --git a/R/fg_pip.R b/R/fg_pip.R index 1d824164..a2dc64a0 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -66,9 +66,17 @@ fg_pip <- function(country, load_list_refy(input_list = full_list, path = fs::path(data_dir, "lineup_data")) - lt <- lapply(lt, \(x) { - add_attributes_as_columns_vectorized(x) - }) + # lt <- lapply(lt, \(x) { + # add_attributes_as_columns_vectorized(x) + # }) + + lt_att <- lapply(lt, \(.) { + list( + dist_stats = attr(., "dt_dist_stats"), + rl_rows = attr(., "reporting_level_rows") + ) + }) + # ZP Add: do fgt estimations using `res <- lapply(lt, process_dt, povline = povline)` #------------------------- diff --git a/R/utils-pipdata.R b/R/utils-pipdata.R index 57f01c96..3459c1b9 100644 --- a/R/utils-pipdata.R +++ b/R/utils-pipdata.R @@ -8,16 +8,15 @@ load_list_refy <- \(input_list, path){ input_list <- transform_input(input_list) - dl <- lapply(input_list, FUN = function(x) { - qs::qread(file = fs::path(path, paste0(x$country_code, "_", - x$year), - ext = "qs")) - }) + inames <- lapply(input_list, \(x) { + paste0(x$country_code, "_", x$year) + }) + + dl <- lapply(inames, \(x) { + qs::qread(file = fs::path(path, x, ext = "qs")) + }) |> + setNames(inames) - names(dl) <- vapply(input_list, \(x) { - paste0(x$country_code, x$year) - }, - FUN.VALUE = character(1)) dl } @@ -151,7 +150,7 @@ add_attributes_as_columns_vectorized <- function(dt) { ds <- attr(dt, "dist_stats") - + # AC: I think this is not necessary at this stage. Can we merge at a later point? if (length(ds)) { dstats <- c("mean", "median") for (l in lev) { From d5b3370642fdddbf2ee60c94f438649cf85f747c Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Fri, 22 Aug 2025 22:06:25 -0400 Subject: [PATCH 059/203] new optimization --- R/compute_fgt_new.R | 45 ++++++++++++++++++++++++++++++++++++++++++--- R/fg_pip.R | 19 +++++++------------ R/utils-pipdata.R | 45 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 94 insertions(+), 15 deletions(-) diff --git a/R/compute_fgt_new.R b/R/compute_fgt_new.R index f4d0135c..e78d8296 100644 --- a/R/compute_fgt_new.R +++ b/R/compute_fgt_new.R @@ -97,9 +97,9 @@ compute_fgt <- function(w, wt, povlines) { pos <- w > 0 # logw <- log(w) - logw <- copyv(log(w), pos, NA_real_, invert = TRUE) |> - suppressWarnings() - # logw <- fifelse(w > 0, log(w), NA_real_) + # logw <- copyv(log(w), pos, NA_real_, invert = TRUE) |> + # suppressWarnings() + logw <- fifelse(w > 0, log(w), NA_real_) for (i in seq_along(povlines)) { pov <- povlines[i] @@ -130,8 +130,47 @@ compute_fgt <- function(w, wt, povlines) { } +#' compute FGT using indices by reporting level +#' +#' This function is intended to be used inside [map_fgt] +#' +#' @param x data.table from lt list, with welfare and weight vectors +#' @param y list of indices for each reporting level +#' @param nx name of data table. Usuall country code and year in the form "CCC_YYYY" +#' +#' @return data.table with FGT estimates by reporting level +#' @keywords internal +DT_fgt_by_rl <- \(x, y, nx) { + DT_fgt <- lapply(names(y), \(rl) { + + idx <- y[[rl]] + w <- x[idx, welfare] + wt <- x[idx, weight] + RL <- compute_fgt(w = w, wt = wt, povlines = povline) + RL[, reporting_level := rl] + + }) |> + rbindlist(fill = TRUE) + DT_fgt[, `:=`( + country_code = gsub("([^_]+)(_.+)", "\\1", nx), + reporting_year = gsub("(.+_)([^_]+)", "\\2", nx) + )] +} + +#' map over list of data.tables and indices to compute FGT by reporting_level +#' +#' @param lt list of data.tables with welfare and weight data +#' @param l_rl_rows list of indeces +#' +#' @return data.table with all measured +#' @keywords internal +map_fgt <- \(lt, l_rl_rows) { + Map(DT_fgt_by_rl, lt, l_rl_rows, names(lt)) |> + rbindlist(fill = TRUE) +} + process_dt <- function(dt, povline, mean_and_med = FALSE) { dt[, compute_fgt_dt(.SD, "welfare", "weight", povline, mean_and_med), by = .(file, reporting_level)] diff --git a/R/fg_pip.R b/R/fg_pip.R index a2dc64a0..036c21c3 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -70,22 +70,17 @@ fg_pip <- function(country, # add_attributes_as_columns_vectorized(x) # }) - lt_att <- lapply(lt, \(.) { - list( - dist_stats = attr(., "dt_dist_stats"), - rl_rows = attr(., "reporting_level_rows") - ) - }) + # Extract some attributes + lt_att <- get_lt_attr(lt) + + # get rows indices + l_rl_rows <- get_rl_rows(lt_att) + # ZP Add: do fgt estimations using `res <- lapply(lt, process_dt, povline = povline)` #------------------------- - res <- lapply(lt, \(x) { - process_dt(x, - povline = povline, - mean_and_med = TRUE) - }) |> - rbindlist(fill = TRUE) + res <- map_fgt(lt, l_rl_rows) # ZP Add: join to metadata #------------------------- diff --git a/R/utils-pipdata.R b/R/utils-pipdata.R index 3459c1b9..45f65b24 100644 --- a/R/utils-pipdata.R +++ b/R/utils-pipdata.R @@ -293,3 +293,48 @@ assign_stat <- function(dt, lev, counts, stat, colname) { dt[, (colname) := rep.int(unname(v[map_idx]), counts)] invisible(dt) } + + + + + +#' extract rows indices +#' +#' @param a list with attributes from lt +#' +#' @return names list with indices for reporting level +#' @keywords internal +get_rl_rows_single <- function(a) { + rl <- a$rl_rows + rl_rows <- vector("list", length(rl$reporting_level)) + + start <- 1L + for (i in seq_along(rl$reporting_level)) { + end <- rl$rows[i] + rl_rows[[i]] <- start:end + start <- end + 1L + } + setNames(rl_rows, rl$reporting_level) +} + + +#' apply get_rl_rows_single +#' @rdname get_rl_rows_single +get_rl_rows <- \(lt_att) { + lapply(lt_att, get_rl_rows_single) +} + + + +#' Get some attributes from lt lis +#' +#' @param lt list +#' @keywords internal +get_lt_attr <- function(lt) { + lapply(lt, \(.) { + list( + dist_stats = attr(., "dt_dist_stats"), + rl_rows = attr(., "reporting_level_rows") + ) + }) +} From 0040957620aaa12fe483a193d9e272211d4feb6c Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Fri, 22 Aug 2025 22:24:23 -0400 Subject: [PATCH 060/203] optimize how to estimate FGT by reporting level --- R/fg_pip.R | 25 ++++++------------------- R/utils-pipdata.R | 14 ++++++++++++++ 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/R/fg_pip.R b/R/fg_pip.R index 036c21c3..b121b722 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -76,17 +76,14 @@ fg_pip <- function(country, # get rows indices l_rl_rows <- get_rl_rows(lt_att) + # get data.table with dist stats + dt_dist_stats <- get_dt_dist_stats(la_att) # ZP Add: do fgt estimations using `res <- lapply(lt, process_dt, povline = povline)` #------------------------- res <- map_fgt(lt, l_rl_rows) - # ZP Add: join to metadata - #------------------------- - metadata[, - file := basename(path)] - # try metadata unique code tmp_metadata <- metadata # Handle multiple distribution types (for aggregated distributions) @@ -112,25 +109,19 @@ fg_pip <- function(country, # Remove duplicate rows by reporting_year (keep only one row per # reporting_year) tmp_metadata_unique <- funique(tmp_metadata) - tmp_metadata_unique[, - file := paste0(country_code, - "_", - reporting_year)] + out <- join(res, tmp_metadata_unique, - on = c("file", + on = c("country_code", "reporting_year", "reporting_level"), how = "left", # ZP: change from full to left, # this rm nowcast years - i.e. years not included # as lineup years validate = "m:1", drop.dup.cols = TRUE, - verbose = 0) - - out[, `:=`( - file = NULL - )] + verbose = 0, + overid = 2) setnames(out, "povline", @@ -141,10 +132,6 @@ fg_pip <- function(country, use_new_lineup_version = lkup$use_new_lineup_version) - # Fix issue with rounding of poverty lines - out[, - poverty_line := round(poverty_line, digits = 3) ] - # Formatting. MUST be done in data.table tom modify by reference out[, path := as.character(path)] diff --git a/R/utils-pipdata.R b/R/utils-pipdata.R index 45f65b24..e3ee1f62 100644 --- a/R/utils-pipdata.R +++ b/R/utils-pipdata.R @@ -325,6 +325,20 @@ get_rl_rows <- \(lt_att) { } +#' get data.table with distribution stats +#' +#' this is a loop over lt attributes +#' +#' @param lt_att list of attributes of lt list +#' +#' @return data.table +#' @keywords internal +get_dt_dist_stats <- \(lt_att) { + lapply(lt_att, \(.) { + .$dist_stats + }) |> + rbindlist(fill = TRUE) +} #' Get some attributes from lt lis #' From f8b718ce7e419c33191c7a585c6ea29730509642 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Fri, 22 Aug 2025 22:28:29 -0400 Subject: [PATCH 061/203] merge dist stats --- R/fg_pip.R | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/R/fg_pip.R b/R/fg_pip.R index b121b722..0b6de622 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -77,12 +77,23 @@ fg_pip <- function(country, l_rl_rows <- get_rl_rows(lt_att) # get data.table with dist stats - dt_dist_stats <- get_dt_dist_stats(la_att) + dt_dist_stats <- get_dt_dist_stats(lt_att) # ZP Add: do fgt estimations using `res <- lapply(lt, process_dt, povline = povline)` #------------------------- - res <- map_fgt(lt, l_rl_rows) + fgt <- map_fgt(lt, l_rl_rows) + + # add dist stats + res <- join(fgt, dt_dist_stats, + on = c("country_code", "reporting_year", + "reporting_level"), + how = "left", + validate = "m:1", + drop.dup.cols = TRUE, + verbose = 0, + overid = 2) + # try metadata unique code tmp_metadata <- metadata @@ -132,7 +143,7 @@ fg_pip <- function(country, use_new_lineup_version = lkup$use_new_lineup_version) - # Formatting. MUST be done in data.table tom modify by reference + # Formatting. MUST be done in data.table to modify by reference out[, path := as.character(path)] if ("max_year" %in% names(out)) { From 4b564d07717caae5ef823cc04c2352e1ed0f45f0 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Fri, 22 Aug 2025 22:54:39 -0400 Subject: [PATCH 062/203] update --- R/fg_pip.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/fg_pip.R b/R/fg_pip.R index 0b6de622..df0987cd 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -94,9 +94,11 @@ fg_pip <- function(country, verbose = 0, overid = 2) + # convert reporting year to numeris + res[, reporting_year := as.numeric(reporting_year)] # try metadata unique code - tmp_metadata <- metadata + tmp_metadata <- copy(metadata) # I think we can avoid this inefficiency. # Handle multiple distribution types (for aggregated distributions) if (length(unique(tmp_metadata$distribution_type)) > 1) { tmp_metadata[, distribution_type := "mixed"] From e362307091e8b0f498f8ead815da78ec54ba9ff2 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Sat, 23 Aug 2025 10:34:49 -0400 Subject: [PATCH 063/203] create pipenv enviroment --- R/fg_pip.R | 34 ++++++++++++++++++++-------------- R/pip_new_lineups.R | 6 ++++++ 2 files changed, 26 insertions(+), 14 deletions(-) diff --git a/R/fg_pip.R b/R/fg_pip.R index df0987cd..86bdcebc 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -12,7 +12,8 @@ fg_pip <- function(country, welfare_type, reporting_level, ppp, - lkup) { + lkup, + pipenv = NULL) { valid_regions <- lkup$query_controls$region$values interpolation_list <- lkup$interpolation_list @@ -76,23 +77,28 @@ fg_pip <- function(country, # get rows indices l_rl_rows <- get_rl_rows(lt_att) - # get data.table with dist stats - dt_dist_stats <- get_dt_dist_stats(lt_att) + if (exists("pipenv", envir = .GlobalEnv)) { + # get data.table with dist stats + dt_dist_stats <- get_dt_dist_stats(lt_att) + # Assign them to pipenv to use later + pipenv <- get("pipenv", envir = .GlobalEnv) + rlang::env_poke(pipenv, "dt_dist_stats", dt_dist_stats) + } # ZP Add: do fgt estimations using `res <- lapply(lt, process_dt, povline = povline)` #------------------------- - fgt <- map_fgt(lt, l_rl_rows) - - # add dist stats - res <- join(fgt, dt_dist_stats, - on = c("country_code", "reporting_year", - "reporting_level"), - how = "left", - validate = "m:1", - drop.dup.cols = TRUE, - verbose = 0, - overid = 2) + res <- map_fgt(lt, l_rl_rows) + + # # add dist stats + # res <- join(fgt, dt_dist_stats, + # on = c("country_code", "reporting_year", + # "reporting_level"), + # how = "left", + # validate = "m:1", + # drop.dup.cols = TRUE, + # verbose = 0, + # overid = 2) # convert reporting year to numeris res[, reporting_year := as.numeric(reporting_year)] diff --git a/R/pip_new_lineups.R b/R/pip_new_lineups.R index aea166e5..cb51cd6b 100644 --- a/R/pip_new_lineups.R +++ b/R/pip_new_lineups.R @@ -76,6 +76,12 @@ pip_new_lineups <- function(country = "ALL", povline <- round(povline, digits = 3) + # Create temp environment + pipenv <- rlang::env() + assign("pipenv", pipenv, envir = .GlobalEnv) + on.exit(rm("pipenv", envir = .GlobalEnv), add = TRUE) + + # TEMPORARY UNTIL SELECTION MECHANISM IS BEING IMPROVED country <- toupper(country) From 6d637b489be317d9469f634ab0da2be5b14d6963 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Sat, 23 Aug 2025 10:46:09 -0400 Subject: [PATCH 064/203] remove mean and median from dickdb --- R/duckdb_func.R | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/R/duckdb_func.R b/R/duckdb_func.R index 663056c9..4758db15 100644 --- a/R/duckdb_func.R +++ b/R/duckdb_func.R @@ -217,8 +217,8 @@ update_master_file <- function(dat, keep_vars <- c( "interpolation_id", "poverty_line", - "mean", - "median", + + "headcount", "poverty_gap", "poverty_severity", @@ -233,8 +233,8 @@ update_master_file <- function(dat, "cache_id", "reporting_level", "poverty_line", - "mean", - "median", + + "headcount", "poverty_gap", "poverty_severity", @@ -339,8 +339,7 @@ create_duckdb_file <- function(cache_file_path) { cache_id VARCHAR, reporting_level VARCHAR, poverty_line DOUBLE, - mean DOUBLE, - median DOUBLE, + headcount DOUBLE, poverty_gap DOUBLE, poverty_severity DOUBLE, @@ -350,8 +349,7 @@ create_duckdb_file <- function(cache_file_path) { DBI::dbExecute(con, "CREATE OR REPLACE table fg_master_file ( interpolation_id VARCHAR, poverty_line DOUBLE, - mean DOUBLE, - median DOUBLE, + headcount DOUBLE, poverty_gap DOUBLE, poverty_severity DOUBLE, From 0315985517c04b045f65f03ef8a3f687b467d4b5 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Sat, 23 Aug 2025 11:14:19 -0400 Subject: [PATCH 065/203] add just mean and median --- R/fg_pip.R | 44 ++++++++++++++++++++++++++++++++++++-------- man/fg_pip.Rd | 3 ++- 2 files changed, 38 insertions(+), 9 deletions(-) diff --git a/R/fg_pip.R b/R/fg_pip.R index 86bdcebc..e4e575d7 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -77,18 +77,19 @@ fg_pip <- function(country, # get rows indices l_rl_rows <- get_rl_rows(lt_att) - if (exists("pipenv", envir = .GlobalEnv)) { - # get data.table with dist stats - dt_dist_stats <- get_dt_dist_stats(lt_att) - # Assign them to pipenv to use later - pipenv <- get("pipenv", envir = .GlobalEnv) - rlang::env_poke(pipenv, "dt_dist_stats", dt_dist_stats) - } + # if (exists("pipenv", envir = .GlobalEnv)) { + # # get data.table with dist stats + # dt_dist_stats <- get_dt_dist_stats(lt_att) + # # Assign them to pipenv to use later + # pipenv <- get("pipenv", envir = .GlobalEnv) + # rlang::env_poke(pipenv, "dt_dist_stats", dt_dist_stats) + # } # ZP Add: do fgt estimations using `res <- lapply(lt, process_dt, povline = povline)` #------------------------- - res <- map_fgt(lt, l_rl_rows) + fgt <- map_fgt(lt, l_rl_rows) |> + funique() # TO REMOVE # # add dist stats # res <- join(fgt, dt_dist_stats, @@ -103,6 +104,10 @@ fg_pip <- function(country, # convert reporting year to numeris res[, reporting_year := as.numeric(reporting_year)] + # Add just mean and median + res <- fg_get_mean_median(fgt, lkup) + + # try metadata unique code tmp_metadata <- copy(metadata) # I think we can avoid this inefficiency. # Handle multiple distribution types (for aggregated distributions) @@ -318,3 +323,26 @@ create_full_list <- function(country, year, refy_lkup, data_present_in_master) { full_list } + + + + +#' merge into fgt table the mean and median from dist stats table in lkup +#' +#' @param fgt data,table with fgt measures +#' @param lkup lkup +#' +#' @return data.table with with fgt, mean and median +#' @keywords internal +fg_get_mean_median <- \(fgt, lkup) { + joyn::joyn(x = fgt, + y = lkup$lineup_dist_stats[, + .(country_code, reporting_year, + reporting_level, mean, median)], + by = c('country_code', "reporting_year", "reporting_level"), + match_type = "1:1", + keep = "left", + reportvar = FALSE, + verbose = FALSE) +} + diff --git a/man/fg_pip.Rd b/man/fg_pip.Rd index 9012b01b..17faee02 100644 --- a/man/fg_pip.Rd +++ b/man/fg_pip.Rd @@ -12,7 +12,8 @@ fg_pip( welfare_type, reporting_level, ppp, - lkup + lkup, + pipenv = NULL ) } \arguments{ From f6bb8e4020670c23224ad574fae97c187422e7a9 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Sat, 23 Aug 2025 11:19:26 -0400 Subject: [PATCH 066/203] super efficient code now --- R/fg_pip.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fg_pip.R b/R/fg_pip.R index e4e575d7..2a4ddef3 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -102,7 +102,7 @@ fg_pip <- function(country, # overid = 2) # convert reporting year to numeris - res[, reporting_year := as.numeric(reporting_year)] + fgt[, reporting_year := as.numeric(reporting_year)] # Add just mean and median res <- fg_get_mean_median(fgt, lkup) From bf9f87f096f7f50bb43c0c918497b3c387a12c74 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Sat, 23 Aug 2025 11:20:31 -0400 Subject: [PATCH 067/203] remove idea of the pipenv --- R/fg_pip.R | 18 ------------------ R/pip_new_lineups.R | 7 ------- 2 files changed, 25 deletions(-) diff --git a/R/fg_pip.R b/R/fg_pip.R index 2a4ddef3..dad368bc 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -77,30 +77,12 @@ fg_pip <- function(country, # get rows indices l_rl_rows <- get_rl_rows(lt_att) - # if (exists("pipenv", envir = .GlobalEnv)) { - # # get data.table with dist stats - # dt_dist_stats <- get_dt_dist_stats(lt_att) - # # Assign them to pipenv to use later - # pipenv <- get("pipenv", envir = .GlobalEnv) - # rlang::env_poke(pipenv, "dt_dist_stats", dt_dist_stats) - # } - # ZP Add: do fgt estimations using `res <- lapply(lt, process_dt, povline = povline)` #------------------------- fgt <- map_fgt(lt, l_rl_rows) |> funique() # TO REMOVE - # # add dist stats - # res <- join(fgt, dt_dist_stats, - # on = c("country_code", "reporting_year", - # "reporting_level"), - # how = "left", - # validate = "m:1", - # drop.dup.cols = TRUE, - # verbose = 0, - # overid = 2) - # convert reporting year to numeris fgt[, reporting_year := as.numeric(reporting_year)] diff --git a/R/pip_new_lineups.R b/R/pip_new_lineups.R index cb51cd6b..5b255ec6 100644 --- a/R/pip_new_lineups.R +++ b/R/pip_new_lineups.R @@ -76,13 +76,6 @@ pip_new_lineups <- function(country = "ALL", povline <- round(povline, digits = 3) - # Create temp environment - pipenv <- rlang::env() - assign("pipenv", pipenv, envir = .GlobalEnv) - on.exit(rm("pipenv", envir = .GlobalEnv), add = TRUE) - - - # TEMPORARY UNTIL SELECTION MECHANISM IS BEING IMPROVED country <- toupper(country) if (is.character(year)) { From aea71b72b8915554ed6b5997dc6e33d5fb688c05 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Sat, 23 Aug 2025 11:49:14 -0400 Subject: [PATCH 068/203] make sure it saves mean and median depending on the type of data. --- R/duckdb_func.R | 12 ++++++++---- R/fg_pip.R | 2 +- man/DT_fgt_by_rl.Rd | 22 ++++++++++++++++++++++ man/fg_get_mean_median.Rd | 20 ++++++++++++++++++++ man/get_dt_dist_stats.Rd | 18 ++++++++++++++++++ man/get_lt_attr.Rd | 15 +++++++++++++++ man/get_rl_rows_single.Rd | 23 +++++++++++++++++++++++ man/map_fgt.Rd | 20 ++++++++++++++++++++ 8 files changed, 127 insertions(+), 5 deletions(-) create mode 100644 man/DT_fgt_by_rl.Rd create mode 100644 man/fg_get_mean_median.Rd create mode 100644 man/get_dt_dist_stats.Rd create mode 100644 man/get_lt_attr.Rd create mode 100644 man/get_rl_rows_single.Rd create mode 100644 man/map_fgt.Rd diff --git a/R/duckdb_func.R b/R/duckdb_func.R index 4758db15..5071e1ca 100644 --- a/R/duckdb_func.R +++ b/R/duckdb_func.R @@ -217,8 +217,6 @@ update_master_file <- function(dat, keep_vars <- c( "interpolation_id", "poverty_line", - - "headcount", "poverty_gap", "poverty_severity", @@ -233,8 +231,6 @@ update_master_file <- function(dat, "cache_id", "reporting_level", "poverty_line", - - "headcount", "poverty_gap", "poverty_severity", @@ -242,6 +238,14 @@ update_master_file <- function(dat, ) } + # Get column names from DuckDB table + table_info <- DBI::dbGetQuery(write_con, glue("PRAGMA table_info({target_file})")) + col_names <- table_info$name + # Add mean and median if present in table + if (all(c("mean", "median") %in% col_names)) { + keep_vars <- c(keep_vars, "mean", "median") + } + # Select variables dat <- dat[, ..keep_vars] diff --git a/R/fg_pip.R b/R/fg_pip.R index dad368bc..2ec21575 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -83,7 +83,7 @@ fg_pip <- function(country, fgt <- map_fgt(lt, l_rl_rows) |> funique() # TO REMOVE - # convert reporting year to numeris + # convert reporting year to numeric fgt[, reporting_year := as.numeric(reporting_year)] # Add just mean and median diff --git a/man/DT_fgt_by_rl.Rd b/man/DT_fgt_by_rl.Rd new file mode 100644 index 00000000..72663294 --- /dev/null +++ b/man/DT_fgt_by_rl.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/compute_fgt_new.R +\name{DT_fgt_by_rl} +\alias{DT_fgt_by_rl} +\title{compute FGT using indices by reporting level} +\usage{ +DT_fgt_by_rl(x, y, nx) +} +\arguments{ +\item{x}{data.table from lt list, with welfare and weight vectors} + +\item{y}{list of indices for each reporting level} + +\item{nx}{name of data table. Usuall country code and year in the form "CCC_YYYY"} +} +\value{ +data.table with FGT estimates by reporting level +} +\description{ +This function is intended to be used inside \link{map_fgt} +} +\keyword{internal} diff --git a/man/fg_get_mean_median.Rd b/man/fg_get_mean_median.Rd new file mode 100644 index 00000000..88667176 --- /dev/null +++ b/man/fg_get_mean_median.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fg_pip.R +\name{fg_get_mean_median} +\alias{fg_get_mean_median} +\title{merge into fgt table the mean and median from dist stats table in lkup} +\usage{ +fg_get_mean_median(fgt, lkup) +} +\arguments{ +\item{fgt}{data,table with fgt measures} + +\item{lkup}{lkup} +} +\value{ +data.table with with fgt, mean and median +} +\description{ +merge into fgt table the mean and median from dist stats table in lkup +} +\keyword{internal} diff --git a/man/get_dt_dist_stats.Rd b/man/get_dt_dist_stats.Rd new file mode 100644 index 00000000..f59e6b9f --- /dev/null +++ b/man/get_dt_dist_stats.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-pipdata.R +\name{get_dt_dist_stats} +\alias{get_dt_dist_stats} +\title{get data.table with distribution stats} +\usage{ +get_dt_dist_stats(lt_att) +} +\arguments{ +\item{lt_att}{list of attributes of lt list} +} +\value{ +data.table +} +\description{ +this is a loop over lt attributes +} +\keyword{internal} diff --git a/man/get_lt_attr.Rd b/man/get_lt_attr.Rd new file mode 100644 index 00000000..303808ab --- /dev/null +++ b/man/get_lt_attr.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-pipdata.R +\name{get_lt_attr} +\alias{get_lt_attr} +\title{Get some attributes from lt lis} +\usage{ +get_lt_attr(lt) +} +\arguments{ +\item{lt}{list} +} +\description{ +Get some attributes from lt lis +} +\keyword{internal} diff --git a/man/get_rl_rows_single.Rd b/man/get_rl_rows_single.Rd new file mode 100644 index 00000000..e11afce9 --- /dev/null +++ b/man/get_rl_rows_single.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-pipdata.R +\name{get_rl_rows_single} +\alias{get_rl_rows_single} +\alias{get_rl_rows} +\title{extract rows indices} +\usage{ +get_rl_rows_single(a) + +get_rl_rows(lt_att) +} +\arguments{ +\item{a}{list with attributes from lt} +} +\value{ +names list with indices for reporting level +} +\description{ +extract rows indices + +apply get_rl_rows_single +} +\keyword{internal} diff --git a/man/map_fgt.Rd b/man/map_fgt.Rd new file mode 100644 index 00000000..a192f3f2 --- /dev/null +++ b/man/map_fgt.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/compute_fgt_new.R +\name{map_fgt} +\alias{map_fgt} +\title{map over list of data.tables and indices to compute FGT by reporting_level} +\usage{ +map_fgt(lt, l_rl_rows) +} +\arguments{ +\item{lt}{list of data.tables with welfare and weight data} + +\item{l_rl_rows}{list of indeces} +} +\value{ +data.table with all measured +} +\description{ +map over list of data.tables and indices to compute FGT by reporting_level +} +\keyword{internal} From 84ab37fd44ec6fcd47cddf74473b5861690cb5a8 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Sat, 23 Aug 2025 12:22:42 -0400 Subject: [PATCH 069/203] update --- R/fg_pip.R | 2 +- R/utils.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/fg_pip.R b/R/fg_pip.R index 2ec21575..07f0e9df 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -322,7 +322,7 @@ fg_get_mean_median <- \(fgt, lkup) { .(country_code, reporting_year, reporting_level, mean, median)], by = c('country_code', "reporting_year", "reporting_level"), - match_type = "1:1", + match_type = "m:1", # multiple povlines keep = "left", reportvar = FALSE, verbose = FALSE) diff --git a/R/utils.R b/R/utils.R index 65f2df74..33969942 100644 --- a/R/utils.R +++ b/R/utils.R @@ -304,7 +304,7 @@ add_dist_stats <- function(df, lkup, fill_gaps) { by = c("country_code", "reporting_level", "reporting_year"), - match_type = "1:1", + match_type = "m:1", # multiple poverty lines keep_common_vars = FALSE, reportvar = FALSE, verbose = FALSE, From 851b83c7d53548c2a36d534667c64ecad934bac5 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Sat, 23 Aug 2025 12:46:24 -0400 Subject: [PATCH 070/203] init new strcuture for aggregates --- NAMESPACE | 1 + R/pip_agg.R | 54 ++++++++++++++++++++++++++++ R/pip_grp.R | 6 ++-- R/pip_grp_logic.R | 6 ++-- R/pip_grp_new.R | 14 ++++++++ R/utils-pipdata.R | 3 +- inst/plumber/v1/endpoints.R | 2 +- man/{pip_grp_logic.Rd => pip_agg.Rd} | 25 +++++++++++-- man/pip_gg.Rd | 22 ++++++++++++ 9 files changed, 123 insertions(+), 10 deletions(-) create mode 100644 R/pip_agg.R create mode 100644 R/pip_grp_new.R rename man/{pip_grp_logic.Rd => pip_agg.Rd} (70%) create mode 100644 man/pip_gg.Rd diff --git a/NAMESPACE b/NAMESPACE index 2ee13c9d..133f86b1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -28,6 +28,7 @@ export(is_empty) export(is_forked) export(load_inter_cache) export(pip) +export(pip_agg) export(pip_grp) export(pip_grp_logic) export(pip_new_lineups) diff --git a/R/pip_agg.R b/R/pip_agg.R new file mode 100644 index 00000000..5c4421dc --- /dev/null +++ b/R/pip_agg.R @@ -0,0 +1,54 @@ +#' Logic for computing new aggregate +#' +#' @inheritParams pip +#' @return data.table +#' @examples +#' \dontrun{ +#' # Create lkups +#' } +#' @export +pip_agg <- function(country = "ALL", + year = "ALL", + povline = 1.9, + group_by = c("wb", "none"), + welfare_type = c("all", "consumption", "income"), + reporting_level = c("all", "national"), + lkup, + censor = TRUE, + lkup_hash = lkup$cache_data_id$hash_pip_grp, + additional_ind = FALSE) { + + # Should pip_old or pip_new be used? + #------------------------------------- + use_new <- lkup$use_new_lineup_version + + # Run correct function + #------------------------------------- + out <- if (use_new) { + pip_grp_new(country = country, + year = year, + povline = povline, + group_by = group_by, + welfare_type = welfare_type, + reporting_level = reporting_level, + lkup = lkup, + censor = censor, + lkup_hash = lkup_hash, + additional_ind = additional_ind) + } else { + pip_grp_logic(country = country, + year = year, + povline = povline, + group_by = group_by, + welfare_type = welfare_type, + reporting_level = reporting_level, + lkup = lkup, + censor = censor, + lkup_hash = lkup_hash, + additional_ind = additional_ind) + } + + # Return + #------------------------------------- + out +} diff --git a/R/pip_grp.R b/R/pip_grp.R index 8fc68331..91037580 100644 --- a/R/pip_grp.R +++ b/R/pip_grp.R @@ -50,7 +50,7 @@ pip_grp <- function(country = "ALL", reporting_level <- "national" } - out <- fg_pip( + out <- fg_pip_old( country = country, year = year, povline = povline, @@ -208,7 +208,7 @@ pip_aggregate <- function(df, by = NULL, return_cols) { } #' Aggregate by predefined groups -#' @param df data.frame: Response from `fg_pip()` or `rg_pip()`. +#' @param df data.frame: Response from `fg_pip_old()` or `rg_pip()`. #' @param group_lkup data.frame: Group lkup table (pop_region) #' @param country character: Selected countries / regions #' @param return_cols list: lkup$return_cols$pip_grp object. Controls returned @@ -308,7 +308,7 @@ compute_world_aggregates <- function(rgn, cols) { #' Filter relevant rows for aggregating by predefined groups -#' @param df data.frame: Response from `fg_pip()` +#' @param df data.frame: Response from `fg_pip_old()` #' @noRd filter_for_aggregate_by <- function(df) { # This algorithm is incorrect, but should mostly work as a first iteration diff --git a/R/pip_grp_logic.R b/R/pip_grp_logic.R index 9cc9bc34..5503b4c7 100644 --- a/R/pip_grp_logic.R +++ b/R/pip_grp_logic.R @@ -1,6 +1,6 @@ -#' Logic for computing new aggregate +#' Old way to estimate aggregate data +#' @rdname pip_agg #' -#' @inheritParams pip #' @return data.table #' @examples #' \dontrun{ @@ -73,7 +73,7 @@ pip_grp_logic <- function(country = "ALL", ## STEP 3.2: Compute fg_pip for ALL required countries ---- ## This will then be re-used in various part of the function ## This is to avoid re-computing and re-loading the same data over and over - fg_pip_master <- fg_pip( + fg_pip_master <- fg_pip_old( country = c(lcv$md_off_reg, lcv$user_off_reg), year = year, povline = povline, diff --git a/R/pip_grp_new.R b/R/pip_grp_new.R new file mode 100644 index 00000000..c4a2045e --- /dev/null +++ b/R/pip_grp_new.R @@ -0,0 +1,14 @@ +#' New way to estimate Aggregate data +#' @rdname pip_gg +pip_grp_new <- \(country = "ALL", + year = "ALL", + povline = 1.9, + group_by = c("wb", "none"), + welfare_type = c("all", "consumption", "income"), + reporting_level = c("all", "national"), + lkup, + censor = TRUE, + lkup_hash = lkup$cache_data_id$hash_pip_grp, + additional_ind = FALSE) { + +} diff --git a/R/utils-pipdata.R b/R/utils-pipdata.R index e3ee1f62..d430d49b 100644 --- a/R/utils-pipdata.R +++ b/R/utils-pipdata.R @@ -150,7 +150,8 @@ add_attributes_as_columns_vectorized <- function(dt) { ds <- attr(dt, "dist_stats") - # AC: I think this is not necessary at this stage. Can we merge at a later point? + # This block processes distribution statistics (mean, median) for each reporting level. + # If this is not required at this stage, consider removing it or deferring it to a later step. if (length(ds)) { dstats <- c("mean", "median") for (l in lev) { diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R index 626f4b6a..77c83c07 100644 --- a/inst/plumber/v1/endpoints.R +++ b/inst/plumber/v1/endpoints.R @@ -282,7 +282,7 @@ function(req, res) { params$format <- NULL params$version <- NULL - out <- do.call(pipapi::pip_grp_logic, params) + out <- do.call(pipapi::pip_agg, params) out } diff --git a/man/pip_grp_logic.Rd b/man/pip_agg.Rd similarity index 70% rename from man/pip_grp_logic.Rd rename to man/pip_agg.Rd index a7d861f1..9c197ecb 100644 --- a/man/pip_grp_logic.Rd +++ b/man/pip_agg.Rd @@ -1,9 +1,23 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pip_grp_logic.R -\name{pip_grp_logic} +% Please edit documentation in R/pip_agg.R, R/pip_grp_logic.R +\name{pip_agg} +\alias{pip_agg} \alias{pip_grp_logic} \title{Logic for computing new aggregate} \usage{ +pip_agg( + country = "ALL", + year = "ALL", + povline = 1.9, + group_by = c("wb", "none"), + welfare_type = c("all", "consumption", "income"), + reporting_level = c("all", "national"), + lkup, + censor = TRUE, + lkup_hash = lkup$cache_data_id$hash_pip_grp, + additional_ind = FALSE +) + pip_grp_logic( country = "ALL", year = "ALL", @@ -41,13 +55,20 @@ sub-groups} FALSE} } \value{ +data.table + data.table } \description{ Logic for computing new aggregate + +Old way to estimate aggregate data } \examples{ \dontrun{ # Create lkups } +\dontrun{ +# Create lkups +} } diff --git a/man/pip_gg.Rd b/man/pip_gg.Rd new file mode 100644 index 00000000..ce32bdd5 --- /dev/null +++ b/man/pip_gg.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pip_grp_new.R +\name{pip_grp_new} +\alias{pip_grp_new} +\title{New way to estimate Aggregate data} +\usage{ +pip_grp_new( + country = "ALL", + year = "ALL", + povline = 1.9, + group_by = c("wb", "none"), + welfare_type = c("all", "consumption", "income"), + reporting_level = c("all", "national"), + lkup, + censor = TRUE, + lkup_hash = lkup$cache_data_id$hash_pip_grp, + additional_ind = FALSE +) +} +\description{ +New way to estimate Aggregate data +} From eb796c1e1048c5f0525d6548f03a0885ee122a2b Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Sat, 23 Aug 2025 12:53:37 -0400 Subject: [PATCH 071/203] small progress. --- R/pip_agg.R | 2 -- R/pip_grp_new.R | 15 +++++++++++++-- man/pip_gg.Rd | 2 -- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/R/pip_agg.R b/R/pip_agg.R index 5c4421dc..52b18bcd 100644 --- a/R/pip_agg.R +++ b/R/pip_agg.R @@ -28,12 +28,10 @@ pip_agg <- function(country = "ALL", pip_grp_new(country = country, year = year, povline = povline, - group_by = group_by, welfare_type = welfare_type, reporting_level = reporting_level, lkup = lkup, censor = censor, - lkup_hash = lkup_hash, additional_ind = additional_ind) } else { pip_grp_logic(country = country, diff --git a/R/pip_grp_new.R b/R/pip_grp_new.R index c4a2045e..8b76c1b7 100644 --- a/R/pip_grp_new.R +++ b/R/pip_grp_new.R @@ -3,12 +3,23 @@ pip_grp_new <- \(country = "ALL", year = "ALL", povline = 1.9, - group_by = c("wb", "none"), welfare_type = c("all", "consumption", "income"), reporting_level = c("all", "national"), lkup, censor = TRUE, - lkup_hash = lkup$cache_data_id$hash_pip_grp, additional_ind = FALSE) { + welfare_type <- match.arg(welfare_type) + reporting_level <- match.arg(reporting_level) + + # Custom aggregations only supported at the national level + # subgroups aggregations only supported for "all" countries + country <- toupper(country) + year <- toupper(year) + reporting_level <- "all" + if (!all(country %in% c("ALL", lkup$query_controls$region$values))) { + country <- "ALL" + } + + } diff --git a/man/pip_gg.Rd b/man/pip_gg.Rd index ce32bdd5..285bcec0 100644 --- a/man/pip_gg.Rd +++ b/man/pip_gg.Rd @@ -8,12 +8,10 @@ pip_grp_new( country = "ALL", year = "ALL", povline = 1.9, - group_by = c("wb", "none"), welfare_type = c("all", "consumption", "income"), reporting_level = c("all", "national"), lkup, censor = TRUE, - lkup_hash = lkup$cache_data_id$hash_pip_grp, additional_ind = FALSE ) } From 1f6ea6539e9875548117eee8baa71015e7e2f46c Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Mon, 25 Aug 2025 12:08:49 -0400 Subject: [PATCH 072/203] init function to subset country codes --- R/pip_grp_new.R | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/R/pip_grp_new.R b/R/pip_grp_new.R index 8b76c1b7..4b6e0a2b 100644 --- a/R/pip_grp_new.R +++ b/R/pip_grp_new.R @@ -23,3 +23,25 @@ pip_grp_new <- \(country = "ALL", } + + + +get_country_code_subset <- function(dt, country) { + # Find all *_code columns except 'country_code' + code_cols <- setdiff(grep("_code$", names(dt), value = TRUE), "country_code") + + # Initialize result vector + result <- character(0) + + # For each code column, check for matches and collect country_code + for (col in code_cols) { + # Find rows where the code column matches any value in country + idx <- dt[[col]] %in% country + if (any(idx, na.rm = TRUE)) { + result <- c(result, dt[idx, country_code]) + } + } + + # Return unique country_code values + funique(result) +} \ No newline at end of file From 0ed1bf417cfdc3c6a4f59e1d0438fd0d86132734 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Mon, 25 Aug 2025 12:34:29 -0400 Subject: [PATCH 073/203] document new get_country_code_subset function --- R/pip_grp_new.R | 34 +++++++++++++++++++++++++++++----- man/get_country_code_subset.Rd | 28 ++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 5 deletions(-) create mode 100644 man/get_country_code_subset.Rd diff --git a/R/pip_grp_new.R b/R/pip_grp_new.R index 4b6e0a2b..8baff230 100644 --- a/R/pip_grp_new.R +++ b/R/pip_grp_new.R @@ -21,18 +21,42 @@ pip_grp_new <- \(country = "ALL", country <- "ALL" } + # Select countries to estimate poverty + cts <- copy(lkup$aux_files$country_list) + country_code <- if (country != "ALL") { + get_country_code_subset(dt = cts, country = country) + } else { + "ALL" + } -} +} + +#' Subset country_code values based on matches in *_code columns +#' +#' This function searches all columns in a data.table ending with '_code' (except 'country_code') +#' and returns a unique character vector of 'country_code' values for rows where any of those +#' columns match a value in the provided 'country' vector. The input data.table 'dt' should be +#' 'lkup$aux_files$country_list', which contains country and region codes for subsetting. +#' +#' @param dt A data.table, typically lkup$aux_files$country_list, containing country_code and other *_code columns. +#' @param country Character vector of country or region codes to match against *_code columns. +#' +#' @return A unique character vector of country_code values corresponding to matches in any *_code column. +#' @examples +#' \dontrun{ +#' dt <- lkup$aux_files$country_list +#' get_country_code_subset(dt, c("USA", "EAP")) +#' } get_country_code_subset <- function(dt, country) { # Find all *_code columns except 'country_code' code_cols <- setdiff(grep("_code$", names(dt), value = TRUE), "country_code") - + # Initialize result vector result <- character(0) - + # For each code column, check for matches and collect country_code for (col in code_cols) { # Find rows where the code column matches any value in country @@ -41,7 +65,7 @@ get_country_code_subset <- function(dt, country) { result <- c(result, dt[idx, country_code]) } } - + # Return unique country_code values funique(result) -} \ No newline at end of file +} diff --git a/man/get_country_code_subset.Rd b/man/get_country_code_subset.Rd new file mode 100644 index 00000000..60d3be69 --- /dev/null +++ b/man/get_country_code_subset.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pip_grp_new.R +\name{get_country_code_subset} +\alias{get_country_code_subset} +\title{Subset country_code values based on matches in *_code columns} +\usage{ +get_country_code_subset(dt, country) +} +\arguments{ +\item{dt}{A data.table, typically lkup$aux_files$country_list, containing country_code and other *_code columns.} + +\item{country}{Character vector of country or region codes to match against *_code columns.} +} +\value{ +A unique character vector of country_code values corresponding to matches in any *_code column. +} +\description{ +This function searches all columns in a data.table ending with '_code' (except 'country_code') +and returns a unique character vector of 'country_code' values for rows where any of those +columns match a value in the provided 'country' vector. The input data.table 'dt' should be +'lkup$aux_files$country_list', which contains country and region codes for subsetting. +} +\examples{ +\dontrun{ +dt <- lkup$aux_files$country_list +get_country_code_subset(dt, c("USA", "EAP")) +} +} From 72de2ca11e2401522db05bb4930d1b630c120d0f Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Mon, 25 Aug 2025 12:55:55 -0400 Subject: [PATCH 074/203] add list_code_column_values --- R/pip_grp_new.R | 61 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 15 deletions(-) diff --git a/R/pip_grp_new.R b/R/pip_grp_new.R index 8baff230..cafe746c 100644 --- a/R/pip_grp_new.R +++ b/R/pip_grp_new.R @@ -34,38 +34,69 @@ pip_grp_new <- \(country = "ALL", } -#' Subset country_code values based on matches in *_code columns +#' Subset country_code values based on matches in *_code columns and country_code #' -#' This function searches all columns in a data.table ending with '_code' (except 'country_code') -#' and returns a unique character vector of 'country_code' values for rows where any of those -#' columns match a value in the provided 'country' vector. The input data.table 'dt' should be -#' 'lkup$aux_files$country_list', which contains country and region codes for subsetting. +#' This function searches all columns in a data.table ending with '_code' (except 'country_code'), +#' as well as 'country_code' itself, and returns a unique character vector of 'country_code' values +#' for rows where any of those columns match a value in the provided 'country' vector. If any value +#' in 'country' is not found in any *_code column or in 'country_code', an error is thrown. The input +#' data.table 'dt' should be 'lkup$aux_files$country_list', which contains country and region codes for subsetting. #' #' @param dt A data.table, typically lkup$aux_files$country_list, containing country_code and other *_code columns. -#' @param country Character vector of country or region codes to match against *_code columns. +#' @param country Character vector of country or region codes to match against *_code columns and country_code. #' -#' @return A unique character vector of country_code values corresponding to matches in any *_code column. +#' @return A unique character vector of country_code values corresponding to matches in any *_code column or country_code. #' @examples #' \dontrun{ #' dt <- lkup$aux_files$country_list #' get_country_code_subset(dt, c("USA", "EAP")) #' } get_country_code_subset <- function(dt, country) { - # Find all *_code columns except 'country_code' - code_cols <- setdiff(grep("_code$", names(dt), value = TRUE), "country_code") - - # Initialize result vector + code_cols <- grep("_code$", names(dt), value = TRUE) result <- character(0) + matched <- logical(length(country)) - # For each code column, check for matches and collect country_code for (col in code_cols) { - # Find rows where the code column matches any value in country idx <- dt[[col]] %in% country if (any(idx, na.rm = TRUE)) { result <- c(result, dt[idx, country_code]) + matched <- matched | country %in% dt[[col]] } } - - # Return unique country_code values + # Also check country_code itself + idx_cc <- dt$country_code %in% country + if (any(idx_cc, na.rm = TRUE)) { + result <- c(result, dt[idx_cc, country_code]) + matched <- matched | country %in% dt$country_code + } + # Error if any country not matched + if (any(!matched)) { + cli::cli_abort( + "The following values in {.arg country} were not found in any *_code column or country_code: + {country[!matched]}") + } funique(result) } + +#' List values in each *_code column that match the country vector +#' +#' Returns a named list where each element is the vector of unique values in each *_code column +#' that are present in the provided 'country' vector. +#' +#' @param dt A data.table, typically lkup$aux_files$country_list. +#' @param country Character vector of country or region codes to match against *_code columns. +#' +#' @return A named list of unique values for each *_code column that match 'country'. +#' @examples +#' \dontrun{ +#' dt <- lkup$aux_files$country_list +#' list_code_column_values(dt, c("USA", "EAP")) +#' } +list_code_column_values <- function(dt, country) { + code_cols <- grep("_code$", names(dt), value = TRUE) + lapply(code_cols, \(col) { + dt[get(col) %in% country, ..col] |> + funique() + }) |> + setNames(code_cols) +} From e9bfeb39926404b454facb39807a3642ce43c3c3 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Mon, 25 Aug 2025 12:56:11 -0400 Subject: [PATCH 075/203] document --- man/get_country_code_subset.Rd | 15 ++++++++------- man/list_code_column_values.Rd | 26 ++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 7 deletions(-) create mode 100644 man/list_code_column_values.Rd diff --git a/man/get_country_code_subset.Rd b/man/get_country_code_subset.Rd index 60d3be69..a1acd4dc 100644 --- a/man/get_country_code_subset.Rd +++ b/man/get_country_code_subset.Rd @@ -2,23 +2,24 @@ % Please edit documentation in R/pip_grp_new.R \name{get_country_code_subset} \alias{get_country_code_subset} -\title{Subset country_code values based on matches in *_code columns} +\title{Subset country_code values based on matches in *_code columns and country_code} \usage{ get_country_code_subset(dt, country) } \arguments{ \item{dt}{A data.table, typically lkup$aux_files$country_list, containing country_code and other *_code columns.} -\item{country}{Character vector of country or region codes to match against *_code columns.} +\item{country}{Character vector of country or region codes to match against *_code columns and country_code.} } \value{ -A unique character vector of country_code values corresponding to matches in any *_code column. +A unique character vector of country_code values corresponding to matches in any *_code column or country_code. } \description{ -This function searches all columns in a data.table ending with '_code' (except 'country_code') -and returns a unique character vector of 'country_code' values for rows where any of those -columns match a value in the provided 'country' vector. The input data.table 'dt' should be -'lkup$aux_files$country_list', which contains country and region codes for subsetting. +This function searches all columns in a data.table ending with '_code' (except 'country_code'), +as well as 'country_code' itself, and returns a unique character vector of 'country_code' values +for rows where any of those columns match a value in the provided 'country' vector. If any value +in 'country' is not found in any *_code column or in 'country_code', an error is thrown. The input +data.table 'dt' should be 'lkup$aux_files$country_list', which contains country and region codes for subsetting. } \examples{ \dontrun{ diff --git a/man/list_code_column_values.Rd b/man/list_code_column_values.Rd new file mode 100644 index 00000000..2b285c88 --- /dev/null +++ b/man/list_code_column_values.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pip_grp_new.R +\name{list_code_column_values} +\alias{list_code_column_values} +\title{List values in each *_code column that match the country vector} +\usage{ +list_code_column_values(dt, country) +} +\arguments{ +\item{dt}{A data.table, typically lkup$aux_files$country_list.} + +\item{country}{Character vector of country or region codes to match against *_code columns.} +} +\value{ +A named list of unique values for each *_code column that match 'country'. +} +\description{ +Returns a named list where each element is the vector of unique values in each *_code column +that are present in the provided 'country' vector. +} +\examples{ +\dontrun{ +dt <- lkup$aux_files$country_list +list_code_column_values(dt, c("USA", "EAP")) +} +} From 55feea82c7b7d203877fb00889fec9204c15474d Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Mon, 25 Aug 2025 16:21:34 -0400 Subject: [PATCH 076/203] add pip_agg instead of pip_grp_logic --- R/pip_grp.R | 2 +- R/pip_grp_new.R | 46 +++++++++++++++++++++ R/pip_new_lineups.R | 85 ++++++++++++++++++++++----------------- R/ui_poverty_indicators.R | 2 +- R/zzz.R | 2 + man/pip_aggregate_by.Rd | 22 ++++++++++ 6 files changed, 119 insertions(+), 40 deletions(-) create mode 100644 man/pip_aggregate_by.Rd diff --git a/R/pip_grp.R b/R/pip_grp.R index 91037580..b8b9abdc 100644 --- a/R/pip_grp.R +++ b/R/pip_grp.R @@ -213,7 +213,7 @@ pip_aggregate <- function(df, by = NULL, return_cols) { #' @param country character: Selected countries / regions #' @param return_cols list: lkup$return_cols$pip_grp object. Controls returned #' columns -#' @noRd +#' @keywords internal pip_aggregate_by <- function(df, group_lkup, country = "ALL", diff --git a/R/pip_grp_new.R b/R/pip_grp_new.R index cafe746c..1dcf557a 100644 --- a/R/pip_grp_new.R +++ b/R/pip_grp_new.R @@ -30,6 +30,52 @@ pip_grp_new <- \(country = "ALL", } + out <- fg_pip( + country = country_code, + year = year, + povline = povline, + popshare = NULL, + welfare_type = welfare_type, + reporting_level = reporting_level, + ppp = NULL, + lkup = lkup) + + out <- treat_cache_and_main(out) + + # return empty dataframe if no metadata is found + if (nrow(out) == 0) { + return(pipapi::empty_response_grp) + } + + # Handles aggregated distributions (like CHN and IND) + if (tolower(reporting_level) %in% c("national", "all")) { + out <- add_agg_stats(out, + return_cols = lkup$return_cols$ag_average_poverty_stats) + } + + add_vars_out_of_pipeline(out, fill_gaps = TRUE, lkup = lkup) + + # Handle potential (insignificant) difference in poverty_line values that + # may mess-up the grouping + # I don't think we need this out$poverty_line already has the correct values additionally, + # since povline is vectorized the below line does not work as expected + #out$poverty_line <- povline + + # Handle aggregations with sub-groups + + out <- pip_aggregate_by( + df = out, + group_lkup = lkup[["pop_region"]], + country = country, + return_cols = lkup$return_cols$pip_grp + ) + + out <- estimate_type_var(out,lkup) + + # Censor regional values + if (censor) { + out <- censor_rows(out, lkup[["censored"]], type = "regions") + } } diff --git a/R/pip_new_lineups.R b/R/pip_new_lineups.R index 5b255ec6..04d70acb 100644 --- a/R/pip_new_lineups.R +++ b/R/pip_new_lineups.R @@ -141,45 +141,8 @@ pip_new_lineups <- function(country = "ALL", # Cache new data #--------------------------------------------- - cached_data <- if (is.null(out$data_in_cache)) { - NULL - } else if (is.data.frame(out$data_in_cache)) { - if (fnrow(out$data_in_cache) == 0) { - NULL - } else { - qDT(out$data_in_cache) - } - } else { - cli::cli_abort("{.code out$data_in_cache} must be NULL or data.frame not {.field {class(out$data_in_cache)}}") - } - - main_data <- qDT(out$main_data) - - if (nrow(main_data) > 0) { - - if (is.null(out$data_in_cache)) { - - out <- main_data - } else { + out <- treat_cache_and_main(out) - if (fill_gaps) { - - cached_data <- fg_remove_duplicates(cached_data, - use_new_lineup_version = lkup$use_new_lineup_version) - } - - out <- main_data |> - rowbind(cached_data) - } - - update_master_file(main_data, cache_file_path, fill_gaps) - - } else { - out <- cached_data - } - if (!is.data.table(out)) { - setDT(out) - } # Early return for empty table--------------- if (nrow(out) == 0) return(pipapi::empty_response) @@ -310,3 +273,49 @@ pip_new_lineups <- function(country = "ALL", # return ------------- return(out) } + + + + + +treat_cache_and_main <- \(out) { + cached_data <- if (is.null(out$data_in_cache)) { + NULL + } else if (is.data.frame(out$data_in_cache)) { + if (fnrow(out$data_in_cache) == 0) { + NULL + } else { + qDT(out$data_in_cache) + } + } else { + cli::cli_abort("{.code out$data_in_cache} must be NULL or data.frame not {.field {class(out$data_in_cache)}}") + } + + main_data <- qDT(out$main_data) + + if (nrow(main_data) > 0) { + + if (is.null(out$data_in_cache)) { + + out <- main_data + } else { + + if (fill_gaps) { + + cached_data <- fg_remove_duplicates(cached_data, + use_new_lineup_version = lkup$use_new_lineup_version) + } + + out <- main_data |> + rowbind(cached_data) + } + + update_master_file(main_data, cache_file_path, fill_gaps) + + } else { + out <- cached_data + } + + + setDT(out) +} diff --git a/R/ui_poverty_indicators.R b/R/ui_poverty_indicators.R index 2588da6b..f2758a13 100644 --- a/R/ui_poverty_indicators.R +++ b/R/ui_poverty_indicators.R @@ -75,7 +75,7 @@ ui_pc_regional <- function(country = "ALL", year <- toupper(year) } - out <- pip_grp_logic(country = country, + out <- pip_agg(country = country, year = year, group_by = "wb", reporting_level = "national", diff --git a/R/zzz.R b/R/zzz.R index e534cced..576b58a4 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -19,6 +19,8 @@ pipapi_default_options <- list( pip <<- memoise::memoise(pip, cache = cd, omit_args = "lkup") ui_hp_stacked <<- memoise::memoise(ui_hp_stacked, cache = cd, omit_args = "lkup") + pip_agg <<- memoise::memoise(pip_agg, cache = cd, omit_args = "lkup") + pip_grp_new <<- memoise::memoise(pip_grp_new, cache = cd, omit_args = "lkup") pip_grp_logic <<- memoise::memoise(pip_grp_logic, cache = cd, omit_args = "lkup") pip_grp <<- memoise::memoise(pip_grp, cache = cd, omit_args = "lkup") ui_cp_charts <<- memoise::memoise(ui_cp_charts, cache = cd, omit_args = "lkup") diff --git a/man/pip_aggregate_by.Rd b/man/pip_aggregate_by.Rd new file mode 100644 index 00000000..98bf18f7 --- /dev/null +++ b/man/pip_aggregate_by.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pip_grp.R +\name{pip_aggregate_by} +\alias{pip_aggregate_by} +\title{Aggregate by predefined groups} +\usage{ +pip_aggregate_by(df, group_lkup, country = "ALL", return_cols) +} +\arguments{ +\item{df}{data.frame: Response from \code{fg_pip_old()} or \code{rg_pip()}.} + +\item{group_lkup}{data.frame: Group lkup table (pop_region)} + +\item{country}{character: Selected countries / regions} + +\item{return_cols}{list: lkup$return_cols$pip_grp object. Controls returned +columns} +} +\description{ +Aggregate by predefined groups +} +\keyword{internal} From 8ef8492e79516d7efbef1b9fc389954a4a2c4223 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Mon, 25 Aug 2025 16:32:43 -0400 Subject: [PATCH 077/203] format --- R/zzz.R | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/R/zzz.R b/R/zzz.R index 576b58a4..9bb99af6 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -17,13 +17,13 @@ pipapi_default_options <- list( max_size = as.numeric(Sys.getenv("PIPAPI_CACHE_MAX_SIZE")), prune_rate = 50) - pip <<- memoise::memoise(pip, cache = cd, omit_args = "lkup") - ui_hp_stacked <<- memoise::memoise(ui_hp_stacked, cache = cd, omit_args = "lkup") - pip_agg <<- memoise::memoise(pip_agg, cache = cd, omit_args = "lkup") - pip_grp_new <<- memoise::memoise(pip_grp_new, cache = cd, omit_args = "lkup") - pip_grp_logic <<- memoise::memoise(pip_grp_logic, cache = cd, omit_args = "lkup") - pip_grp <<- memoise::memoise(pip_grp, cache = cd, omit_args = "lkup") - ui_cp_charts <<- memoise::memoise(ui_cp_charts, cache = cd, omit_args = "lkup") + pip <<- memoise::memoise(pip, cache = cd, omit_args = "lkup") + ui_hp_stacked <<- memoise::memoise(ui_hp_stacked, cache = cd, omit_args = "lkup") + pip_agg <<- memoise::memoise(pip_agg, cache = cd, omit_args = "lkup") + pip_grp_new <<- memoise::memoise(pip_grp_new, cache = cd, omit_args = "lkup") + pip_grp_logic <<- memoise::memoise(pip_grp_logic, cache = cd, omit_args = "lkup") + pip_grp <<- memoise::memoise(pip_grp, cache = cd, omit_args = "lkup") + ui_cp_charts <<- memoise::memoise(ui_cp_charts, cache = cd, omit_args = "lkup") ui_cp_download <<- memoise::memoise(ui_cp_download, cache = cd, omit_args = "lkup") ui_cp_key_indicators <<- memoise::memoise(ui_cp_key_indicators, cache = cd, omit_args = "lkup") pos = 1L From 61170c4346f80235a1169066e1158de9e2203590 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Tue, 26 Aug 2025 10:30:36 -0400 Subject: [PATCH 078/203] add pipapienv --- R/aaa.R | 1 + R/compute_fgt_new.R | 4 +++- R/pip_new_lineups.R | 4 ++-- R/utils-pipdata.R | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) create mode 100644 R/aaa.R diff --git a/R/aaa.R b/R/aaa.R new file mode 100644 index 00000000..cd685993 --- /dev/null +++ b/R/aaa.R @@ -0,0 +1 @@ +.pipapienv <- new.env(parent = emptyenv()) diff --git a/R/compute_fgt_new.R b/R/compute_fgt_new.R index e78d8296..9e9de70b 100644 --- a/R/compute_fgt_new.R +++ b/R/compute_fgt_new.R @@ -99,7 +99,9 @@ compute_fgt <- function(w, wt, povlines) { # logw <- log(w) # logw <- copyv(log(w), pos, NA_real_, invert = TRUE) |> # suppressWarnings() - logw <- fifelse(w > 0, log(w), NA_real_) + # logw <- fifelse(w > 0, log(w), NA_real_) + logw <- log(w) |> + suppressWarnings() for (i in seq_along(povlines)) { pov <- povlines[i] diff --git a/R/pip_new_lineups.R b/R/pip_new_lineups.R index 04d70acb..33efc76f 100644 --- a/R/pip_new_lineups.R +++ b/R/pip_new_lineups.R @@ -141,7 +141,7 @@ pip_new_lineups <- function(country = "ALL", # Cache new data #--------------------------------------------- - out <- treat_cache_and_main(out) + out <- treat_cache_and_main(out, cache_file_path) # Early return for empty table--------------- if (nrow(out) == 0) return(pipapi::empty_response) @@ -278,7 +278,7 @@ pip_new_lineups <- function(country = "ALL", -treat_cache_and_main <- \(out) { +treat_cache_and_main <- \(out, cache_file_path) { cached_data <- if (is.null(out$data_in_cache)) { NULL } else if (is.data.frame(out$data_in_cache)) { diff --git a/R/utils-pipdata.R b/R/utils-pipdata.R index d430d49b..597663d6 100644 --- a/R/utils-pipdata.R +++ b/R/utils-pipdata.R @@ -13,7 +13,8 @@ load_list_refy <- \(input_list, path){ }) dl <- lapply(inames, \(x) { - qs::qread(file = fs::path(path, x, ext = "qs")) + qs::qread(file = fs::path(path, x, ext = "qs"), + nthreads = 4) }) |> setNames(inames) From e4f08835e4cc094b8da357da0a323dfadcdb44d5 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Tue, 26 Aug 2025 10:32:20 -0400 Subject: [PATCH 079/203] add functions to manage pipapienv --- NAMESPACE | 3 +++ R/pipapi-env.R | 42 +++++++++++++++++++++++++++++++++++++++ man/get_from_pipapienv.Rd | 21 ++++++++++++++++++++ man/get_pipapienv.Rd | 17 ++++++++++++++++ man/set_in_pipapienv.Rd | 22 ++++++++++++++++++++ 5 files changed, 105 insertions(+) create mode 100644 R/pipapi-env.R create mode 100644 man/get_from_pipapienv.Rd create mode 100644 man/get_pipapienv.Rd create mode 100644 man/set_in_pipapienv.Rd diff --git a/NAMESPACE b/NAMESPACE index 133f86b1..96b78000 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -16,11 +16,13 @@ export(get_aux_table) export(get_aux_table_ui) export(get_caller_names) export(get_ctr_alt_agg) +export(get_from_pipapienv) export(get_grp_to_compute) export(get_impl_ctrs) export(get_md_vars) export(get_param_values) export(get_pip_version) +export(get_pipapienv) export(get_user_alt_gt) export(get_user_x_code) export(get_valid_aux_long_format_tables) @@ -40,6 +42,7 @@ export(return_if_exists) export(select_off_alt_agg) export(select_reporting_level) export(select_user_aggs) +export(set_in_pipapienv) export(start_api) export(ui_cp_charts) export(ui_cp_download) diff --git a/R/pipapi-env.R b/R/pipapi-env.R new file mode 100644 index 00000000..630b277f --- /dev/null +++ b/R/pipapi-env.R @@ -0,0 +1,42 @@ +# Getter function: Returns the entire .pipapienv environment +#' Get the entire .pipapienv environment +#' +#' @return The .pipapienv environment +#' @export +#' +#' @examples +#' env <- get_pipapienv() +get_pipapienv <- function() { + .pipapienv +} + +# Getter for a specific key from .pipapienv +#' Get a value from .pipapienv +#' +#' @param key A character string representing the key +#' +#' @return The value associated with the key in .pipapienv +#' @export +#' +#' @examples +#' set_in_pipapienv("example_key", 42) +#' get_from_pipapienv("example_key") # returns 42 +get_from_pipapienv <- function(key) { + rlang::env_get(.pipapienv, key, default = NULL) # Returns NULL if key doesn't exist +} + +# Setter function: Assign a value in .pipapienv +#' Set a value in .pipapienv +#' +#' @param key A character string representing the key +#' @param value The value to store in .pipapienv +#' +#' @return The assigned value (invisibly) +#' @export +#' +#' @examples +#' set_in_pipapienv("example_key", 42) +set_in_pipapienv <- function(key, value) { + rlang::env_poke(.pipapienv, key, value) + invisible(value) # Return value invisibly to avoid clutter in console +} diff --git a/man/get_from_pipapienv.Rd b/man/get_from_pipapienv.Rd new file mode 100644 index 00000000..2876f683 --- /dev/null +++ b/man/get_from_pipapienv.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pipapi-env.R +\name{get_from_pipapienv} +\alias{get_from_pipapienv} +\title{Get a value from .pipapienv} +\usage{ +get_from_pipapienv(key) +} +\arguments{ +\item{key}{A character string representing the key} +} +\value{ +The value associated with the key in .pipapienv +} +\description{ +Get a value from .pipapienv +} +\examples{ +set_in_pipapienv("example_key", 42) +get_from_pipapienv("example_key") # returns 42 +} diff --git a/man/get_pipapienv.Rd b/man/get_pipapienv.Rd new file mode 100644 index 00000000..933e7044 --- /dev/null +++ b/man/get_pipapienv.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pipapi-env.R +\name{get_pipapienv} +\alias{get_pipapienv} +\title{Get the entire .pipapienv environment} +\usage{ +get_pipapienv() +} +\value{ +The .pipapienv environment +} +\description{ +Get the entire .pipapienv environment +} +\examples{ +env <- get_pipapienv() +} diff --git a/man/set_in_pipapienv.Rd b/man/set_in_pipapienv.Rd new file mode 100644 index 00000000..127287cb --- /dev/null +++ b/man/set_in_pipapienv.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pipapi-env.R +\name{set_in_pipapienv} +\alias{set_in_pipapienv} +\title{Set a value in .pipapienv} +\usage{ +set_in_pipapienv(key, value) +} +\arguments{ +\item{key}{A character string representing the key} + +\item{value}{The value to store in .pipapienv} +} +\value{ +The assigned value (invisibly) +} +\description{ +Set a value in .pipapienv +} +\examples{ +set_in_pipapienv("example_key", 42) +} From 01accd633d8ae7889e94ecdeb377f5e2837d9a76 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Tue, 26 Aug 2025 10:36:27 -0400 Subject: [PATCH 080/203] implement loading of qs using multiple cores --- DESCRIPTION | 3 ++- R/utils-pipdata.R | 4 +++- R/zzz.R | 8 ++++++++ 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index fa2ae7c1..0f625a0e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -70,7 +70,8 @@ Imports: DBI, duckdb, jsonlite, - digest + digest, + parallel Remotes: PIP-Technical-Team/wbpip@DEV Depends: diff --git a/R/utils-pipdata.R b/R/utils-pipdata.R index 597663d6..33ce4df6 100644 --- a/R/utils-pipdata.R +++ b/R/utils-pipdata.R @@ -12,9 +12,11 @@ load_list_refy <- \(input_list, path){ paste0(x$country_code, "_", x$year) }) + cores <- get_from_pipapienv("cores_to_use") + dl <- lapply(inames, \(x) { qs::qread(file = fs::path(path, x, ext = "qs"), - nthreads = 4) + nthreads = cores) }) |> setNames(inames) diff --git a/R/zzz.R b/R/zzz.R index 9bb99af6..d3c42be7 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -35,6 +35,14 @@ pipapi_default_options <- list( toset <- !(names(pipapi_default_options) %in% names(op)) if (any(toset)) options(pipapi_default_options[toset]) + + # set multi threats + available_cores <- parallel::detectCores() - 1 + + cores_to_use <- max(available_cores, 1) |> + min(8) + set_in_pipapienv("cores_to_use", cores_to_use) + invisible() } From 22679d1025d3cf065aac457b8ea09dcc2183d93a Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Tue, 26 Aug 2025 14:52:42 -0400 Subject: [PATCH 081/203] updates on pov_from_DT --- R/compute_fgt_new.R | 107 +++++++++++++++++++++++++++++++++++++++++++- R/fg_pip.R | 6 +++ R/pip_grp_new.R | 7 +++ man/DT_fgt_by_rl.Rd | 22 --------- man/map_fgt.Rd | 21 +++++++-- 5 files changed, 136 insertions(+), 27 deletions(-) delete mode 100644 man/DT_fgt_by_rl.Rd diff --git a/R/compute_fgt_new.R b/R/compute_fgt_new.R index 9e9de70b..14cd897b 100644 --- a/R/compute_fgt_new.R +++ b/R/compute_fgt_new.R @@ -103,6 +103,8 @@ compute_fgt <- function(w, wt, povlines) { logw <- log(w) |> suppressWarnings() + tot_pop <- fsum(wt) + for (i in seq_along(povlines)) { pov <- povlines[i] poor <- w < pov @@ -116,7 +118,7 @@ compute_fgt <- function(w, wt, povlines) { # Optimized Watts index calculation keep <- poor & pos if (any(keep, na.rm = TRUE)) { - watts_vec[i] <- (fsum((log(pov) - logw[keep]) * wt[keep])) / fsum(wt) + watts_vec[i] <- (fsum((log(pov) - logw[keep]) * wt[keep])) / tot_pop } else { watts_vec[i] <- 0 } @@ -140,7 +142,7 @@ compute_fgt <- function(w, wt, povlines) { #' @param y list of indices for each reporting level #' @param nx name of data table. Usuall country code and year in the form "CCC_YYYY" #' -#' @return data.table with FGT estimates by reporting level +#' @rdname map_fgt #' @keywords internal DT_fgt_by_rl <- \(x, y, nx) { DT_fgt <- lapply(names(y), \(rl) { @@ -161,6 +163,36 @@ DT_fgt_by_rl <- \(x, y, nx) { )] } + + +#' jkoin reporting level and lt list into one data.table +#' +#' @rdname map_fgt +lt_to_dt <- \(x, y, nx) { + DT <- lapply(names(y), \(rl) { + + idx <- y[[rl]] + x[idx, reporting_level := rl] + + }) |> + rbindlist(fill = TRUE) + + + DT[, `:=`( + country_code = gsub("([^_]+)(_.+)", "\\1", nx), + reporting_year = gsub("(.+_)([^_]+)", "\\2", nx) + )] +} + +#' Map lt_to_dt +#' +#' @rdname map_fgt +map_lt_to_dt <- \(lt, l_rl_rows) { + Map(lt_to_dt, lt, l_rl_rows, names(lt)) |> + rbindlist(fill = TRUE) +} + + #' map over list of data.tables and indices to compute FGT by reporting_level #' #' @param lt list of data.tables with welfare and weight data @@ -233,3 +265,74 @@ load_data_list <- \(metadata) { } + + +pov_from_DT <- function(DT, povline, g) { + w <- DT$welfare + wt <- DT$weight + n_pov <- length(povline) + + ng <- (g) + grp_ids <- (g) + + # Precompute log(w) for efficiency + pos <- w > 0 + logw <- fifelse(pos, log(w), NA_real_) + + # Prepare result lists + fgt0 <- vector("list", n_pov) + fgt1 <- vector("list", n_pov) + fgt2 <- vector("list", n_pov) + watts <- vector("list", n_pov) + + for (i in seq_along(povline)) { + pov <- povline[i] + poor <- w < pov + rel_dist <- fifelse(poor, 1 - w/pov, 0) + keep <- poor & pos + watts_val <- fmean((log(pov) - logw) * keep, g = g, w = wt) + fgt0[[i]] <- fmean(poor, g = g, w = wt) + fgt1[[i]] <- fmean(rel_dist, g = g, w = wt) + fgt2[[i]] <- fmean(rel_dist^2, g = g, w = wt) + watts[[i]] <- watts_val + } + + data.table( + povline = rep(povline, each = ng), + group_id = rep(grp_ids, times = n_pov), + fgt0 = unlist(fgt0), + fgt1 = unlist(fgt1), + fgt2 = unlist(fgt2), + watts = unlist(watts) + ) +} + + + + + +# pov_from_DT2 <- function(DT, povline, g) { +# fgt0 <- numeric(length(povline)) +# fgt1 <- numeric(length(povline)) +# fgt2 <- numeric(length(povline)) +# w <- DT$welfare +# wt <- DT$weight +# +# +# for (i in seq_along(povline)) { +# pov <- povline[i] +# poor <- w < pov +# rel_dist <- fifelse(poor, 1 - w/pov, 0) +# fgt0[i] <- fmean(poor, g = g, w = wt) +# fgt1[i] <- fmean(rel_dist, g = g, w = wt) +# fgt2[i] <- fmean(rel_dist^2, g = g, w = wt) +# } +# +# list(fgt0 = fgt0, fgt1 = fgt1, fgt2 = fgt2) +# } + + + + + + diff --git a/R/fg_pip.R b/R/fg_pip.R index 07f0e9df..5cab3696 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -78,6 +78,12 @@ fg_pip <- function(country, l_rl_rows <- get_rl_rows(lt_att) + DT <- map_lt_to_dt(lt, l_rl_rows) + g <- GRP(DT, ~ country_code + reporting_year + reporting_level) + + + + # ZP Add: do fgt estimations using `res <- lapply(lt, process_dt, povline = povline)` #------------------------- fgt <- map_fgt(lt, l_rl_rows) |> diff --git a/R/pip_grp_new.R b/R/pip_grp_new.R index 1dcf557a..51f5240e 100644 --- a/R/pip_grp_new.R +++ b/R/pip_grp_new.R @@ -30,6 +30,13 @@ pip_grp_new <- \(country = "ALL", } + + + + + + + out <- fg_pip( country = country_code, year = year, diff --git a/man/DT_fgt_by_rl.Rd b/man/DT_fgt_by_rl.Rd deleted file mode 100644 index 72663294..00000000 --- a/man/DT_fgt_by_rl.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/compute_fgt_new.R -\name{DT_fgt_by_rl} -\alias{DT_fgt_by_rl} -\title{compute FGT using indices by reporting level} -\usage{ -DT_fgt_by_rl(x, y, nx) -} -\arguments{ -\item{x}{data.table from lt list, with welfare and weight vectors} - -\item{y}{list of indices for each reporting level} - -\item{nx}{name of data table. Usuall country code and year in the form "CCC_YYYY"} -} -\value{ -data.table with FGT estimates by reporting level -} -\description{ -This function is intended to be used inside \link{map_fgt} -} -\keyword{internal} diff --git a/man/map_fgt.Rd b/man/map_fgt.Rd index a192f3f2..19935f6b 100644 --- a/man/map_fgt.Rd +++ b/man/map_fgt.Rd @@ -1,12 +1,27 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/compute_fgt_new.R -\name{map_fgt} +\name{DT_fgt_by_rl} +\alias{DT_fgt_by_rl} +\alias{lt_to_dt} +\alias{map_lt_to_dt} \alias{map_fgt} -\title{map over list of data.tables and indices to compute FGT by reporting_level} +\title{compute FGT using indices by reporting level} \usage{ +DT_fgt_by_rl(x, y, nx) + +lt_to_dt(x, y, nx) + +map_lt_to_dt(lt, l_rl_rows) + map_fgt(lt, l_rl_rows) } \arguments{ +\item{x}{data.table from lt list, with welfare and weight vectors} + +\item{y}{list of indices for each reporting level} + +\item{nx}{name of data table. Usuall country code and year in the form "CCC_YYYY"} + \item{lt}{list of data.tables with welfare and weight data} \item{l_rl_rows}{list of indeces} @@ -15,6 +30,6 @@ map_fgt(lt, l_rl_rows) data.table with all measured } \description{ -map over list of data.tables and indices to compute FGT by reporting_level +This function is intended to be used inside \link{map_fgt} } \keyword{internal} From b3a7c0500b3be5a9515ae8247f66fe6a1b8250e3 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Wed, 27 Aug 2025 18:09:47 -0400 Subject: [PATCH 082/203] change aggregation to collapse --- R/compute_fgt_new.R | 39 +++++++---- R/fg_pip.R | 31 ++------- R/pip_grp.R | 68 +++++++------------ R/pip_grp_new.R | 17 +++-- R/pip_new_lineups.R | 41 ++++++----- R/utils.R | 36 ++++++++++ ..._get_mean_median.Rd => get_mean_median.Rd} | 10 +-- man/map_fgt.Rd | 8 +-- 8 files changed, 134 insertions(+), 116 deletions(-) rename man/{fg_get_mean_median.Rd => get_mean_median.Rd} (66%) diff --git a/R/compute_fgt_new.R b/R/compute_fgt_new.R index 14cd897b..8879d018 100644 --- a/R/compute_fgt_new.R +++ b/R/compute_fgt_new.R @@ -144,7 +144,7 @@ compute_fgt <- function(w, wt, povlines) { #' #' @rdname map_fgt #' @keywords internal -DT_fgt_by_rl <- \(x, y, nx) { +DT_fgt_by_rl <- \(x, y, nx, povline) { DT_fgt <- lapply(names(y), \(rl) { idx <- y[[rl]] @@ -168,7 +168,7 @@ DT_fgt_by_rl <- \(x, y, nx) { #' jkoin reporting level and lt list into one data.table #' #' @rdname map_fgt -lt_to_dt <- \(x, y, nx) { +lt_to_dt <- \(x, y, nx, povline) { DT <- lapply(names(y), \(rl) { idx <- y[[rl]] @@ -187,8 +187,9 @@ lt_to_dt <- \(x, y, nx) { #' Map lt_to_dt #' #' @rdname map_fgt -map_lt_to_dt <- \(lt, l_rl_rows) { - Map(lt_to_dt, lt, l_rl_rows, names(lt)) |> +map_lt_to_dt <- \(lt, l_rl_rows, povline) { + Map(lt_to_dt, lt, l_rl_rows, names(lt), + MoreArgs = list(povline = povline)) |> rbindlist(fill = TRUE) } @@ -200,8 +201,9 @@ map_lt_to_dt <- \(lt, l_rl_rows) { #' #' @return data.table with all measured #' @keywords internal -map_fgt <- \(lt, l_rl_rows) { - Map(DT_fgt_by_rl, lt, l_rl_rows, names(lt)) |> +map_fgt <- \(lt, l_rl_rows, povline) { + Map(DT_fgt_by_rl, lt, l_rl_rows, names(lt), + MoreArgs = list(povline = povline)) |> rbindlist(fill = TRUE) } @@ -267,13 +269,13 @@ load_data_list <- \(metadata) { -pov_from_DT <- function(DT, povline, g) { +pov_from_DT <- function(DT, povline, g, cores = 1) { w <- DT$welfare wt <- DT$weight n_pov <- length(povline) - ng <- (g) - grp_ids <- (g) + ng <- g$N.groups + grp_ids <- g$groups # Precompute log(w) for efficiency pos <- w > 0 @@ -290,21 +292,28 @@ pov_from_DT <- function(DT, povline, g) { poor <- w < pov rel_dist <- fifelse(poor, 1 - w/pov, 0) keep <- poor & pos - watts_val <- fmean((log(pov) - logw) * keep, g = g, w = wt) - fgt0[[i]] <- fmean(poor, g = g, w = wt) - fgt1[[i]] <- fmean(rel_dist, g = g, w = wt) - fgt2[[i]] <- fmean(rel_dist^2, g = g, w = wt) + watts_val <- fmean((log(pov) - logw) * keep, + g = g, w = wt, nthreads = cores ) + fgt0[[i]] <- fmean(poor, g = g, w = wt, + nthreads = cores) + fgt1[[i]] <- fmean(rel_dist, g = g, w = wt, + nthreads = cores) + fgt2[[i]] <- fmean(rel_dist^2, g = g, w = wt, + nthreads = cores) watts[[i]] <- watts_val } - data.table( + out <- data.table( povline = rep(povline, each = ng), - group_id = rep(grp_ids, times = n_pov), fgt0 = unlist(fgt0), fgt1 = unlist(fgt1), fgt2 = unlist(fgt2), watts = unlist(watts) ) + # Repeat group columns for each povline + grp_dt <- grp_ids[rep(seq_len(ng), times = n_pov)] + add_vars(out, pos = "front") <- grp_dt + out } diff --git a/R/fg_pip.R b/R/fg_pip.R index 5cab3696..c367f207 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -78,22 +78,25 @@ fg_pip <- function(country, l_rl_rows <- get_rl_rows(lt_att) - DT <- map_lt_to_dt(lt, l_rl_rows) - g <- GRP(DT, ~ country_code + reporting_year + reporting_level) + # DT <- map_lt_to_dt(lt, l_rl_rows) + # setorder(DT, country_code, reporting_year, reporting_level, welfare) + # g <- GRP(DT, + # ~ country_code + reporting_year + reporting_level, + # sort = TRUE) # ZP Add: do fgt estimations using `res <- lapply(lt, process_dt, povline = povline)` #------------------------- - fgt <- map_fgt(lt, l_rl_rows) |> + fgt <- map_fgt(lt, l_rl_rows, povline) |> funique() # TO REMOVE # convert reporting year to numeric fgt[, reporting_year := as.numeric(reporting_year)] # Add just mean and median - res <- fg_get_mean_median(fgt, lkup) + res <- get_mean_median(fgt, lkup, fill_gaps = TRUE) # try metadata unique code @@ -314,23 +317,3 @@ create_full_list <- function(country, year, refy_lkup, data_present_in_master) { - -#' merge into fgt table the mean and median from dist stats table in lkup -#' -#' @param fgt data,table with fgt measures -#' @param lkup lkup -#' -#' @return data.table with with fgt, mean and median -#' @keywords internal -fg_get_mean_median <- \(fgt, lkup) { - joyn::joyn(x = fgt, - y = lkup$lineup_dist_stats[, - .(country_code, reporting_year, - reporting_level, mean, median)], - by = c('country_code', "reporting_year", "reporting_level"), - match_type = "m:1", # multiple povlines - keep = "left", - reportvar = FALSE, - verbose = FALSE) -} - diff --git a/R/pip_grp.R b/R/pip_grp.R index b8b9abdc..8d50b238 100644 --- a/R/pip_grp.R +++ b/R/pip_grp.R @@ -228,43 +228,31 @@ pip_aggregate_by <- function(df, to_keep <- all_cols[!all_cols %in% c("pop_in_poverty", "estimate_type")] - df <- df[, .SD, .SDcols = to_keep] - - group_lkup <- group_lkup[, c("region_code", - "reporting_year", - "reporting_pop")] + df <- df[, .SD, .SDcols = to_keep] # I think we can ommit this part # Compute stats weighted average by groups - rgn <- df[, lapply(.SD, stats::weighted.mean, - w = reporting_pop, - na.rm = TRUE), - by = .(region_name, - region_code, - reporting_year, - poverty_line), - .SDcols = weighted_cols - ] - - rgn <- group_lkup[rgn, - on = .(region_code, reporting_year), - allow.cartesian = TRUE - ] + rgn <- df |> + fgroup_by(region_name, + region_code, + reporting_year, + poverty_line) |> + fselect(c(weighted_cols, "reporting_pop")) |> + fmean(w = reporting_pop, stub = FALSE) if (any(c("ALL", "WLD") %in% country)) { # Compute world aggregates - wld <- compute_world_aggregates(rgn = rgn, - cols = weighted_cols) + wld <- compute_world_aggregates(rgn = rgn) if (length(country) == 1) { if (country == "WLD") { # Return only world aggregate out <- wld } else if (country == "ALL") { # Combine with other regional aggregates - out <- rbind(rgn, wld, fill = TRUE) + out <- rowbind(rgn, wld, fill = TRUE) } } else { # Combine with other regional aggregates - out <- rbind(rgn, wld, fill = TRUE) + out <- rowbind(rgn, wld, fill = TRUE) # Return selection only if (!"ALL" %in% country) { out <- out[region_code %in% country, ] @@ -283,24 +271,16 @@ pip_aggregate_by <- function(df, } -compute_world_aggregates <- function(rgn, cols) { +compute_world_aggregates <- function(rgn, cols = NULL) { # Compute stats # Grouping by poverty line as well since we now have vectorized poverty line values - wld <- rgn[, lapply(.SD, - stats::weighted.mean, - w = reporting_pop, - na.rm = TRUE), - by = .(reporting_year, poverty_line), - .SDcols = cols - ] - # Compute yearly population WLD totals - tmp <- rgn[, .(reporting_pop = sum(reporting_pop)), - by = .(reporting_year, poverty_line)] - - - wld <- wld[tmp, on = .(reporting_year = reporting_year, poverty_line = poverty_line)] - wld[["region_code"]] <- "WLD" - wld[["region_name"]] <- "World" + wld <- rgn |> + fgroup_by(reporting_year, + poverty_line) |> + num_vars() |> + fmean(w = reporting_pop, stub = FALSE) |> + ftransform(region_code = "WLD", + region_name = "World") return(wld) @@ -316,10 +296,10 @@ filter_for_aggregate_by <- function(df) { # If nationally representative survey is available, use it # Otherwise, use whatever is available - out <- df[, check := length(reporting_level), - by = c("country_code", "reporting_year", "poverty_line")] - out <- out[out$check == 1 | (out$check > 1 & reporting_level == "national"), ] - - return(out) + df[, check := length(reporting_level), + by = c("country_code", "reporting_year", "poverty_line") + ][ + check == 1 | (check > 1 & reporting_level == "national"), + ] } diff --git a/R/pip_grp_new.R b/R/pip_grp_new.R index 51f5240e..184c0583 100644 --- a/R/pip_grp_new.R +++ b/R/pip_grp_new.R @@ -29,14 +29,6 @@ pip_grp_new <- \(country = "ALL", "ALL" } - - - - - - - - out <- fg_pip( country = country_code, year = year, @@ -47,7 +39,14 @@ pip_grp_new <- \(country = "ALL", ppp = NULL, lkup = lkup) - out <- treat_cache_and_main(out) + cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb") + if (!file.exists(cache_file_path)) { + # Create an empty duckdb file + create_duckdb_file(cache_file_path) + } + out <- treat_cache_and_main(out, + cache_file_path = cache_file_path, + lkup = lkup, fill_gaps = fill_gaps) # return empty dataframe if no metadata is found if (nrow(out) == 0) { diff --git a/R/pip_new_lineups.R b/R/pip_new_lineups.R index 33efc76f..0d3cb528 100644 --- a/R/pip_new_lineups.R +++ b/R/pip_new_lineups.R @@ -141,7 +141,9 @@ pip_new_lineups <- function(country = "ALL", # Cache new data #--------------------------------------------- - out <- treat_cache_and_main(out, cache_file_path) + out <- treat_cache_and_main(out, + cache_file_path = cache_file_path, + lkup = lkup, fill_gaps = fill_gaps) # Early return for empty table--------------- if (nrow(out) == 0) return(pipapi::empty_response) @@ -276,36 +278,43 @@ pip_new_lineups <- function(country = "ALL", +treat_cache_and_main <- \(out, cache_file_path, + lkup, fill_gaps) { - -treat_cache_and_main <- \(out, cache_file_path) { - cached_data <- if (is.null(out$data_in_cache)) { + # early return of cache data if not available. + cached_data <- + if (is.null(out$data_in_cache)) { NULL } else if (is.data.frame(out$data_in_cache)) { + if (fnrow(out$data_in_cache) == 0) { NULL } else { - qDT(out$data_in_cache) + ft <- qDT(out$data_in_cache) + if (fill_gaps) { + ft <- + fg_remove_duplicates(ft, + use_new_lineup_version = lkup$use_new_lineup_version) + + } + + # Add just mean and median + get_mean_median(ft, lkup, fill_gaps = fill_gaps) + } } else { - cli::cli_abort("{.code out$data_in_cache} must be NULL or data.frame not {.field {class(out$data_in_cache)}}") + cli::cli_abort( + "{.code out$data_in_cache} must be NULL or data.frame not + {.field {class(out$data_in_cache)}}" + ) } main_data <- qDT(out$main_data) if (nrow(main_data) > 0) { - - if (is.null(out$data_in_cache)) { - + if (is.null(cached_data)) { out <- main_data } else { - - if (fill_gaps) { - - cached_data <- fg_remove_duplicates(cached_data, - use_new_lineup_version = lkup$use_new_lineup_version) - } - out <- main_data |> rowbind(cached_data) } diff --git a/R/utils.R b/R/utils.R index 33969942..0d84c2ac 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1462,3 +1462,39 @@ unnest_dt_longer <- function(tbl, cols) { + + + +#' merge into fgt table the mean and median from dist stats table in lkup +#' +#' @param fgt data,table with fgt measures +#' @param lkup lkup +#' @param fill_gaps logical. whether to use lineup estimates +#' +#' @return data.table with with fgt, mean and median +#' @keywords internal +get_mean_median <- \(fgt, lkup, fill_gaps) { + + if (isFALSE(lkup$use_new_lineup_version)) return(fgt) + + if (fill_gaps) { + dist <- lkup$lineup_dist_stats[, + .(country_code, reporting_year, + reporting_level, mean, median)] + } else { + dist <- lkup$dist_stats[, + .(country_code, reporting_year, + reporting_level, mean, + median = survey_median_ppp)] + + } + joyn::joyn(x = fgt, + y = dist, + by = c('country_code', "reporting_year", "reporting_level"), + match_type = "m:1", # multiple povlines + keep = "left", + reportvar = FALSE, + verbose = FALSE) +} + + diff --git a/man/fg_get_mean_median.Rd b/man/get_mean_median.Rd similarity index 66% rename from man/fg_get_mean_median.Rd rename to man/get_mean_median.Rd index 88667176..f4fed163 100644 --- a/man/fg_get_mean_median.Rd +++ b/man/get_mean_median.Rd @@ -1,15 +1,17 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/fg_pip.R -\name{fg_get_mean_median} -\alias{fg_get_mean_median} +% Please edit documentation in R/utils.R +\name{get_mean_median} +\alias{get_mean_median} \title{merge into fgt table the mean and median from dist stats table in lkup} \usage{ -fg_get_mean_median(fgt, lkup) +get_mean_median(fgt, lkup, fill_gaps) } \arguments{ \item{fgt}{data,table with fgt measures} \item{lkup}{lkup} + +\item{fill_gaps}{logical. whether to use lineup estimates} } \value{ data.table with with fgt, mean and median diff --git a/man/map_fgt.Rd b/man/map_fgt.Rd index 19935f6b..e4d1c224 100644 --- a/man/map_fgt.Rd +++ b/man/map_fgt.Rd @@ -7,13 +7,13 @@ \alias{map_fgt} \title{compute FGT using indices by reporting level} \usage{ -DT_fgt_by_rl(x, y, nx) +DT_fgt_by_rl(x, y, nx, povline) -lt_to_dt(x, y, nx) +lt_to_dt(x, y, nx, povline) -map_lt_to_dt(lt, l_rl_rows) +map_lt_to_dt(lt, l_rl_rows, povline) -map_fgt(lt, l_rl_rows) +map_fgt(lt, l_rl_rows, povline) } \arguments{ \item{x}{data.table from lt list, with welfare and weight vectors} From 83ed0fff1cbc8c3f8763c787902cebac02b49c05 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Sat, 30 Aug 2025 09:08:47 -0400 Subject: [PATCH 083/203] add validate_country_codes function --- R/compute_fgt_new.R | 2 +- R/create_lkups.R | 3 ++- R/pip_new_lineups.R | 28 +++++++++++++++++++--------- 3 files changed, 22 insertions(+), 11 deletions(-) diff --git a/R/compute_fgt_new.R b/R/compute_fgt_new.R index 8879d018..2d09a1e8 100644 --- a/R/compute_fgt_new.R +++ b/R/compute_fgt_new.R @@ -275,7 +275,7 @@ pov_from_DT <- function(DT, povline, g, cores = 1) { n_pov <- length(povline) ng <- g$N.groups - grp_ids <- g$groups + grp_ids <- qDT(g$groups) # Precompute log(w) for efficiency pos <- w > 0 diff --git a/R/create_lkups.R b/R/create_lkups.R index 531f7933..890a7c04 100644 --- a/R/create_lkups.R +++ b/R/create_lkups.R @@ -16,7 +16,8 @@ create_versioned_lkups <- versions <- names(data_dirs) # versions[1] <- "latest_release" - versions_paths <- lapply(data_dirs, create_lkups, versions = versions) + versions_paths <- mapply(create_lkups, data_dirs, versions, + SIMPLIFY = FALSE, USE.NAMES = FALSE) names(versions_paths) <- versions return(list(versions = versions, diff --git a/R/pip_new_lineups.R b/R/pip_new_lineups.R index 0d3cb528..df71ad34 100644 --- a/R/pip_new_lineups.R +++ b/R/pip_new_lineups.R @@ -98,13 +98,10 @@ pip_new_lineups <- function(country = "ALL", # **** TO BE REMOVED **** REMOVAL ENDS HERE # Countries vector ------------ - lcv <- # List with countries vectors - create_countries_vctr( - country = country, - year = year, - valid_years = lkup$valid_years, - aux_files = lkup$aux_files - ) + validate_country_codes(country = country, lkup = lkup) + + + # lcv$est_ctrs has all the country_code that we are interested in cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb") @@ -116,7 +113,7 @@ pip_new_lineups <- function(country = "ALL", if (fill_gaps) { ## lineup years----------------- out <- fg_pip( - country = lcv$est_ctrs, + country = country, year = year, povline = povline, popshare = popshare, @@ -128,7 +125,7 @@ pip_new_lineups <- function(country = "ALL", } else { ## survey years ------------------ out <- rg_pip( - country = lcv$est_ctrs, + country = country, year = year, povline = povline, popshare = popshare, @@ -328,3 +325,16 @@ treat_cache_and_main <- \(out, cache_file_path, setDT(out) } + + + +validate_country_codes <- \(country, lkup) { + cls <- lkup$aux_files$country_list$country_code |> + unique() + + if (any(!country %in% cls)) { + wcls <- which(!country %in% cls) + cli::cli_abort("{.field {country[wcls]}} {?is/are} not {?a/} valid country code{?s}") + } + invisible(TRUE) +} From c0b5da3b4e4d4379d8371ee26ff62611316e447e Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Tue, 2 Sep 2025 10:09:32 -0400 Subject: [PATCH 084/203] temporal fix to include CMD info --- R/create_lkups.R | 111 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 95 insertions(+), 16 deletions(-) diff --git a/R/create_lkups.R b/R/create_lkups.R index 890a7c04..e547e277 100644 --- a/R/create_lkups.R +++ b/R/create_lkups.R @@ -195,6 +195,83 @@ create_lkups <- function(data_dir, versions) { refy_lkup <- fst::read_fst(refy_lkup_path, as.data.table = TRUE) + + # ZP ADD - CREATE OBJECT: lineup years + #___________________________________________________________________________ + lineup_years_path <- + fs::path(data_dir, + "estimations/lineup_years.fst") + + lineup_years <- fst::read_fst(lineup_years_path) |> + as.list() # Why Is this a list? + + + + # --- START NOTE AC> Include here the refy_lkup for CMD + ncountries <- nrow(country_list) + ly <- lineup_years$lineup_years + + + # Add reporting year + cmd2 <- CJ(country_list$country_code, + ly, c("national", "urban", "rural")) + + + cmd <- + country_list[rep(1:.N, + each = length(ly)) + ][, + reporting_year := rep(ly, times = ncountries) + # Add reporting level + ][rep(1:.N, each = 3), + ][, + reporting_level := + rep(c("national", "urban", "rural"), + times = (length(ly)*ncountries))] + + cmd <- joyn::joyn(x = cmd, + y = refy_lkup, + by = c('country_code', + 'reporting_year', + 'reporting_level'), + keep = "anti", + # reportvar = FALSE, + match_type = "1:1") + + # Delete unnecessary reporting levels + cmd <- cmd[(reporting_level == "national" & .joyn == "x") + ][, + .joyn := NULL] + + # build some variables + cmd[, + `:=`( + cache_id = paste(country_code, reporting_year,"NOSVY_D1_CON_CMD", + sep = "_"), + survey_coverage = "national", + welfare_type = "consumption", + distribution_type = "CMD distribution", + is_interpolated = FALSE, + is_used_for_line_up = TRUE, + is_used_for_aggregation = FALSE, + estimation_type = "CMD estimation", + display_cp = "0", + monotonic = TRUE, # ? + same_direction = TRUE, # NA ? + relative_distance = 1, + lineup_approach = "CMD", + mult_factor = 1 + + )] + + # Append lineup and CMD info + + refy_lkup <- rbindlist(list(refy_lkup, cmd), + use.names = TRUE, + fill = TRUE) + + + # Create additional variables refy_lkup[ , path := { fs::path(data_dir, @@ -212,10 +289,10 @@ create_lkups <- function(data_dir, versions) { reporting_level, sep = "_")] - if ("region_code" %in% names(refy_lkup)) { - refy_lkup[, - region_code := NULL] - } + # if ("region_code" %in% names(refy_lkup)) { + # refy_lkup[, + # region_code := NULL] + # } refy_lkup[, @@ -229,14 +306,24 @@ create_lkups <- function(data_dir, versions) { collapse = "|"), by = .(interpolation_id)] - refy_lkup <- joyn::joyn(x = refy_lkup, - y = countries, - by = 'country_code', + + # Temporal fix + refy_lkup <- joyn::joyn(refy_lkup, country_list, + by = c('country_code', + 'reporting_year', + 'reporting_level'), keep = "left", reportvar = FALSE, - match_type = "m:1") + match_type = "1:1", + update_values = TRUE) + + + # --- END inclussion of CMD data. + + refy_lkup <- refy_lkup[reporting_year %in% lineup_years$lineup_years, ] + gv(refy_lkup, c("monotonic", "same_direction", @@ -256,15 +343,7 @@ create_lkups <- function(data_dir, versions) { "relative_distance")) <- NULL - # ZP ADD - CREATE OBJECT: lineup years - #___________________________________________________________________________ - lineup_years_path <- - fs::path(data_dir, - "estimations/lineup_years.fst") - lineup_years <- fst::read_fst(lineup_years_path) |> - as.list() - refy_lkup <- refy_lkup[reporting_year %in% lineup_years$lineup_years, ] # ZP ADD - CREATE OBJECT: lineup dist stats #___________________________________________________________________________ From cb756d3e4b91078478ba2e746be80bbc561084cc Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Tue, 2 Sep 2025 10:10:37 -0400 Subject: [PATCH 085/203] improve efficiency --- R/create_lkups.R | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/R/create_lkups.R b/R/create_lkups.R index e547e277..4cd9e306 100644 --- a/R/create_lkups.R +++ b/R/create_lkups.R @@ -213,21 +213,12 @@ create_lkups <- function(data_dir, versions) { # Add reporting year - cmd2 <- CJ(country_list$country_code, + cmd <- CJ(country_list$country_code, ly, c("national", "urban", "rural")) - - cmd <- - country_list[rep(1:.N, - each = length(ly)) - ][, - reporting_year := rep(ly, times = ncountries) - # Add reporting level - ][rep(1:.N, each = 3), - ][, - reporting_level := - rep(c("national", "urban", "rural"), - times = (length(ly)*ncountries))] + setnames(cmd, new = c('country_code', + 'reporting_year', + 'reporting_level')) cmd <- joyn::joyn(x = cmd, y = refy_lkup, From 11c5e440390c476f9a1e230044701d19383c5927 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Tue, 2 Sep 2025 10:14:08 -0400 Subject: [PATCH 086/203] tempooral fix complete --- R/create_lkups.R | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/R/create_lkups.R b/R/create_lkups.R index 4cd9e306..d4d26413 100644 --- a/R/create_lkups.R +++ b/R/create_lkups.R @@ -300,12 +300,10 @@ create_lkups <- function(data_dir, versions) { # Temporal fix refy_lkup <- joyn::joyn(refy_lkup, country_list, - by = c('country_code', - 'reporting_year', - 'reporting_level'), + by = "country_code", keep = "left", reportvar = FALSE, - match_type = "1:1", + match_type = "m:1", update_values = TRUE) From 6cfcca9e55338d33305b15390df4cb3da3c4734f Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Tue, 2 Sep 2025 16:35:50 -0400 Subject: [PATCH 087/203] update population --- R/create_lkups.R | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/R/create_lkups.R b/R/create_lkups.R index d4d26413..2e30c646 100644 --- a/R/create_lkups.R +++ b/R/create_lkups.R @@ -307,6 +307,25 @@ create_lkups <- function(data_dir, versions) { update_values = TRUE) + # merge population + popl <- pivot(pop, + ids = c("country_code", "data_level"), + names = list(variable = "reporting_year", + value = "reporting_pop"), + how = "longer") |> + frename(data_level = reporting_level) + + refy_lkup <- joyn::joyn(refy_lkup, popl, + by = c('country_code', + 'reporting_year', + 'reporting_level'), + keep = "left", + reportvar = FALSE, + match_type = "1:1", + update_values = TRUE) + + + # --- END inclussion of CMD data. From ea094e1c0eb554e67682513774fbbccd51e9c410 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Wed, 3 Sep 2025 12:08:09 -0400 Subject: [PATCH 088/203] make sure we don't have factors in years of population --- R/create_lkups.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/create_lkups.R b/R/create_lkups.R index 2e30c646..7556f2ad 100644 --- a/R/create_lkups.R +++ b/R/create_lkups.R @@ -313,6 +313,7 @@ create_lkups <- function(data_dir, versions) { names = list(variable = "reporting_year", value = "reporting_pop"), how = "longer") |> + ftransform(reporting_year = as_integer_factor(reporting_year)) |> frename(data_level = reporting_level) refy_lkup <- joyn::joyn(refy_lkup, popl, From 9ad8a6dd40cd7b56abe49fd6e811e84202e92bb3 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Thu, 4 Sep 2025 17:15:55 -0400 Subject: [PATCH 089/203] fix bug in country_list --- R/create_lkups.R | 1 + R/pip_grp_new.R | 4 ++-- man/pip_agg.Rd | 16 +++++++++++++++- man/pip_gg.Rd | 20 -------------------- 4 files changed, 18 insertions(+), 23 deletions(-) delete mode 100644 man/pip_gg.Rd diff --git a/R/create_lkups.R b/R/create_lkups.R index 7556f2ad..e335511a 100644 --- a/R/create_lkups.R +++ b/R/create_lkups.R @@ -92,6 +92,7 @@ create_lkups <- function(data_dir, versions) { ## country_list ---- cl_lkup_path <- fs::path(data_dir, "_aux/country_list.fst") country_list <- fst::read_fst(cl_lkup_path, as.data.table = TRUE) + data.table::setnames(country_list, 'region', 'region_name') # Why is this necessary? ## countries ---- cts_path <- fs::path(data_dir, "_aux/countries.fst") diff --git a/R/pip_grp_new.R b/R/pip_grp_new.R index 184c0583..352e652d 100644 --- a/R/pip_grp_new.R +++ b/R/pip_grp_new.R @@ -1,5 +1,5 @@ #' New way to estimate Aggregate data -#' @rdname pip_gg +#' @rdname pip_agg pip_grp_new <- \(country = "ALL", year = "ALL", povline = 1.9, @@ -46,7 +46,7 @@ pip_grp_new <- \(country = "ALL", } out <- treat_cache_and_main(out, cache_file_path = cache_file_path, - lkup = lkup, fill_gaps = fill_gaps) + lkup = lkup, fill_gaps = TRUE) # return empty dataframe if no metadata is found if (nrow(out) == 0) { diff --git a/man/pip_agg.Rd b/man/pip_agg.Rd index 9c197ecb..3eb27292 100644 --- a/man/pip_agg.Rd +++ b/man/pip_agg.Rd @@ -1,8 +1,9 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pip_agg.R, R/pip_grp_logic.R +% Please edit documentation in R/pip_agg.R, R/pip_grp_logic.R, R/pip_grp_new.R \name{pip_agg} \alias{pip_agg} \alias{pip_grp_logic} +\alias{pip_grp_new} \title{Logic for computing new aggregate} \usage{ pip_agg( @@ -30,6 +31,17 @@ pip_grp_logic( lkup_hash = lkup$cache_data_id$hash_pip_grp, additional_ind = FALSE ) + +pip_grp_new( + country = "ALL", + year = "ALL", + povline = 1.9, + welfare_type = c("all", "consumption", "income"), + reporting_level = c("all", "national"), + lkup, + censor = TRUE, + additional_ind = FALSE +) } \arguments{ \item{country}{character: Country ISO 3 codes} @@ -63,6 +75,8 @@ data.table Logic for computing new aggregate Old way to estimate aggregate data + +New way to estimate Aggregate data } \examples{ \dontrun{ diff --git a/man/pip_gg.Rd b/man/pip_gg.Rd deleted file mode 100644 index 285bcec0..00000000 --- a/man/pip_gg.Rd +++ /dev/null @@ -1,20 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pip_grp_new.R -\name{pip_grp_new} -\alias{pip_grp_new} -\title{New way to estimate Aggregate data} -\usage{ -pip_grp_new( - country = "ALL", - year = "ALL", - povline = 1.9, - welfare_type = c("all", "consumption", "income"), - reporting_level = c("all", "national"), - lkup, - censor = TRUE, - additional_ind = FALSE -) -} -\description{ -New way to estimate Aggregate data -} From 772de99c58ebcd8ee5e32eaa3b82b0b5640d0112 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Thu, 4 Sep 2025 20:17:31 -0400 Subject: [PATCH 090/203] preliminary results... working but ugly --- R/fg_pip.R | 73 +++++------------------------------------ R/utils-pipdata.R | 6 +--- man/create_full_list.Rd | 12 ++----- 3 files changed, 12 insertions(+), 79 deletions(-) diff --git a/R/fg_pip.R b/R/fg_pip.R index c367f207..8d46dc31 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -58,10 +58,7 @@ fg_pip <- function(country, data_in_cache = data_present_in_master)) } - full_list <- create_full_list(country = country, - year = year, - refy_lkup = refy_lkup, - data_present_in_master = data_present_in_master) + full_list <- create_full_list(metadata = metadata) lt <- load_list_refy(input_list = full_list, @@ -251,68 +248,14 @@ fg_assign_nas_values_to_dup_cols <- function(df, #' Create full list for fg data load, not including country-years in cache #' -#' @param country Country selected in [fg_pip] function -#' @param year Year/s selected in [fg_pip] function -#' @param refy_lkup reference year lkup table with full lineups and new method -#' @param data_present_in_master cache data -#' -#' @return list -create_full_list <- function(country, year, refy_lkup, data_present_in_master) { - - if (!is.null(data_present_in_master)) { - data_not_in_cache <- - joyn::anti_join(x = refy_lkup, - y = data_present_in_master, - by = c("country_code", "reporting_year"), - reportvar = FALSE, - verbose = FALSE) - } else { - data_not_in_cache <- refy_lkup - } - - # Extract unique combinations of country-year - if (any(c("ALL", "WLD") %in% country)) { - cntry <- data_not_in_cache$country_code |> - funique() - } else { - cntry <- data_not_in_cache[country_code %in% country, - ]$country_code |> - funique() - } - if (any(c("ALL") %in% year)) { - yr <- data_not_in_cache$reporting_year |> - unique() - } else { - yr <- data_not_in_cache[reporting_year %in% year, - ]$reporting_year |> - funique() - } - dtemp <- - data_not_in_cache |> - fsubset(country_code %in% cntry & - reporting_year %in% yr) |> - fsubset(reporting_year %in% lkup$valid_years$lineup_years) |> - fselect(country_code, - year = reporting_year) |> - funique() - - # Split years by country - full_list <- dtemp[, - .(year = list(year)), - by = country_code - ][, - .(country_code, year = year) - ] - - # Convert to desired structure - full_list <- list( - country_code = full_list$country_code, - year = lapply(full_list$year, - as.numeric)) - - - full_list +#' @param metadata data table from subset_lkup()$lkup +#' @return data.table +create_full_list <- function(metadata) { + x <- metadata[, .(country_code, reporting_year) + ][, + inames := paste0(country_code, "_", reporting_year)] + x } diff --git a/R/utils-pipdata.R b/R/utils-pipdata.R index 33ce4df6..86f82230 100644 --- a/R/utils-pipdata.R +++ b/R/utils-pipdata.R @@ -6,13 +6,9 @@ #' @return character vector #' @keywords internal load_list_refy <- \(input_list, path){ - input_list <- transform_input(input_list) - - inames <- lapply(input_list, \(x) { - paste0(x$country_code, "_", x$year) - }) cores <- get_from_pipapienv("cores_to_use") + inames <- input_list$inames dl <- lapply(inames, \(x) { qs::qread(file = fs::path(path, x, ext = "qs"), diff --git a/man/create_full_list.Rd b/man/create_full_list.Rd index 249ae8b6..5fe3dbc9 100644 --- a/man/create_full_list.Rd +++ b/man/create_full_list.Rd @@ -4,19 +4,13 @@ \alias{create_full_list} \title{Create full list for fg data load, not including country-years in cache} \usage{ -create_full_list(country, year, refy_lkup, data_present_in_master) +create_full_list(metadata) } \arguments{ -\item{country}{Country selected in \link{fg_pip} function} - -\item{year}{Year/s selected in \link{fg_pip} function} - -\item{refy_lkup}{reference year lkup table with full lineups and new method} - -\item{data_present_in_master}{cache data} +\item{metadata}{data table from subset_lkup()$lkup} } \value{ -list +data.table } \description{ Create full list for fg data load, not including country-years in cache From bac2abffa738d11a038039d1e9172ac2b168ef65 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Thu, 4 Sep 2025 21:37:19 -0400 Subject: [PATCH 091/203] fix issue with missclassified CMDs --- R/create_lkups.R | 18 +++++++++++++----- R/pip_grp.R | 30 ++++++++++++++++++++---------- R/pip_grp_logic.R | 1 - R/pip_grp_new.R | 1 - R/pip_new_lineups.R | 1 - R/pip_old.R | 1 - R/pip_old_lineups.R | 1 - 7 files changed, 33 insertions(+), 20 deletions(-) diff --git a/R/create_lkups.R b/R/create_lkups.R index e335511a..9f2a847e 100644 --- a/R/create_lkups.R +++ b/R/create_lkups.R @@ -228,12 +228,18 @@ create_lkups <- function(data_dir, versions) { 'reporting_level'), keep = "anti", # reportvar = FALSE, - match_type = "1:1") + match_type = "1:1") |> + setDT() + + # get number or reporing levels left after anti join + # if less than three, it should NOT be CMD (e.g., ARG or CHN) + cmd[, n := .N, + by = c("country_code", "reporting_year")] # Delete unnecessary reporting levels - cmd <- cmd[(reporting_level == "national" & .joyn == "x") - ][, - .joyn := NULL] + cmd <- cmd[(reporting_level == "national" & + .joyn == "x" & n == 3) + ] # build some variables cmd[, @@ -252,7 +258,9 @@ create_lkups <- function(data_dir, versions) { same_direction = TRUE, # NA ? relative_distance = 1, lineup_approach = "CMD", - mult_factor = 1 + mult_factor = 1, + .joyn = NULL, + n = NULL )] diff --git a/R/pip_grp.R b/R/pip_grp.R index 8d50b238..5a470a37 100644 --- a/R/pip_grp.R +++ b/R/pip_grp.R @@ -86,7 +86,6 @@ pip_grp <- function(country = "ALL", out <- pip_aggregate_by( df = out, - group_lkup = lkup[["pop_region"]], country = country, return_cols = lkup$return_cols$pip_grp ) @@ -209,13 +208,11 @@ pip_aggregate <- function(df, by = NULL, return_cols) { #' Aggregate by predefined groups #' @param df data.frame: Response from `fg_pip_old()` or `rg_pip()`. -#' @param group_lkup data.frame: Group lkup table (pop_region) #' @param country character: Selected countries / regions #' @param return_cols list: lkup$return_cols$pip_grp object. Controls returned #' columns #' @keywords internal pip_aggregate_by <- function(df, - group_lkup, country = "ALL", return_cols) { @@ -228,7 +225,7 @@ pip_aggregate_by <- function(df, to_keep <- all_cols[!all_cols %in% c("pop_in_poverty", "estimate_type")] - df <- df[, .SD, .SDcols = to_keep] # I think we can ommit this part + # df <- df[, .SD, .SDcols = to_keep] # I think we can ommit this part # Compute stats weighted average by groups rgn <- df |> @@ -239,6 +236,19 @@ pip_aggregate_by <- function(df, fselect(c(weighted_cols, "reporting_pop")) |> fmean(w = reporting_pop, stub = FALSE) + # Africas aggregation + afr <- df |> + fgroup_by(africa_split, + africa_split_code, + reporting_year, + poverty_line) |> + fselect(c(weighted_cols, "reporting_pop")) |> + fmean(w = reporting_pop, stub = FALSE) |> + fsubset(!is.na(africa_split_code)) + + + + if (any(c("ALL", "WLD") %in% country)) { # Compute world aggregates wld <- compute_world_aggregates(rgn = rgn) @@ -248,11 +258,11 @@ pip_aggregate_by <- function(df, out <- wld } else if (country == "ALL") { # Combine with other regional aggregates - out <- rowbind(rgn, wld, fill = TRUE) + out <- rowbind(rgn, afr, wld, fill = TRUE) } } else { # Combine with other regional aggregates - out <- rowbind(rgn, wld, fill = TRUE) + out <- rowbind(rgn, afr, wld, fill = TRUE) # Return selection only if (!"ALL" %in% country) { out <- out[region_code %in% country, ] @@ -260,14 +270,14 @@ pip_aggregate_by <- function(df, } } else { # Return only selected regions - out <- rgn - } + out <- rowbind(rgn, afr, fill = TRUE) |> + _[region_code %in% country, ] + } # Compute population living in poverty - out$pop_in_poverty <- round(out$headcount * out$reporting_pop, 0) + out[, pop_in_poverty := round(headcount * reporting_pop, 0)] - return(out) } diff --git a/R/pip_grp_logic.R b/R/pip_grp_logic.R index 5503b4c7..497dc907 100644 --- a/R/pip_grp_logic.R +++ b/R/pip_grp_logic.R @@ -315,7 +315,6 @@ pip_grp_helper <- function(lcv_country, out <- pip_aggregate_by( df = out, - group_lkup = lkup[["pop_region"]], country = country, return_cols = lkup$return_cols$pip_grp ) diff --git a/R/pip_grp_new.R b/R/pip_grp_new.R index 352e652d..e4880abb 100644 --- a/R/pip_grp_new.R +++ b/R/pip_grp_new.R @@ -71,7 +71,6 @@ pip_grp_new <- \(country = "ALL", out <- pip_aggregate_by( df = out, - group_lkup = lkup[["pop_region"]], country = country, return_cols = lkup$return_cols$pip_grp ) diff --git a/R/pip_new_lineups.R b/R/pip_new_lineups.R index df71ad34..9687901a 100644 --- a/R/pip_new_lineups.R +++ b/R/pip_new_lineups.R @@ -171,7 +171,6 @@ pip_new_lineups <- function(country = "ALL", out <- pip_aggregate_by( df = out, - group_lkup = lkup[["pop_region"]], return_cols = lkup$return_cols$pip_grp) # Censor regional values diff --git a/R/pip_old.R b/R/pip_old.R index 3fc5cee7..aa9c28fd 100644 --- a/R/pip_old.R +++ b/R/pip_old.R @@ -184,7 +184,6 @@ pip_old <- function(country = "ALL", out <- pip_aggregate_by( df = out, - group_lkup = lkup[["pop_region"]], return_cols = lkup$return_cols$pip_grp ) # Censor regional values diff --git a/R/pip_old_lineups.R b/R/pip_old_lineups.R index 8994a3ba..6424d6c5 100644 --- a/R/pip_old_lineups.R +++ b/R/pip_old_lineups.R @@ -182,7 +182,6 @@ pip_old_lineups <- function(country = "ALL", out <- pip_aggregate_by( df = out, - group_lkup = lkup[["pop_region"]], return_cols = lkup$return_cols$pip_grp ) # Censor regional values From 264e62b82746c755e0b6aba63664cb4d39f2022b Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Thu, 4 Sep 2025 21:51:31 -0400 Subject: [PATCH 092/203] fix small issue --- R/pip_grp.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/pip_grp.R b/R/pip_grp.R index 5a470a37..ac9040ac 100644 --- a/R/pip_grp.R +++ b/R/pip_grp.R @@ -244,7 +244,9 @@ pip_aggregate_by <- function(df, poverty_line) |> fselect(c(weighted_cols, "reporting_pop")) |> fmean(w = reporting_pop, stub = FALSE) |> - fsubset(!is.na(africa_split_code)) + fsubset(!is.na(africa_split_code)) |> + frename(africa_split_code = region_code, + africa_split = region_name) From 1c7d05474e5b2155136c31fbe350d54bf7bd59e1 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Fri, 5 Sep 2025 09:01:48 -0400 Subject: [PATCH 093/203] set default for data_dir in get_aux_table() --- R/get_aux_table.R | 12 +++++++++++- R/pip_grp.R | 3 ++- R/pip_new_lineups.R | 3 ++- R/utils-pipdata.R | 9 ++++++++- 4 files changed, 23 insertions(+), 4 deletions(-) diff --git a/R/get_aux_table.R b/R/get_aux_table.R index 93604585..5f2bae95 100644 --- a/R/get_aux_table.R +++ b/R/get_aux_table.R @@ -7,7 +7,17 @@ #' @return data.frame #' @export #' -get_aux_table <- function(data_dir, table, long_format = FALSE) { +get_aux_table <- function(data_dir = NULL, + table, long_format = FALSE) { + + if (is.null(data_dir)) { + if (exists("lkup", inherits = TRUE)) { + data_dir <- get("lkup", inherits = TRUE)$data_root + } else { + cli::cli_abort("{.code data_dir} not defined and {.field lkup} not found.") + } +} + if (long_format && !table %in% get_valid_aux_long_format_tables()) { long_format <- FALSE } diff --git a/R/pip_grp.R b/R/pip_grp.R index ac9040ac..842a1e16 100644 --- a/R/pip_grp.R +++ b/R/pip_grp.R @@ -237,6 +237,7 @@ pip_aggregate_by <- function(df, fmean(w = reporting_pop, stub = FALSE) # Africas aggregation + if (any(c("ALL", "WLD") %in% country)) { afr <- df |> fgroup_by(africa_split, africa_split_code, @@ -248,7 +249,7 @@ pip_aggregate_by <- function(df, frename(africa_split_code = region_code, africa_split = region_name) - + } if (any(c("ALL", "WLD") %in% country)) { diff --git a/R/pip_new_lineups.R b/R/pip_new_lineups.R index 9687901a..5153f037 100644 --- a/R/pip_new_lineups.R +++ b/R/pip_new_lineups.R @@ -329,7 +329,8 @@ treat_cache_and_main <- \(out, cache_file_path, validate_country_codes <- \(country, lkup) { cls <- lkup$aux_files$country_list$country_code |> - unique() + unique() |> + c("ALL") if (any(!country %in% cls)) { wcls <- which(!country %in% cls) diff --git a/R/utils-pipdata.R b/R/utils-pipdata.R index 86f82230..39def155 100644 --- a/R/utils-pipdata.R +++ b/R/utils-pipdata.R @@ -10,7 +10,14 @@ load_list_refy <- \(input_list, path){ cores <- get_from_pipapienv("cores_to_use") inames <- input_list$inames - dl <- lapply(inames, \(x) { + seq_flex <- if (interactive()) { + cli::cli_progress_along + } else { + base::seq_along + } + + dl <- lapply(seq_flex(inames), \(i) { + x <- inames[i] qs::qread(file = fs::path(path, x, ext = "qs"), nthreads = cores) }) |> From 9397c8e4fd28fa2eeeefa8947b091522544638b4 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Fri, 5 Sep 2025 16:03:25 -0400 Subject: [PATCH 094/203] this is the most recent --- R/pip_grp.R | 48 ++++++++++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/R/pip_grp.R b/R/pip_grp.R index 842a1e16..15281b76 100644 --- a/R/pip_grp.R +++ b/R/pip_grp.R @@ -214,7 +214,7 @@ pip_aggregate <- function(df, by = NULL, return_cols) { #' @keywords internal pip_aggregate_by <- function(df, country = "ALL", - return_cols) { + return_cols = NULL) { all_cols <- return_cols$cols weighted_cols <- return_cols$weighted_average_cols @@ -236,9 +236,17 @@ pip_aggregate_by <- function(df, fselect(c(weighted_cols, "reporting_pop")) |> fmean(w = reporting_pop, stub = FALSE) - # Africas aggregation + # World aggregation if (any(c("ALL", "WLD") %in% country)) { - afr <- df |> + # Compute world aggregates + wld <- compute_world_aggregates(rgn = rgn) + } else { + wld <- NULL + } + + # Africas aggregation + if (any(c("ALL", "AFE", "AFW") %in% country)) { + rgn <- df |> fgroup_by(africa_split, africa_split_code, reporting_year, @@ -247,40 +255,28 @@ pip_aggregate_by <- function(df, fmean(w = reporting_pop, stub = FALSE) |> fsubset(!is.na(africa_split_code)) |> frename(africa_split_code = region_code, - africa_split = region_name) - + africa_split = region_name) |> + rowbind(rgn, fill = TRUE) } - if (any(c("ALL", "WLD") %in% country)) { - # Compute world aggregates - wld <- compute_world_aggregates(rgn = rgn) if (length(country) == 1) { if (country == "WLD") { # Return only world aggregate - out <- wld - } else if (country == "ALL") { - # Combine with other regional aggregates - out <- rowbind(rgn, afr, wld, fill = TRUE) - } - } else { - # Combine with other regional aggregates - out <- rowbind(rgn, afr, wld, fill = TRUE) - # Return selection only - if (!"ALL" %in% country) { - out <- out[region_code %in% country, ] + wld[, pop_in_poverty := round(headcount * reporting_pop, 0)] + return(wld) } } - } else { - # Return only selected regions - out <- rowbind(rgn, afr, fill = TRUE) |> - _[region_code %in% country, ] - } + # Combine with other regional aggregates + out <- rowbind(rgn, wld, fill = TRUE) + out[, pop_in_poverty := round(headcount * reporting_pop, 0)] - # Compute population living in poverty - out[, pop_in_poverty := round(headcount * reporting_pop, 0)] + if ("ALL" %in% country) { + return(out) + } + out[region_code %in% country, ] } From 0c6b838d9fdd1949dc96f4caec0db65b76c0970b Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Sat, 6 Sep 2025 10:14:55 -0400 Subject: [PATCH 095/203] fix create_countries_vctr addint lkup --- R/create_countries_vctr.R | 7 ++++--- R/pip_grp_logic.R | 7 ++++--- R/pip_old.R | 3 +-- R/pip_old_lineups.R | 3 +-- tests/testthat/test-create_countries_vctr.R | 21 +++++++-------------- 5 files changed, 17 insertions(+), 24 deletions(-) diff --git a/R/create_countries_vctr.R b/R/create_countries_vctr.R index 230118c4..72eb5c8e 100644 --- a/R/create_countries_vctr.R +++ b/R/create_countries_vctr.R @@ -6,14 +6,15 @@ #' #' @inheritParams pip #' @param valid_years list: Valid years information provided through lkup object -#' @param aux_files list: List of auxiliary tables provided through lkup object +#' @param lkup lkup object #' #' @return a list of vectors with countries and regions code to be used in #' `pip()` and `pip_grp()` create_countries_vctr <- function(country, year, - valid_years, - aux_files) { + lkup) { + valid_years <- lkup$valid_years + aux_files <- lkup$aux_files # STEP 1: Setup ---- ## init Return list ---- diff --git a/R/pip_grp_logic.R b/R/pip_grp_logic.R index 497dc907..216bb0f1 100644 --- a/R/pip_grp_logic.R +++ b/R/pip_grp_logic.R @@ -44,8 +44,7 @@ pip_grp_logic <- function(country = "ALL", create_countries_vctr( country = country, year = year, - valid_years = lkup$valid_years, - aux_files = lkup$aux_files + lkup = lkup ) # use the same names as before to avoid inconsistencies @@ -84,7 +83,9 @@ pip_grp_logic <- function(country = "ALL", lkup = lkup ) # For now just rowbinding two dataframes, but we would need to use it more smartly in the future - fg_pip_master <- collapse::rowbind(fg_pip_master) + fg_pip_master <- collapse::rowbind(fg_pip_master, + # THis should not be necessary + fill = TRUE) if (!data.table::is.data.table(fg_pip_master)) { setDT(fg_pip_master) diff --git a/R/pip_old.R b/R/pip_old.R index aa9c28fd..ae62bb22 100644 --- a/R/pip_old.R +++ b/R/pip_old.R @@ -100,8 +100,7 @@ pip_old <- function(country = "ALL", create_countries_vctr( country = country, year = year, - valid_years = lkup$valid_years, - aux_files = lkup$aux_files + lkup = lkup ) # lcv$est_ctrs has all the country_code that we are interested in diff --git a/R/pip_old_lineups.R b/R/pip_old_lineups.R index 6424d6c5..fc01620b 100644 --- a/R/pip_old_lineups.R +++ b/R/pip_old_lineups.R @@ -102,8 +102,7 @@ pip_old_lineups <- function(country = "ALL", create_countries_vctr( country = country, year = year, - valid_years = lkup$valid_years, - aux_files = lkup$aux_files + lkup = lkup ) # lcv$est_ctrs has all the country_code that we are interested in diff --git a/tests/testthat/test-create_countries_vctr.R b/tests/testthat/test-create_countries_vctr.R index fae4b133..2c4dbc7b 100644 --- a/tests/testthat/test-create_countries_vctr.R +++ b/tests/testthat/test-create_countries_vctr.R @@ -457,8 +457,7 @@ test_that("create_vector_countries output the expected object", { year = "2010" out <- create_countries_vctr(country = country, year = year, - valid_years = valid_years, - aux_files = aux_files) + lkup = lkup) expect_true(is.list(out)) expect_equal(sort(names(out)), sort(c("ctr_off_reg", @@ -479,8 +478,7 @@ test_that("create_vector_countries works for countries selection", { year = "ALL" out <- create_countries_vctr(country = country, year = year, - valid_years = valid_years, - aux_files = aux_files) + lkup = lkup) # Selects all countries with survey data when country="ALL" expect_equal(sort(out$est_ctrs), sort(aux_files$countries$country_code)) @@ -492,8 +490,7 @@ test_that("create_vector_countries Returns correct results when country = ALL", year = "2010" out <- create_countries_vctr(country = country, year = year, - valid_years = valid_years, - aux_files = aux_files) + lkup = lkup) # Correct official regions expect_equal(out$user_off_reg, c("ALL", @@ -514,8 +511,7 @@ test_that("create_vector_countries Returns correct results when country = WLD", year = "2010" out <- create_countries_vctr(country = country, year = year, - valid_years = valid_years, - aux_files = aux_files) + lkup = lkup) # Correct official regions expect_equal(out$user_off_reg, c("ALL", @@ -534,8 +530,7 @@ test_that("create_vector_countries Returns correct results when country = altern year = "ALL" out <- create_countries_vctr(country = country, year = year, - valid_years = valid_years, - aux_files = aux_files) + lkup = lkup) # Returns no official region expect_true(is.null(out$user_off_reg)) @@ -554,8 +549,7 @@ test_that("create_vector_countries returns correct results when country = offica year = "ALL" out <- create_countries_vctr(country = country, year = year, - valid_years = valid_years, - aux_files = aux_files) + lkup = lkup) # Returns no official region expect_equal(out$user_off_reg, country) @@ -574,8 +568,7 @@ test_that("create_vector_countries returns correct results when country = aggreg year = "ALL" out <- create_countries_vctr(country = country, year = year, - valid_years = valid_years, - aux_files = aux_files) + lkup = lkup) # Returns no official region expect_equal(out$user_off_reg, off_country) From f3bd966d44eecf7c51448c0167baba9b135cca28 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Sat, 6 Sep 2025 16:42:22 -0400 Subject: [PATCH 096/203] fix test --- tests/testthat/test-create_countries_vctr.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-create_countries_vctr.R b/tests/testthat/test-create_countries_vctr.R index 2c4dbc7b..fb88a02e 100644 --- a/tests/testthat/test-create_countries_vctr.R +++ b/tests/testthat/test-create_countries_vctr.R @@ -568,7 +568,7 @@ test_that("create_vector_countries returns correct results when country = aggreg year = "ALL" out <- create_countries_vctr(country = country, year = year, - lkup = lkup) + lkup = lkup) # Returns no official region expect_equal(out$user_off_reg, off_country) From 6d15ac19d896225cf7a6e1bbd27caa7afd061a4a Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Sat, 6 Sep 2025 17:19:47 -0400 Subject: [PATCH 097/203] remporal fix in number of aggregates to include --- R/create_countries_vctr.R | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/R/create_countries_vctr.R b/R/create_countries_vctr.R index 72eb5c8e..4cb985f1 100644 --- a/R/create_countries_vctr.R +++ b/R/create_countries_vctr.R @@ -54,7 +54,12 @@ create_countries_vctr <- function(country, } # STEP 2: Identify regions ---- ## All available aggregates ---- - aggs <- aux_files$regions ## all aggregates + aggs <- aux_files$regions |> ## all aggregates + # temporal filter + _[grouping_type %in% c("region", "regionpcn", "africa_split", "word")] + + + ## Official grouping type ---- off_gt <- c("region", "world") #c("region") ## Official valid region codes ---- From 6435d70ad1cb6fe8cc8b17101d300aba5f3ad796 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Sat, 6 Sep 2025 18:00:03 -0400 Subject: [PATCH 098/203] include region and regionpcn in old way --- R/fg_pip_old.R | 3 ++- R/pip_grp_logic.R | 12 ++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/R/fg_pip_old.R b/R/fg_pip_old.R index b5fab88b..2da40213 100644 --- a/R/fg_pip_old.R +++ b/R/fg_pip_old.R @@ -52,7 +52,8 @@ fg_pip_old <- function(country, # Return empty dataframe if no metadata is found if (nrow(metadata) == 0) { - return(list(main_data = pipapi::empty_response_fg, data_in_cache = data_present_in_master)) + return(list(main_data = pipapi::empty_response_fg, + data_in_cache = data_present_in_master)) } unique_survey_files <- unique(metadata$data_interpolation_id) diff --git a/R/pip_grp_logic.R b/R/pip_grp_logic.R index 216bb0f1..e6f54e9b 100644 --- a/R/pip_grp_logic.R +++ b/R/pip_grp_logic.R @@ -83,13 +83,13 @@ pip_grp_logic <- function(country = "ALL", lkup = lkup ) # For now just rowbinding two dataframes, but we would need to use it more smartly in the future - fg_pip_master <- collapse::rowbind(fg_pip_master, - # THis should not be necessary - fill = TRUE) + fg_pip_master <- rowbind(fg_pip_master, + # THis should not be necessary + fill = TRUE) |> + setDT() - if (!data.table::is.data.table(fg_pip_master)) { - setDT(fg_pip_master) - } + fg_pip_master[is.na(mean), + mean := predicted_mean_ppp] add_vars_out_of_pipeline(fg_pip_master, fill_gaps = TRUE, lkup = lkup) From b6d24cf610235fad266c085cd2de53f298f46053 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Sat, 6 Sep 2025 18:13:55 -0400 Subject: [PATCH 099/203] aggregation working for old, new, and previous... not pretty but works --- R/pip_grp.R | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/R/pip_grp.R b/R/pip_grp.R index 15281b76..df0797c2 100644 --- a/R/pip_grp.R +++ b/R/pip_grp.R @@ -258,6 +258,20 @@ pip_aggregate_by <- function(df, africa_split = region_name) |> rowbind(rgn, fill = TRUE) } + # Vintage aggregation + if ("ALL" %in% country & "regionpcn_code" %in% names(df)) { + rgn <- df |> + fgroup_by(regionpcn, + regionpcn_code, + reporting_year, + poverty_line) |> + fselect(c(weighted_cols, "reporting_pop")) |> + fmean(w = reporting_pop, stub = FALSE) |> + fsubset(!is.na(regionpcn_code)) |> + frename(regionpcn_code = region_code, + regionpcn = region_name) |> + rowbind(rgn, fill = TRUE) + } if (length(country) == 1) { From 82804149d96d99e01169676a02519e50a33b5153 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Mon, 8 Sep 2025 18:20:53 -0400 Subject: [PATCH 100/203] first version of cumsum instead of mean --- NAMESPACE | 1 + R/compute_fgt_new.R | 133 ++++++++++++++++++++++++++++++++++- R/fg_pip.R | 14 ++-- R/utils-pipdata.R | 16 +++-- man/create_countries_vctr.Rd | 6 +- man/fgt_watts_cumsum.Rd | 21 ++++++ man/get_aux_table.Rd | 2 +- man/load_list_refy.Rd | 2 +- man/pip_aggregate_by.Rd | 4 +- 9 files changed, 176 insertions(+), 23 deletions(-) create mode 100644 man/fgt_watts_cumsum.Rd diff --git a/NAMESPACE b/NAMESPACE index 96b78000..e9c62630 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -11,6 +11,7 @@ export(citation_from_version) export(create_etag_header) export(create_return_cols) export(create_versioned_lkups) +export(fgt_watts_cumsum) export(fillin_list) export(get_aux_table) export(get_aux_table_ui) diff --git a/R/compute_fgt_new.R b/R/compute_fgt_new.R index 2d09a1e8..8b4f0b0c 100644 --- a/R/compute_fgt_new.R +++ b/R/compute_fgt_new.R @@ -1,3 +1,5 @@ +# OLD APPROACH WITH MEAN -------------- + # Efficient FGT calculation for a data.table and vector of poverty lines #' Title #' @@ -145,7 +147,9 @@ compute_fgt <- function(w, wt, povlines) { #' @rdname map_fgt #' @keywords internal DT_fgt_by_rl <- \(x, y, nx, povline) { - DT_fgt <- lapply(names(y), \(rl) { + uni_rl <- names(y) |> + unique() + DT_fgt <- lapply(uni_rl, \(rl) { idx <- y[[rl]] w <- x[idx, welfare] @@ -343,5 +347,132 @@ pov_from_DT <- function(DT, povline, g, cores = 1) { +# NEW ARPPOACH USING CUMSUM ------------------ + +#' compute fgt and watts using cumulative welfare rather than means +#' +#' @param y numeric welfare (sorted ascending within the subgroup) +#' @param w numeric weights (same order as y) +#' @param lines numeric vector of poverty lines +#' +#' @return Returns a data.table with columns: line, fgt0,fgt1,fgt2,watts +#' @export +fgt_watts_cumsum <- function(y, w, lines) { + # types + + y <- as.double(y) + w <- as.double(w) + lines <- as.double(lines) + n <- length(y) + + if (n == 0L) { + return(data.table(line = lines, + fgt0 = 0, fgt1 = 0, fgt2 = 0, watts = 0)) + } + + # total weight (collapse: fsum is very fast) + W <- fsum(w) + + # cumulative sums (collapse: fcumsum is multithreaded-aware, very fast) + cw <- fcumsum(w) + cwy <- fcumsum(w * y) + cwy2 <- fcumsum(w * (y * y)) + + # Watts needs log(y) with y>0; clamp tiny positives for safety + y_pos <- pmax(y, 1e-12) + cwlog <- fcumsum(w * log(y_pos)) + + # index of last obs <= line for each z (0..n) + i <- findInterval(lines, y) + + take <- function(cs) { + out <- cs[pmax.int(i, 0L)] + out[i == 0L] <- 0 + out + } + + cw_i <- take(cw) + cwy_i <- take(cwy) + cwy2_i <- take(cwy2) + cwlog_i <- take(cwlog) + + z <- lines + z2 <- z * z + z_s <- pmax(z, 1e-12) + z2_s <- pmax(z2, 1e-24) + + data.table( + line = lines, + fgt0 = cw_i / W, + fgt1 = (z * cw_i - cwy_i) / (z_s * W), + fgt2 = (z2 * cw_i - 2 * z * cwy_i + cwy2_i) / (z2_s * W), + watts = (log(z_s) * cw_i - cwlog_i) / W + ) +} + + + +# tl: list of data.tables, each with columns id, reporting_level, welfare, weight +# Assumes: within each element, data are sorted by welfare within each reporting_level +fgt_watts_list <- function(tl, lines) { + rbindlist(lapply(tl, function(dt) { + # compute per (id, reporting_level) + dt[, fgt_watts_cumsum(welfare, weight, lines), + by = .(id, reporting_level)] + }), + fill = TRUE) +} + + +# DT: one big data.table with id, reporting_level, welfare, weight +# Assumes: within each (id, reporting_level), rows are sorted by welfare + + +# DT <- rbindlist(lt) + + +fgt_watts_dt <- function(DT, lines) { + DT[, fgt_watts_cumsum(welfare, weight, lines), + by = .(id, reporting_level)] +} + + +# lines <- c(1:100) +# rlt <- fgt_watts_list(lt, lines) +# rdt <- fgt_watts_dt(DT, lines) +# +# waldo::compare(rlt, rdt) +# +# +# +# +# bench <- microbenchmark::microbenchmark( +# times = 50, +# lt = { +# fgt_watts_list(lt, lines) +# }, +# dt = { +# fgt_watts_dt(DT, lines) +# +# } +# ) +# if (requireNamespace("highcharter")) { +# hc_dt <- highcharter::data_to_boxplot(bench, +# time, +# expr, +# add_outliers = FALSE, +# name = "Time in milliseconds") +# +# highcharter::highchart() |> +# highcharter::hc_xAxis(type = "category") |> +# highcharter::hc_chart(inverted=TRUE) |> +# highcharter::hc_add_series_list(hc_dt) +# +# } else { +# boxplot(bench, outline = FALSE) +# } +# + + diff --git a/R/fg_pip.R b/R/fg_pip.R index 8d46dc31..10d993e1 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -61,18 +61,17 @@ fg_pip <- function(country, full_list <- create_full_list(metadata = metadata) lt <- - load_list_refy(input_list = full_list, - path = fs::path(data_dir, "lineup_data")) + load_list_refy(input_list = full_list) # lt <- lapply(lt, \(x) { # add_attributes_as_columns_vectorized(x) # }) # Extract some attributes - lt_att <- get_lt_attr(lt) + # lt_att <- get_lt_attr(lt) # get rows indices - l_rl_rows <- get_rl_rows(lt_att) + # l_rl_rows <- get_rl_rows(lt_att) # DT <- map_lt_to_dt(lt, l_rl_rows) @@ -252,10 +251,9 @@ fg_assign_nas_values_to_dup_cols <- function(df, #' @return data.table create_full_list <- function(metadata) { - x <- metadata[, .(country_code, reporting_year) - ][, - inames := paste0(country_code, "_", reporting_year)] - x + metadata[, path] |> + funique() + } diff --git a/R/utils-pipdata.R b/R/utils-pipdata.R index 39def155..a30d1160 100644 --- a/R/utils-pipdata.R +++ b/R/utils-pipdata.R @@ -5,10 +5,11 @@ #' #' @return character vector #' @keywords internal -load_list_refy <- \(input_list, path){ +load_list_refy <- \(input_list){ cores <- get_from_pipapienv("cores_to_use") - inames <- input_list$inames + inames <- fs::path_file(input_list) |> + fs::path_ext_remove() seq_flex <- if (interactive()) { cli::cli_progress_along @@ -16,10 +17,11 @@ load_list_refy <- \(input_list, path){ base::seq_along } - dl <- lapply(seq_flex(inames), \(i) { - x <- inames[i] - qs::qread(file = fs::path(path, x, ext = "qs"), - nthreads = cores) + dl <- lapply(seq_flex(input_list), \(x) { + qfile <- input_list[x] + qname <- inames[x] + qs::qread(file = qfile, nthreads = cores) |> + _[, id := qname] }) |> setNames(inames) @@ -27,6 +29,8 @@ load_list_refy <- \(input_list, path){ } + + #' transform input list #' #' @inheritParams load_list_refy diff --git a/man/create_countries_vctr.Rd b/man/create_countries_vctr.Rd index 87a18e80..c62f8276 100644 --- a/man/create_countries_vctr.Rd +++ b/man/create_countries_vctr.Rd @@ -4,16 +4,16 @@ \alias{create_countries_vctr} \title{Create countries vectors} \usage{ -create_countries_vctr(country, year, valid_years, aux_files) +create_countries_vctr(country, year, lkup) } \arguments{ \item{country}{character: Country ISO 3 codes} \item{year}{integer: Reporting year} -\item{valid_years}{list: Valid years information provided through lkup object} +\item{lkup}{lkup object} -\item{aux_files}{list: List of auxiliary tables provided through lkup object} +\item{valid_years}{list: Valid years information provided through lkup object} } \value{ a list of vectors with countries and regions code to be used in diff --git a/man/fgt_watts_cumsum.Rd b/man/fgt_watts_cumsum.Rd new file mode 100644 index 00000000..74d7ecdf --- /dev/null +++ b/man/fgt_watts_cumsum.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/compute_fgt_new.R +\name{fgt_watts_cumsum} +\alias{fgt_watts_cumsum} +\title{compute fgt and watts using cumulative welfare rather than means} +\usage{ +fgt_watts_cumsum(y, w, lines) +} +\arguments{ +\item{y}{numeric welfare (sorted ascending within the subgroup)} + +\item{w}{numeric weights (same order as y)} + +\item{lines}{numeric vector of poverty lines} +} +\value{ +Returns a data.table with columns: line, fgt0,fgt1,fgt2,watts +} +\description{ +compute fgt and watts using cumulative welfare rather than means +} diff --git a/man/get_aux_table.Rd b/man/get_aux_table.Rd index 66a90941..f11a9261 100644 --- a/man/get_aux_table.Rd +++ b/man/get_aux_table.Rd @@ -4,7 +4,7 @@ \alias{get_aux_table} \title{Return specified auxiliary data} \usage{ -get_aux_table(data_dir, table, long_format = FALSE) +get_aux_table(data_dir = NULL, table, long_format = FALSE) } \arguments{ \item{data_dir}{character: Data directory} diff --git a/man/load_list_refy.Rd b/man/load_list_refy.Rd index 59a37d7c..b1533a4f 100644 --- a/man/load_list_refy.Rd +++ b/man/load_list_refy.Rd @@ -4,7 +4,7 @@ \alias{load_list_refy} \title{load refy list} \usage{ -load_list_refy(input_list, path) +load_list_refy(input_list) } \arguments{ \item{input_list}{list. output from \link{create_full_list}} diff --git a/man/pip_aggregate_by.Rd b/man/pip_aggregate_by.Rd index 98bf18f7..642567e7 100644 --- a/man/pip_aggregate_by.Rd +++ b/man/pip_aggregate_by.Rd @@ -4,13 +4,11 @@ \alias{pip_aggregate_by} \title{Aggregate by predefined groups} \usage{ -pip_aggregate_by(df, group_lkup, country = "ALL", return_cols) +pip_aggregate_by(df, country = "ALL", return_cols = NULL) } \arguments{ \item{df}{data.frame: Response from \code{fg_pip_old()} or \code{rg_pip()}.} -\item{group_lkup}{data.frame: Group lkup table (pop_region)} - \item{country}{character: Selected countries / regions} \item{return_cols}{list: lkup$return_cols$pip_grp object. Controls returned From 8d736fc8ec36496d3b572caca0393f152ecd0034 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Wed, 10 Sep 2025 11:43:43 -0400 Subject: [PATCH 101/203] small update --- R/compute_fgt_new.R | 76 +++++++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 34 deletions(-) diff --git a/R/compute_fgt_new.R b/R/compute_fgt_new.R index 8b4f0b0c..75a91693 100644 --- a/R/compute_fgt_new.R +++ b/R/compute_fgt_new.R @@ -357,57 +357,65 @@ pov_from_DT <- function(DT, povline, g, cores = 1) { #' #' @return Returns a data.table with columns: line, fgt0,fgt1,fgt2,watts #' @export -fgt_watts_cumsum <- function(y, w, lines) { - # types +fgt_watts_cumsum <- function(y, w, lines, + check_sorted = FALSE) { + # 1) (Optional) type normalization: only coerce if needed + if (!is.double(y)) y <- as.double(y) + if (!is.double(w)) w <- as.double(w) + if (!is.double(lines)) lines <- as.double(lines) + + # 2) Optional sort check (fail fast during validation) + if (check_sorted) { + if (is.unsorted(y, strictly = FALSE)) { + cli::cli_abort("{.code y} must be sorted ascending within the subgroup. + Set {.code check_sorted=FALSE} to skip.") + } + } - y <- as.double(y) - w <- as.double(w) - lines <- as.double(lines) + # 3) Handle empty group quickly n <- length(y) - if (n == 0L) { return(data.table(line = lines, fgt0 = 0, fgt1 = 0, fgt2 = 0, watts = 0)) } - # total weight (collapse: fsum is very fast) - W <- fsum(w) - - # cumulative sums (collapse: fcumsum is multithreaded-aware, very fast) - cw <- fcumsum(w) - cwy <- fcumsum(w * y) - cwy2 <- fcumsum(w * (y * y)) + # 4) Group totals and prefix sums (collapse is very fast) + W <- fsum(w) # total weight + cw <- fcumsum(w) # cum pop + cwy <- fcumsum(w * y) # cum welfare + cwy2 <- fcumsum(w * (y*y)) # cum pop by welfare sqr - # Watts needs log(y) with y>0; clamp tiny positives for safety + # Watts: use log(y) (y must be positive; clamp tiny to avoid -Inf) y_pos <- pmax(y, 1e-12) - cwlog <- fcumsum(w * log(y_pos)) + cwlog <- fcumsum(w * log(y_pos)) # Σ w log y - # index of last obs <= line for each z (0..n) + # 5) For each z in `lines`, i = number of y's ≤ z (0..n) + # findInterval is C-coded; for increasing y, i = count(y ≤ z) i <- findInterval(lines, y) - take <- function(cs) { - out <- cs[pmax.int(i, 0L)] - out[i == 0L] <- 0 - out - } + # 6) Pull the needed prefixes at indices i; use the "prefix trick" for i=0 + # so that when no poor, sum is 0: c(0, cs)[i + 1]. + take <- function(cs) c(0, cs)[i + 1L] - cw_i <- take(cw) - cwy_i <- take(cwy) - cwy2_i <- take(cwy2) - cwlog_i <- take(cwlog) + cw_i <- take(cw) # Σ_{y≤z} w + cwy_i <- take(cwy) # Σ_{y≤z} w y + cwy2_i <- take(cwy2) # Σ_{y≤z} w y^2 + cwlog_i <- take(cwlog) # Σ_{y≤z} w log y + # 7) Assemble the indices, with small clamps for numerical safety z <- lines z2 <- z * z - z_s <- pmax(z, 1e-12) - z2_s <- pmax(z2, 1e-24) + z_s <- pmax(z, 1e-12) # avoid division by 0 in FGT1, Watts + z2_s <- pmax(z2, 1e-24) # avoid division by 0 in FGT2 - data.table( - line = lines, - fgt0 = cw_i / W, - fgt1 = (z * cw_i - cwy_i) / (z_s * W), - fgt2 = (z2 * cw_i - 2 * z * cwy_i + cwy2_i) / (z2_s * W), - watts = (log(z_s) * cw_i - cwlog_i) / W - ) + # 8) Closed-form formulas from the algebra above + fgt0 <- cw_i / W + fgt1 <- (z * cw_i - cwy_i) / (z_s * W) + fgt2 <- (z2 * cw_i - 2 * z * cwy_i + cwy2_i) / (z2_s * W) + watts <- (log(z_s) * cw_i - cwlog_i) / W + + # 9) Return tidy result + data.table(line = lines, fgt0 = fgt0, fgt1 = fgt1, fgt2 = fgt2, watts = watts) } From fadce923ba15372895782407ac0c470bb3a73046 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Wed, 10 Sep 2025 11:47:07 -0400 Subject: [PATCH 102/203] replace load_list_refy to read from fst files --- R/utils-pipdata.R | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/R/utils-pipdata.R b/R/utils-pipdata.R index a30d1160..dc5b823d 100644 --- a/R/utils-pipdata.R +++ b/R/utils-pipdata.R @@ -7,25 +7,28 @@ #' @keywords internal load_list_refy <- \(input_list){ - cores <- get_from_pipapienv("cores_to_use") - inames <- fs::path_file(input_list) |> + id_names <- input_list |> + fs::path_file() |> fs::path_ext_remove() - seq_flex <- if (interactive()) { - cli::cli_progress_along - } else { - base::seq_along - } + seq_flex <- if (interactive()) { + cli::cli_progress_along + } else { + base::seq_along + } + - dl <- lapply(seq_flex(input_list), \(x) { - qfile <- input_list[x] - qname <- inames[x] - qs::qread(file = qfile, nthreads = cores) |> - _[, id := qname] - }) |> - setNames(inames) + lfst <- lapply(seq_flex(input_list), + \(i) { + x <- lup_files[i] + idn <- fs::path_file(x) |> + fs::path_ext_remove() + fst::read_fst(x, as.data.table = TRUE) |> + _[, id := idn] + }) |> + setNames(id_names) - dl + lfst } From 968bbf3a0d6708c38799b08be38dce0801efe070 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Wed, 10 Sep 2025 12:07:05 -0400 Subject: [PATCH 103/203] remove first attempt of cumcum method --- R/compute_fgt_new.R | 135 -------------------------------------------- 1 file changed, 135 deletions(-) diff --git a/R/compute_fgt_new.R b/R/compute_fgt_new.R index 75a91693..2923e530 100644 --- a/R/compute_fgt_new.R +++ b/R/compute_fgt_new.R @@ -349,138 +349,3 @@ pov_from_DT <- function(DT, povline, g, cores = 1) { # NEW ARPPOACH USING CUMSUM ------------------ -#' compute fgt and watts using cumulative welfare rather than means -#' -#' @param y numeric welfare (sorted ascending within the subgroup) -#' @param w numeric weights (same order as y) -#' @param lines numeric vector of poverty lines -#' -#' @return Returns a data.table with columns: line, fgt0,fgt1,fgt2,watts -#' @export -fgt_watts_cumsum <- function(y, w, lines, - check_sorted = FALSE) { - # 1) (Optional) type normalization: only coerce if needed - if (!is.double(y)) y <- as.double(y) - if (!is.double(w)) w <- as.double(w) - if (!is.double(lines)) lines <- as.double(lines) - - # 2) Optional sort check (fail fast during validation) - if (check_sorted) { - if (is.unsorted(y, strictly = FALSE)) { - cli::cli_abort("{.code y} must be sorted ascending within the subgroup. - Set {.code check_sorted=FALSE} to skip.") - } - } - - # 3) Handle empty group quickly - n <- length(y) - if (n == 0L) { - return(data.table(line = lines, - fgt0 = 0, fgt1 = 0, fgt2 = 0, watts = 0)) - } - - # 4) Group totals and prefix sums (collapse is very fast) - W <- fsum(w) # total weight - cw <- fcumsum(w) # cum pop - cwy <- fcumsum(w * y) # cum welfare - cwy2 <- fcumsum(w * (y*y)) # cum pop by welfare sqr - - # Watts: use log(y) (y must be positive; clamp tiny to avoid -Inf) - y_pos <- pmax(y, 1e-12) - cwlog <- fcumsum(w * log(y_pos)) # Σ w log y - - # 5) For each z in `lines`, i = number of y's ≤ z (0..n) - # findInterval is C-coded; for increasing y, i = count(y ≤ z) - i <- findInterval(lines, y) - - # 6) Pull the needed prefixes at indices i; use the "prefix trick" for i=0 - # so that when no poor, sum is 0: c(0, cs)[i + 1]. - take <- function(cs) c(0, cs)[i + 1L] - - cw_i <- take(cw) # Σ_{y≤z} w - cwy_i <- take(cwy) # Σ_{y≤z} w y - cwy2_i <- take(cwy2) # Σ_{y≤z} w y^2 - cwlog_i <- take(cwlog) # Σ_{y≤z} w log y - - # 7) Assemble the indices, with small clamps for numerical safety - z <- lines - z2 <- z * z - z_s <- pmax(z, 1e-12) # avoid division by 0 in FGT1, Watts - z2_s <- pmax(z2, 1e-24) # avoid division by 0 in FGT2 - - # 8) Closed-form formulas from the algebra above - fgt0 <- cw_i / W - fgt1 <- (z * cw_i - cwy_i) / (z_s * W) - fgt2 <- (z2 * cw_i - 2 * z * cwy_i + cwy2_i) / (z2_s * W) - watts <- (log(z_s) * cw_i - cwlog_i) / W - - # 9) Return tidy result - data.table(line = lines, fgt0 = fgt0, fgt1 = fgt1, fgt2 = fgt2, watts = watts) -} - - - -# tl: list of data.tables, each with columns id, reporting_level, welfare, weight -# Assumes: within each element, data are sorted by welfare within each reporting_level -fgt_watts_list <- function(tl, lines) { - rbindlist(lapply(tl, function(dt) { - # compute per (id, reporting_level) - dt[, fgt_watts_cumsum(welfare, weight, lines), - by = .(id, reporting_level)] - }), - fill = TRUE) -} - - -# DT: one big data.table with id, reporting_level, welfare, weight -# Assumes: within each (id, reporting_level), rows are sorted by welfare - - -# DT <- rbindlist(lt) - - -fgt_watts_dt <- function(DT, lines) { - DT[, fgt_watts_cumsum(welfare, weight, lines), - by = .(id, reporting_level)] -} - - -# lines <- c(1:100) -# rlt <- fgt_watts_list(lt, lines) -# rdt <- fgt_watts_dt(DT, lines) -# -# waldo::compare(rlt, rdt) -# -# -# -# -# bench <- microbenchmark::microbenchmark( -# times = 50, -# lt = { -# fgt_watts_list(lt, lines) -# }, -# dt = { -# fgt_watts_dt(DT, lines) -# -# } -# ) -# if (requireNamespace("highcharter")) { -# hc_dt <- highcharter::data_to_boxplot(bench, -# time, -# expr, -# add_outliers = FALSE, -# name = "Time in milliseconds") -# -# highcharter::highchart() |> -# highcharter::hc_xAxis(type = "category") |> -# highcharter::hc_chart(inverted=TRUE) |> -# highcharter::hc_add_series_list(hc_dt) -# -# } else { -# boxplot(bench, outline = FALSE) -# } -# - - - - From fd0dc36b579bdbf3a90323fe71d4c071a6ec7510 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Wed, 10 Sep 2025 12:12:08 -0400 Subject: [PATCH 104/203] new file for new method of FGT using cumsum --- R/compute_fgt_new.R | 4 ---- R/fg_pip.R | 2 +- R/fgt_cumsum.R | 30 ++++++++++++++++++++++++++++++ 3 files changed, 31 insertions(+), 5 deletions(-) create mode 100644 R/fgt_cumsum.R diff --git a/R/compute_fgt_new.R b/R/compute_fgt_new.R index 2923e530..79cf0ecf 100644 --- a/R/compute_fgt_new.R +++ b/R/compute_fgt_new.R @@ -345,7 +345,3 @@ pov_from_DT <- function(DT, povline, g, cores = 1) { # } - - -# NEW ARPPOACH USING CUMSUM ------------------ - diff --git a/R/fg_pip.R b/R/fg_pip.R index 10d993e1..da323540 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -60,7 +60,7 @@ fg_pip <- function(country, full_list <- create_full_list(metadata = metadata) - lt <- + lfst <- load_list_refy(input_list = full_list) # lt <- lapply(lt, \(x) { diff --git a/R/fgt_cumsum.R b/R/fgt_cumsum.R new file mode 100644 index 00000000..477b327e --- /dev/null +++ b/R/fgt_cumsum.R @@ -0,0 +1,30 @@ +#' format the lfst list to usable data to estimate poverty +#' +#' @param lfst list from load_list_refy() +#' +#' @return list with DT and g (GRP object) +#' @keywords internal +format_lfst <- \(lfst) { + + DT <- rbindlist(lfst, fill = TRUE) + + # Convert to factors (is it faster?) + DT[, names(.SD) := lapply(.SD, qF), + .SDcols = c("id", "reporting_level")] + + # fix + # DT[index == 0, + # names(.SD) := 0, + # .SDcols = is.numeric] + # + # DT <- DT[!grepl("^CHN_", id)] + + ## Grouping ---------- + + g <- GRP(DT, ~ id + reporting_level, sort = FALSE) + + list(DT = DT, + g = g) +} + + From 30c50b9eca3324dedf3d05d3769fd7a6eac5afd2 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Wed, 10 Sep 2025 14:21:23 -0400 Subject: [PATCH 105/203] document helpers for fgt_cumcum --- NAMESPACE | 2 +- R/fgt_cumsum.R | 272 ++++++++++++++++++++++++++++++++++++++++ R/utils-pipdata.R | 33 ----- man/build_pair_dict.Rd | 35 ++++++ man/decode_pairs.Rd | 38 ++++++ man/dot-get_pairs_df.Rd | 22 ++++ man/encode_pairs.Rd | 41 ++++++ man/fgt_watts_cumsum.Rd | 21 ---- man/format_lfst.Rd | 38 ++++++ man/get_total_pop.Rd | 26 ++++ man/load_list_refy.Rd | 22 +++- man/transform_input.Rd | 2 +- man/update_pair_dict.Rd | 24 ++++ 13 files changed, 518 insertions(+), 58 deletions(-) create mode 100644 man/build_pair_dict.Rd create mode 100644 man/decode_pairs.Rd create mode 100644 man/dot-get_pairs_df.Rd create mode 100644 man/encode_pairs.Rd delete mode 100644 man/fgt_watts_cumsum.Rd create mode 100644 man/format_lfst.Rd create mode 100644 man/get_total_pop.Rd create mode 100644 man/update_pair_dict.Rd diff --git a/NAMESPACE b/NAMESPACE index e9c62630..9386e37b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -11,7 +11,6 @@ export(citation_from_version) export(create_etag_header) export(create_return_cols) export(create_versioned_lkups) -export(fgt_watts_cumsum) export(fillin_list) export(get_aux_table) export(get_aux_table_ui) @@ -24,6 +23,7 @@ export(get_md_vars) export(get_param_values) export(get_pip_version) export(get_pipapienv) +export(get_total_pop) export(get_user_alt_gt) export(get_user_x_code) export(get_valid_aux_long_format_tables) diff --git a/R/fgt_cumsum.R b/R/fgt_cumsum.R index 477b327e..dc5b9b7c 100644 --- a/R/fgt_cumsum.R +++ b/R/fgt_cumsum.R @@ -1,9 +1,18 @@ +# MAIN ------------------------------- + #' format the lfst list to usable data to estimate poverty #' #' @param lfst list from load_list_refy() #' #' @return list with DT and g (GRP object) #' @keywords internal +#' Format lfst list for poverty estimation +#' +#' Takes a list from load_list_refy() and returns a list with a data.table and a GRP object for grouped operations. +#' +#' @param lfst List from load_list_refy(). +#' @return List with elements: DT (data.table) and g (GRP object). +#' @keywords internal format_lfst <- \(lfst) { DT <- rbindlist(lfst, fill = TRUE) @@ -28,3 +37,266 @@ format_lfst <- \(lfst) { } + + +#' get total population by country year and reprting level +#' +#' @param LDTg list from format_lfst() with DT and g objects +#' +#' @return data.table with total population by `g[["groups]]` +#' @export +#' +#' @examples +#' Get total population by country, year, and reporting level +#' +#' Computes total population by group using the output of format_lfst(). +#' +#' @param LDTg List from format_lfst() with DT and g objects. +#' @return data.table with total population by group. +#' @export +#' @examples +#' # get_total_pop(format_lfst(lfst)) +get_total_pop <- \(LDTg) { + list2env(LDTg) + add_vars(g[["groups"]], + get_vars(DT, c("weight")) |> + fsum(g)) |> + setnames(old = "weight",new = "totpop") +} + + + +# --- helpers --------------------------------------------------------------- +#' load refy list +#' +#' @param input_list list. output from [create_full_list] +#' @param path character: directory path +#' +#' @return character vector +#' @keywords internal +#' Load refy list +#' +#' Loads a list of files and returns a named list of data.tables, each with an id column. +#' +#' @param input_list Character vector of file paths (output from create_full_list). +#' @return Named list of data.tables, each with an id column. +#' @keywords internal +load_list_refy <- \(input_list){ + + id_names <- input_list |> + fs::path_file() |> + fs::path_ext_remove() + + seq_flex <- if (interactive()) { + cli::cli_progress_along + } else { + base::seq_along + } + + + lfst <- lapply(seq_flex(input_list), + \(i) { + x <- lup_files[i] + idn <- fs::path_file(x) |> + fs::path_ext_remove() + fst::read_fst(x, as.data.table = TRUE) |> + _[, id := idn] + }) |> + setNames(id_names) + + lfst +} + + +# Pull just the 2 id columns as a base R data.frame (robust across classes) +#' Extract id and level columns as a data.frame +#' +#' @param X data.table or data.frame +#' @param id_col Name of id column +#' @param level_col Name of level column +#' @return data.frame with id and level columns +#' @keywords internal +.get_pairs_df <- function(X, id_col, level_col) { + as.data.frame(X[, c(id_col, level_col), drop = FALSE]) +} + + +# ------------------------------- # +# 1) Build pair dictionary (DT) # +# ------------------------------- # + +#' Dictionary for fast joins +#' +#' @param lkup lkup object +#' @param fill_gaps TRUE for lineup years, FALSE for survey years +#' +#' @return data.table with dictionary for merges. +#' @keywords internal +#' Build dictionary for fast joins +#' +#' Creates a data.table dictionary for merging by id and reporting_level. +#' +#' @param lkup Lookup object containing refy_lkup and svy_lkup. +#' @param fill_gaps Logical, TRUE for lineup years, FALSE for survey years. +#' @return data.table with columns id, reporting_level, and code. +#' @keywords internal +build_pair_dict <- function(lkup, fill_gaps = TRUE) { + + FT <- if (fill_gaps) { + lkup$refy_lkup[, .(country_code, reporting_year, reporting_level)] + } else { + lkup$svy_lkup[, .(country_code, reporting_year, reporting_level)] + } |> + funique() + + FT[, id := paste0(country_code, "_", reporting_year) + ][, c("country_code", "reporting_year") := NULL] + + cols <- c("id", "reporting_level") + dict <- unique(FT[, ..cols]) + + # deterministic code order + setorderv(dict, cols, order = 1L) # radix by default + dict[, code := as.integer(.I)] # fast in DT + setkeyv(dict, cols) # fast key lookups when needed + setindexv(dict, "code") # index on code + dict +} + + +# -------------------------------------------- # +# 2) Encode: add integer code via collapse::join +# -------------------------------------------- # +# DT: data.table to encode (by reference not guaranteed since join copies x->result) +# dict: data.table from build_pair_dict() +# code_col: name of code column to write +#' Encode pairs with integer code +#' +#' Adds an integer code column to a data.table by joining with a dictionary. +#' +#' @param DT data.table to encode. +#' @param dict data.table from build_pair_dict(). +#' @param id_col Name of id column. +#' @param level_col Name of reporting level column. +#' @param code_col Name of code column to write. +#' @param drop_labels Logical, drop id and level columns if TRUE. +#' @param strict Logical, error if any pairs are missing from dict. +#' @param verbose Integer, verbosity level. +#' @return data.table with code column added. +#' @keywords internal +encode_pairs <- function(DT, dict, + id_col = "id", level_col = "reporting_level", + code_col = "id_rl", + drop_labels = FALSE, + strict = TRUE, + verbose = 0L) { + + stopifnot(is.data.table(DT), is.data.table(dict)) + cols <- c(id_col, level_col) + stopifnot(all(cols %in% names(DT)), all(c(cols, "code") %in% names(dict))) + + + out <- join( + x = DT, + y = dict, + on = cols, + how = "left", + drop.dup.cols = "y", + validate = "m:1", + verbose = verbose + ) + # Ensure it's a data.table (join usually preserves) + if (!is.data.table(out)) setDT(out) + + # Rename 'code' -> code_col if needed + if (code_col != "code" && "code" %in% names(out)) { + setnames(out, "code", code_col) + } + + if (strict) { + if (anyNA(out[[code_col]])) { + nas <- is.na(out[[code_col]]) + miss <- unique(out[nas, ..cols])[1:min(10L, sum(nas))] + cli::cli_abort( + c( + "encode_pairs(): {fsum(nas)} unseen (id, reporting_level) pair(s).", + "Examples:\n{paste(capture.output(print(miss)), collapse = '\n')}" + ) + ) + } + } + + if (drop_labels) out[, (cols) := NULL] + out +} + +# ------------------------------------------------ # +# 3) Decode: join labels by code via collapse::join # +# ------------------------------------------------ # +#' Decode integer code to id and reporting level +#' +#' Joins labels by code using a dictionary. +#' +#' @param DT data.table to decode. +#' @param dict data.table from build_pair_dict(). +#' @param code_col Name of code column in DT. +#' @param id_col Name of id column in dict. +#' @param level_col Name of reporting level column in dict. +#' @param keep_code Logical, keep code column if TRUE. +#' @param verbose Integer, verbosity level. +#' @return data.table with id and reporting_level columns added. +#' @keywords internal +decode_pairs <- function(DT, dict, + code_col = "id_rl", + id_col = "id", level_col = "reporting_level", + keep_code = TRUE, + verbose = 0L) { + stopifnot(is.data.table(DT), is.data.table(dict)) + stopifnot(code_col %in% names(DT), all(c("code", id_col, level_col) %in% names(dict))) + + dict_min <- dict[, .(code, ..id_col = get(id_col), ..level_col = get(level_col))] + setnames(dict_min, c("code", id_col, level_col)) + + out <- collapse::join( + x = DT, y = dict_min, + on = setNames("code", code_col), # map DT[code_col] to dict$code + how = "left", + drop.dup.cols = "y", + validate = "m:1", + verbose = verbose + ) + if (!is.data.table(out)) setDT(out) + if (!keep_code) out[, (code_col) := NULL] + out[] +} + +# ----------------------------------------------------- # +# 4) Update dict with new pairs (append-only, fast DT) # +# ----------------------------------------------------- # +#' Update dictionary with new pairs +#' +#' Appends new (id, reporting_level) pairs to the dictionary if needed. +#' +#' @param dict data.table dictionary from build_pair_dict(). +#' @param DT data.table with id and reporting_level columns. +#' @param id_col Name of id column. +#' @param level_col Name of reporting level column. +#' @return Updated data.table dictionary. +#' @keywords internal +update_pair_dict <- function(dict, DT, + id_col = "id", level_col = "reporting_level") { + stopifnot(is.data.table(dict), is.data.table(DT)) + cols <- c(id_col, level_col) + stopifnot(all(c(cols, "code") %in% names(dict)), all(cols %in% names(DT))) + + new_pairs <- fsetdiff(unique(DT[, ..cols]), dict[, ..cols]) + if (nrow(new_pairs)) { + new_pairs[, code := as.integer(max(dict$code) + seq_len(.N))] + setkeyv(new_pairs, cols) + setindexv(new_pairs, "code") + dict <- rbindlist(list(dict, new_pairs), use.names = TRUE) + setkeyv(dict, cols) + setindexv(dict, "code") + } + dict[] +} diff --git a/R/utils-pipdata.R b/R/utils-pipdata.R index dc5b823d..4274521c 100644 --- a/R/utils-pipdata.R +++ b/R/utils-pipdata.R @@ -1,36 +1,3 @@ -#' load refy list -#' -#' @param input_list list. output from [create_full_list] -#' @param path character: directory path -#' -#' @return character vector -#' @keywords internal -load_list_refy <- \(input_list){ - - id_names <- input_list |> - fs::path_file() |> - fs::path_ext_remove() - - seq_flex <- if (interactive()) { - cli::cli_progress_along - } else { - base::seq_along - } - - - lfst <- lapply(seq_flex(input_list), - \(i) { - x <- lup_files[i] - idn <- fs::path_file(x) |> - fs::path_ext_remove() - fst::read_fst(x, as.data.table = TRUE) |> - _[, id := idn] - }) |> - setNames(id_names) - - lfst -} - diff --git a/man/build_pair_dict.Rd b/man/build_pair_dict.Rd new file mode 100644 index 00000000..6fe82339 --- /dev/null +++ b/man/build_pair_dict.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fgt_cumsum.R +\name{build_pair_dict} +\alias{build_pair_dict} +\title{Dictionary for fast joins} +\usage{ +build_pair_dict(lkup, fill_gaps = TRUE) +} +\arguments{ +\item{lkup}{Lookup object containing refy_lkup and svy_lkup.} + +\item{fill_gaps}{Logical, TRUE for lineup years, FALSE for survey years.} +} +\value{ +data.table with dictionary for merges. + +data.table with columns id, reporting_level, and code. +} +\description{ +Dictionary for fast joins +} +\keyword{Build} +\keyword{Creates} +\keyword{a} +\keyword{and} +\keyword{by} +\keyword{data.table} +\keyword{dictionary} +\keyword{fast} +\keyword{for} +\keyword{id} +\keyword{internal} +\keyword{joins} +\keyword{merging} +\keyword{reporting_level.} diff --git a/man/decode_pairs.Rd b/man/decode_pairs.Rd new file mode 100644 index 00000000..96a5537d --- /dev/null +++ b/man/decode_pairs.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fgt_cumsum.R +\name{decode_pairs} +\alias{decode_pairs} +\title{Decode integer code to id and reporting level} +\usage{ +decode_pairs( + DT, + dict, + code_col = "id_rl", + id_col = "id", + level_col = "reporting_level", + keep_code = TRUE, + verbose = 0L +) +} +\arguments{ +\item{DT}{data.table to decode.} + +\item{dict}{data.table from build_pair_dict().} + +\item{code_col}{Name of code column in DT.} + +\item{id_col}{Name of id column in dict.} + +\item{level_col}{Name of reporting level column in dict.} + +\item{keep_code}{Logical, keep code column if TRUE.} + +\item{verbose}{Integer, verbosity level.} +} +\value{ +data.table with id and reporting_level columns added. +} +\description{ +Joins labels by code using a dictionary. +} +\keyword{internal} diff --git a/man/dot-get_pairs_df.Rd b/man/dot-get_pairs_df.Rd new file mode 100644 index 00000000..4ef63c05 --- /dev/null +++ b/man/dot-get_pairs_df.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fgt_cumsum.R +\name{.get_pairs_df} +\alias{.get_pairs_df} +\title{Extract id and level columns as a data.frame} +\usage{ +.get_pairs_df(X, id_col, level_col) +} +\arguments{ +\item{X}{data.table or data.frame} + +\item{id_col}{Name of id column} + +\item{level_col}{Name of level column} +} +\value{ +data.frame with id and level columns +} +\description{ +Extract id and level columns as a data.frame +} +\keyword{internal} diff --git a/man/encode_pairs.Rd b/man/encode_pairs.Rd new file mode 100644 index 00000000..162a095c --- /dev/null +++ b/man/encode_pairs.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fgt_cumsum.R +\name{encode_pairs} +\alias{encode_pairs} +\title{Encode pairs with integer code} +\usage{ +encode_pairs( + DT, + dict, + id_col = "id", + level_col = "reporting_level", + code_col = "id_rl", + drop_labels = FALSE, + strict = TRUE, + verbose = 0L +) +} +\arguments{ +\item{DT}{data.table to encode.} + +\item{dict}{data.table from build_pair_dict().} + +\item{id_col}{Name of id column.} + +\item{level_col}{Name of reporting level column.} + +\item{code_col}{Name of code column to write.} + +\item{drop_labels}{Logical, drop id and level columns if TRUE.} + +\item{strict}{Logical, error if any pairs are missing from dict.} + +\item{verbose}{Integer, verbosity level.} +} +\value{ +data.table with code column added. +} +\description{ +Adds an integer code column to a data.table by joining with a dictionary. +} +\keyword{internal} diff --git a/man/fgt_watts_cumsum.Rd b/man/fgt_watts_cumsum.Rd deleted file mode 100644 index 74d7ecdf..00000000 --- a/man/fgt_watts_cumsum.Rd +++ /dev/null @@ -1,21 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/compute_fgt_new.R -\name{fgt_watts_cumsum} -\alias{fgt_watts_cumsum} -\title{compute fgt and watts using cumulative welfare rather than means} -\usage{ -fgt_watts_cumsum(y, w, lines) -} -\arguments{ -\item{y}{numeric welfare (sorted ascending within the subgroup)} - -\item{w}{numeric weights (same order as y)} - -\item{lines}{numeric vector of poverty lines} -} -\value{ -Returns a data.table with columns: line, fgt0,fgt1,fgt2,watts -} -\description{ -compute fgt and watts using cumulative welfare rather than means -} diff --git a/man/format_lfst.Rd b/man/format_lfst.Rd new file mode 100644 index 00000000..0667de50 --- /dev/null +++ b/man/format_lfst.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fgt_cumsum.R +\name{format_lfst} +\alias{format_lfst} +\title{format the lfst list to usable data to estimate poverty} +\usage{ +format_lfst(lfst) +} +\arguments{ +\item{lfst}{List from load_list_refy().} +} +\value{ +list with DT and g (GRP object) + +List with elements: DT (data.table) and g (GRP object). +} +\description{ +format the lfst list to usable data to estimate poverty +} +\keyword{Format} +\keyword{GRP} +\keyword{Takes} +\keyword{a} +\keyword{and} +\keyword{data.table} +\keyword{estimation} +\keyword{for} +\keyword{from} +\keyword{grouped} +\keyword{internal} +\keyword{lfst} +\keyword{list} +\keyword{load_list_refy()} +\keyword{object} +\keyword{operations.} +\keyword{poverty} +\keyword{returns} +\keyword{with} diff --git a/man/get_total_pop.Rd b/man/get_total_pop.Rd new file mode 100644 index 00000000..b73f558c --- /dev/null +++ b/man/get_total_pop.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fgt_cumsum.R +\name{get_total_pop} +\alias{get_total_pop} +\title{get total population by country year and reprting level} +\usage{ +get_total_pop(LDTg) +} +\arguments{ +\item{LDTg}{List from format_lfst() with DT and g objects.} +} +\value{ +data.table with total population by \verb{g[["groups]]} + +data.table with total population by group. +} +\description{ +get total population by country year and reprting level +} +\examples{ +Get total population by country, year, and reporting level + +Computes total population by group using the output of format_lfst(). + +# get_total_pop(format_lfst(lfst)) +} diff --git a/man/load_list_refy.Rd b/man/load_list_refy.Rd index b1533a4f..75fb33df 100644 --- a/man/load_list_refy.Rd +++ b/man/load_list_refy.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils-pipdata.R +% Please edit documentation in R/fgt_cumsum.R \name{load_list_refy} \alias{load_list_refy} \title{load refy list} @@ -7,14 +7,32 @@ load_list_refy(input_list) } \arguments{ -\item{input_list}{list. output from \link{create_full_list}} +\item{input_list}{Character vector of file paths (output from create_full_list).} \item{path}{character: directory path} } \value{ character vector + +Named list of data.tables, each with an id column. } \description{ load refy list } +\keyword{Load} +\keyword{Loads} +\keyword{a} +\keyword{an} +\keyword{and} +\keyword{column.} +\keyword{data.tables,} +\keyword{each} +\keyword{files} +\keyword{id} \keyword{internal} +\keyword{list} +\keyword{named} +\keyword{of} +\keyword{refy} +\keyword{returns} +\keyword{with} diff --git a/man/transform_input.Rd b/man/transform_input.Rd index e460cf0f..8415f5ac 100644 --- a/man/transform_input.Rd +++ b/man/transform_input.Rd @@ -7,7 +7,7 @@ transform_input(input_list) } \arguments{ -\item{input_list}{list. output from \link{create_full_list}} +\item{input_list}{Character vector of file paths (output from create_full_list).} } \value{ formated list diff --git a/man/update_pair_dict.Rd b/man/update_pair_dict.Rd new file mode 100644 index 00000000..a142e98d --- /dev/null +++ b/man/update_pair_dict.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fgt_cumsum.R +\name{update_pair_dict} +\alias{update_pair_dict} +\title{Update dictionary with new pairs} +\usage{ +update_pair_dict(dict, DT, id_col = "id", level_col = "reporting_level") +} +\arguments{ +\item{dict}{data.table dictionary from build_pair_dict().} + +\item{DT}{data.table with id and reporting_level columns.} + +\item{id_col}{Name of id column.} + +\item{level_col}{Name of reporting level column.} +} +\value{ +Updated data.table dictionary. +} +\description{ +Appends new (id, reporting_level) pairs to the dictionary if needed. +} +\keyword{internal} From 01c52cc5a939bcc46d224d59b4957f74c33b5a8e Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Wed, 10 Sep 2025 14:28:35 -0400 Subject: [PATCH 106/203] improve helpers --- R/fgt_cumsum.R | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/R/fgt_cumsum.R b/R/fgt_cumsum.R index dc5b9b7c..0c68c154 100644 --- a/R/fgt_cumsum.R +++ b/R/fgt_cumsum.R @@ -248,17 +248,25 @@ encode_pairs <- function(DT, dict, #' @keywords internal decode_pairs <- function(DT, dict, code_col = "id_rl", - id_col = "id", level_col = "reporting_level", + id_col = "id", + level_col = "reporting_level", keep_code = TRUE, verbose = 0L) { - stopifnot(is.data.table(DT), is.data.table(dict)) - stopifnot(code_col %in% names(DT), all(c("code", id_col, level_col) %in% names(dict))) + stopifnot(exprs = { + is.data.table(DT) + is.data.table(dict) + }) + stopifnot(exprs = { + code_col %in% names(DT) + all(c("code", id_col, level_col) %in% names(dict)) + }) + + # dict_min <- dict[, .(code, ..id_col = get(id_col), ..level_col = get(level_col))] + # setnames(dict_min, c("code", id_col, level_col)) - dict_min <- dict[, .(code, ..id_col = get(id_col), ..level_col = get(level_col))] - setnames(dict_min, c("code", id_col, level_col)) - - out <- collapse::join( - x = DT, y = dict_min, + out <- join( + x = DT, + y = dict, on = setNames("code", code_col), # map DT[code_col] to dict$code how = "left", drop.dup.cols = "y", @@ -267,7 +275,7 @@ decode_pairs <- function(DT, dict, ) if (!is.data.table(out)) setDT(out) if (!keep_code) out[, (code_col) := NULL] - out[] + out } # ----------------------------------------------------- # @@ -298,5 +306,5 @@ update_pair_dict <- function(dict, DT, setkeyv(dict, cols) setindexv(dict, "code") } - dict[] + dict } From 76c6610465c733b8ddad2921433b9e6ceea7a7b6 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Wed, 10 Sep 2025 18:14:47 -0400 Subject: [PATCH 107/203] first attempt of new cumcum functions --- NAMESPACE | 1 - R/compute_fgt_new.R | 3 +- R/fg_pip.R | 12 +++ R/fgt_cumsum.R | 216 ++++++++++++++++++++++++---------------- man/compute_fgt.Rd | 22 ++++ man/decode_pairs.Rd | 5 +- man/dot-get_pairs_df.Rd | 22 ---- man/format_lfst.Rd | 2 +- man/get_total_pop.Rd | 16 +-- 9 files changed, 177 insertions(+), 122 deletions(-) create mode 100644 man/compute_fgt.Rd delete mode 100644 man/dot-get_pairs_df.Rd diff --git a/NAMESPACE b/NAMESPACE index 9386e37b..96b78000 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -23,7 +23,6 @@ export(get_md_vars) export(get_param_values) export(get_pip_version) export(get_pipapienv) -export(get_total_pop) export(get_user_alt_gt) export(get_user_x_code) export(get_valid_aux_long_format_tables) diff --git a/R/compute_fgt_new.R b/R/compute_fgt_new.R index 79cf0ecf..c6d52d39 100644 --- a/R/compute_fgt_new.R +++ b/R/compute_fgt_new.R @@ -78,7 +78,7 @@ compute_fgt_dt <- function(dt, welfare, weight, povlines, mean_and_med = FALSE) -# Efficient FGT calculation for vectors (No data.table) +#' Efficient FGT calculation for vectors (No data.table) #' #' @param w character: welfare variable name #' @param wt character: weight variable name @@ -87,7 +87,6 @@ compute_fgt_dt <- function(dt, welfare, weight, povlines, mean_and_med = FALSE) #' @return data.table with estimates poverty estimates #' @keywords internal compute_fgt <- function(w, wt, povlines) { - n <- length(w) m <- length(povlines) # Pre-allocate result matrix diff --git a/R/fg_pip.R b/R/fg_pip.R index da323540..86ce5f5b 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -58,11 +58,23 @@ fg_pip <- function(country, data_in_cache = data_present_in_master)) } + dict <- build_pair_dict(lkup = lkup, fill_gaps = TRUE) + + full_list <- create_full_list(metadata = metadata) lfst <- load_list_refy(input_list = full_list) + LDTg <- format_lfst(lfst = lfst, + dict = dict) + tpop <- get_total_pop(LDTg = LDTg, + dict = dict) + fgt <- fgt_cumsum(LDTg = LDTg, + tpop = tpop, + povline = povline) |> + decode_pairs(dict = dict) + # lt <- lapply(lt, \(x) { # add_attributes_as_columns_vectorized(x) # }) diff --git a/R/fgt_cumsum.R b/R/fgt_cumsum.R index 0c68c154..9dd4fb02 100644 --- a/R/fgt_cumsum.R +++ b/R/fgt_cumsum.R @@ -13,113 +13,110 @@ #' @param lfst List from load_list_refy(). #' @return List with elements: DT (data.table) and g (GRP object). #' @keywords internal -format_lfst <- \(lfst) { +format_lfst <- \(lfst, dict) { DT <- rbindlist(lfst, fill = TRUE) # Convert to factors (is it faster?) - DT[, names(.SD) := lapply(.SD, qF), - .SDcols = c("id", "reporting_level")] + if (!is.integer(DT$index)) { + DT[, index := as.integer(index)] + } - # fix - # DT[index == 0, - # names(.SD) := 0, - # .SDcols = is.numeric] - # - # DT <- DT[!grepl("^CHN_", id)] + out <- encode_pairs(DT = DT, + dict = dict, + drop_labels = TRUE) ## Grouping ---------- + g <- GRP(out, ~ id_rl, sort = FALSE) - g <- GRP(DT, ~ id + reporting_level, sort = FALSE) - - list(DT = DT, + list(DT = out, g = g) } -#' get total population by country year and reprting level -#' -#' @param LDTg list from format_lfst() with DT and g objects -#' -#' @return data.table with total population by `g[["groups]]` -#' @export -#' -#' @examples -#' Get total population by country, year, and reporting level -#' #' Computes total population by group using the output of format_lfst(). #' #' @param LDTg List from format_lfst() with DT and g objects. +#' @param dict data dictionary from build_pair_dict() +#' #' @return data.table with total population by group. -#' @export -#' @examples -#' # get_total_pop(format_lfst(lfst)) -get_total_pop <- \(LDTg) { - list2env(LDTg) +#' @keywords internal +get_total_pop <- \(LDTg, dict) { + list2env(LDTg, envir = environment()) + rm(LDTg) add_vars(g[["groups"]], get_vars(DT, c("weight")) |> fsum(g)) |> - setnames(old = "weight",new = "totpop") + setnames(old = "weight", + new = "W") |> + encode_pairs(dict, drop_labels = TRUE) } - -# --- helpers --------------------------------------------------------------- -#' load refy list -#' -#' @param input_list list. output from [create_full_list] -#' @param path character: directory path -#' -#' @return character vector -#' @keywords internal -#' Load refy list -#' -#' Loads a list of files and returns a named list of data.tables, each with an id column. -#' -#' @param input_list Character vector of file paths (output from create_full_list). -#' @return Named list of data.tables, each with an id column. -#' @keywords internal -load_list_refy <- \(input_list){ - - id_names <- input_list |> - fs::path_file() |> - fs::path_ext_remove() - - seq_flex <- if (interactive()) { - cli::cli_progress_along - } else { - base::seq_along +fgt_cumsum <- \(LDTg, tpop, povline, + drop_vars = TRUE) { + list2env(LDTg, envir = environment()) + rm(LDTg) + + # Temporal values to be added to the data.table + tz <- pmax(povline, 1e-12) + tz2 <- pmax(povline^2, 1e-16) + tlogz <- log(tz) + + # 1) Compute cutpoint index for each z, using ONLY non-zero rows for welfare + # -> findInterval(povline, welfare) returns values in 0..N (never N+1) + ID <- DT[index > 0L, + { + idx <- findInterval(povline, welfare, left.open = TRUE) + # 2) Attach z, z2, logz in-group (no replication/copies) + data.table(index = idx, + z = tz, + z2 = tz2, + logz = tlogz) + }, + by = id_rl] + + # 3) Minimal cumulative view (shallow column subset; avoids copying DT) + DT_min <- get_vars(DT, + c("id_rl", "index", "cw", "cwy", "cwy2", "cwylog")) + + # 4) join cutpoints to cumulatives (index==0 hits the already-present zero row) + CS <- join( + x = ID, + y = DT_min, + on = c("id_rl","index"), + how = "left", + validate = "m:1", # many cutpoints -> 1 cumulative row + drop.dup.cols = "y", + verbose = 0) |> + # 5) Bring total population W + join(tpop, + on = "id_rl", + how = "left", + validate = "m:1", + drop.dup.cols = "y") |> + setorder(id_rl, index) + + + # 6) Compute measures (vectorized). Small clamps for numerical safety. + CS[, `:=`( + headcount = cw / W, + poverty_gap = (z * cw - cwy) / (z_s * W), + poverty_severity = (z2 * cw - 2 * z * cwy + cwy2) / (z2_s * W), + watts = (logz * cw - cwylog) / W + )] + + if (!drop_vars) { + return(CS) } + get_vars(CS, c("id_rl", "headcount", "poverty_gap", "poverty_severity", "watts")) - - lfst <- lapply(seq_flex(input_list), - \(i) { - x <- lup_files[i] - idn <- fs::path_file(x) |> - fs::path_ext_remove() - fst::read_fst(x, as.data.table = TRUE) |> - _[, id := idn] - }) |> - setNames(id_names) - - lfst } -# Pull just the 2 id columns as a base R data.frame (robust across classes) -#' Extract id and level columns as a data.frame -#' -#' @param X data.table or data.frame -#' @param id_col Name of id column -#' @param level_col Name of level column -#' @return data.frame with id and level columns -#' @keywords internal -.get_pairs_df <- function(X, id_col, level_col) { - as.data.frame(X[, c(id_col, level_col), drop = FALSE]) -} - +# --- helpers --------------------------------------------------------------- # ------------------------------- # # 1) Build pair dictionary (DT) # @@ -243,14 +240,17 @@ encode_pairs <- function(DT, dict, #' @param id_col Name of id column in dict. #' @param level_col Name of reporting level column in dict. #' @param keep_code Logical, keep code column if TRUE. +#' @param add_true_vars logical, add `country_code` and `reporting_year` #' @param verbose Integer, verbosity level. +#' #' @return data.table with id and reporting_level columns added. #' @keywords internal decode_pairs <- function(DT, dict, code_col = "id_rl", id_col = "id", level_col = "reporting_level", - keep_code = TRUE, + keep_code = FALSE, + add_true_vars = TRUE, verbose = 0L) { stopifnot(exprs = { is.data.table(DT) @@ -261,9 +261,6 @@ decode_pairs <- function(DT, dict, all(c("code", id_col, level_col) %in% names(dict)) }) - # dict_min <- dict[, .(code, ..id_col = get(id_col), ..level_col = get(level_col))] - # setnames(dict_min, c("code", id_col, level_col)) - out <- join( x = DT, y = dict, @@ -272,8 +269,16 @@ decode_pairs <- function(DT, dict, drop.dup.cols = "y", validate = "m:1", verbose = verbose - ) - if (!is.data.table(out)) setDT(out) + ) |> + qDT() + + if (add_true_vars) { + out[, `:=`( + country_code = gsub("(.+)(_.+)", "\\1", id), + reporting_year = as.integer(gsub("(.+_)(.+)", "\\2", id)) + )] + } + if (!keep_code) out[, (code_col) := NULL] out } @@ -308,3 +313,46 @@ update_pair_dict <- function(dict, DT, } dict } + + + +#' load refy list +#' +#' @param input_list list. output from [create_full_list] +#' @param path character: directory path +#' +#' @return character vector +#' @keywords internal +#' Load refy list +#' +#' Loads a list of files and returns a named list of data.tables, each with an id column. +#' +#' @param input_list Character vector of file paths (output from create_full_list). +#' @return Named list of data.tables, each with an id column. +#' @keywords internal +load_list_refy <- \(input_list){ + + id_names <- input_list |> + fs::path_file() |> + fs::path_ext_remove() + + seq_flex <- if (interactive()) { + cli::cli_progress_along + } else { + base::seq_along + } + + + lfst <- lapply(seq_flex(input_list), + \(i) { + x <- lup_files[i] + idn <- fs::path_file(x) |> + fs::path_ext_remove() + fst::read_fst(x, as.data.table = TRUE) |> + _[, id := idn] + }) |> + setNames(id_names) + + lfst +} + diff --git a/man/compute_fgt.Rd b/man/compute_fgt.Rd new file mode 100644 index 00000000..0a5b479e --- /dev/null +++ b/man/compute_fgt.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/compute_fgt_new.R +\name{compute_fgt} +\alias{compute_fgt} +\title{Efficient FGT calculation for vectors (No data.table)} +\usage{ +compute_fgt(w, wt, povlines) +} +\arguments{ +\item{w}{character: welfare variable name} + +\item{wt}{character: weight variable name} + +\item{povlines}{double: vector with poverty lines} +} +\value{ +data.table with estimates poverty estimates +} +\description{ +Efficient FGT calculation for vectors (No data.table) +} +\keyword{internal} diff --git a/man/decode_pairs.Rd b/man/decode_pairs.Rd index 96a5537d..988829e2 100644 --- a/man/decode_pairs.Rd +++ b/man/decode_pairs.Rd @@ -10,7 +10,8 @@ decode_pairs( code_col = "id_rl", id_col = "id", level_col = "reporting_level", - keep_code = TRUE, + keep_code = FALSE, + add_true_vars = TRUE, verbose = 0L ) } @@ -27,6 +28,8 @@ decode_pairs( \item{keep_code}{Logical, keep code column if TRUE.} +\item{add_true_vars}{logical, add \code{country_code} and \code{reporting_year}} + \item{verbose}{Integer, verbosity level.} } \value{ diff --git a/man/dot-get_pairs_df.Rd b/man/dot-get_pairs_df.Rd deleted file mode 100644 index 4ef63c05..00000000 --- a/man/dot-get_pairs_df.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/fgt_cumsum.R -\name{.get_pairs_df} -\alias{.get_pairs_df} -\title{Extract id and level columns as a data.frame} -\usage{ -.get_pairs_df(X, id_col, level_col) -} -\arguments{ -\item{X}{data.table or data.frame} - -\item{id_col}{Name of id column} - -\item{level_col}{Name of level column} -} -\value{ -data.frame with id and level columns -} -\description{ -Extract id and level columns as a data.frame -} -\keyword{internal} diff --git a/man/format_lfst.Rd b/man/format_lfst.Rd index 0667de50..806e6161 100644 --- a/man/format_lfst.Rd +++ b/man/format_lfst.Rd @@ -4,7 +4,7 @@ \alias{format_lfst} \title{format the lfst list to usable data to estimate poverty} \usage{ -format_lfst(lfst) +format_lfst(lfst, dict) } \arguments{ \item{lfst}{List from load_list_refy().} diff --git a/man/get_total_pop.Rd b/man/get_total_pop.Rd index b73f558c..4808e0e7 100644 --- a/man/get_total_pop.Rd +++ b/man/get_total_pop.Rd @@ -2,25 +2,19 @@ % Please edit documentation in R/fgt_cumsum.R \name{get_total_pop} \alias{get_total_pop} -\title{get total population by country year and reprting level} +\title{Computes total population by group using the output of format_lfst().} \usage{ -get_total_pop(LDTg) +get_total_pop(LDTg, dict) } \arguments{ \item{LDTg}{List from format_lfst() with DT and g objects.} + +\item{dict}{data dictionary from build_pair_dict()} } \value{ -data.table with total population by \verb{g[["groups]]} - data.table with total population by group. } \description{ -get total population by country year and reprting level -} -\examples{ -Get total population by country, year, and reporting level - Computes total population by group using the output of format_lfst(). - -# get_total_pop(format_lfst(lfst)) } +\keyword{internal} From a182f088057872dc14d7cc3bc5b1e973d0e00462 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Wed, 10 Sep 2025 18:20:54 -0400 Subject: [PATCH 108/203] improve speed in get_mean_median --- R/fg_pip.R | 27 --------------------------- R/fgt_cumsum.R | 6 ++++-- R/utils.R | 14 +++++++------- 3 files changed, 11 insertions(+), 36 deletions(-) diff --git a/R/fg_pip.R b/R/fg_pip.R index 86ce5f5b..f3481c5c 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -75,33 +75,6 @@ fg_pip <- function(country, povline = povline) |> decode_pairs(dict = dict) - # lt <- lapply(lt, \(x) { - # add_attributes_as_columns_vectorized(x) - # }) - - # Extract some attributes - # lt_att <- get_lt_attr(lt) - - # get rows indices - # l_rl_rows <- get_rl_rows(lt_att) - - - # DT <- map_lt_to_dt(lt, l_rl_rows) - # setorder(DT, country_code, reporting_year, reporting_level, welfare) - # g <- GRP(DT, - # ~ country_code + reporting_year + reporting_level, - # sort = TRUE) - - - - - # ZP Add: do fgt estimations using `res <- lapply(lt, process_dt, povline = povline)` - #------------------------- - fgt <- map_fgt(lt, l_rl_rows, povline) |> - funique() # TO REMOVE - - # convert reporting year to numeric - fgt[, reporting_year := as.numeric(reporting_year)] # Add just mean and median res <- get_mean_median(fgt, lkup, fill_gaps = TRUE) diff --git a/R/fgt_cumsum.R b/R/fgt_cumsum.R index 9dd4fb02..02666587 100644 --- a/R/fgt_cumsum.R +++ b/R/fgt_cumsum.R @@ -240,7 +240,8 @@ encode_pairs <- function(DT, dict, #' @param id_col Name of id column in dict. #' @param level_col Name of reporting level column in dict. #' @param keep_code Logical, keep code column if TRUE. -#' @param add_true_vars logical, add `country_code` and `reporting_year` +#' @param add_true_vars logical, add `country_code` and `reporting_year` and +#' removes var `id` #' @param verbose Integer, verbosity level. #' #' @return data.table with id and reporting_level columns added. @@ -276,7 +277,8 @@ decode_pairs <- function(DT, dict, out[, `:=`( country_code = gsub("(.+)(_.+)", "\\1", id), reporting_year = as.integer(gsub("(.+_)(.+)", "\\2", id)) - )] + )][, + id := NULL] } if (!keep_code) out[, (code_col) := NULL] diff --git a/R/utils.R b/R/utils.R index 0d84c2ac..cf118506 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1478,14 +1478,14 @@ get_mean_median <- \(fgt, lkup, fill_gaps) { if (isFALSE(lkup$use_new_lineup_version)) return(fgt) if (fill_gaps) { - dist <- lkup$lineup_dist_stats[, - .(country_code, reporting_year, - reporting_level, mean, median)] + dist <- get_vars(lkup$lineup_dist_stats, + c("country_code", "reporting_year", + "reporting_level", "mean", "median")) } else { - dist <- lkup$dist_stats[, - .(country_code, reporting_year, - reporting_level, mean, - median = survey_median_ppp)] + dist <- get_vars(lkup$dist_stats, + c("country_code", "reporting_year", + "reporting_level", "mean", "survey_median_ppp")) + setnames(dist, "survey_median_ppp", "median") } joyn::joyn(x = fgt, From 72f96e29d03f0a7700b75660e84eb1c230cf45e4 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Wed, 10 Sep 2025 18:23:30 -0400 Subject: [PATCH 109/203] add join rather than joyn --- R/utils.R | 13 ++++++------- man/decode_pairs.Rd | 3 ++- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/R/utils.R b/R/utils.R index cf118506..60fa57ac 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1488,13 +1488,12 @@ get_mean_median <- \(fgt, lkup, fill_gaps) { setnames(dist, "survey_median_ppp", "median") } - joyn::joyn(x = fgt, - y = dist, - by = c('country_code', "reporting_year", "reporting_level"), - match_type = "m:1", # multiple povlines - keep = "left", - reportvar = FALSE, - verbose = FALSE) + join(x = fgt, + y = dist, + on = c('country_code', "reporting_year", "reporting_level"), + how = "left", + validate = "m:1", # multiple povlines + verbose = 0L) } diff --git a/man/decode_pairs.Rd b/man/decode_pairs.Rd index 988829e2..c77164bc 100644 --- a/man/decode_pairs.Rd +++ b/man/decode_pairs.Rd @@ -28,7 +28,8 @@ decode_pairs( \item{keep_code}{Logical, keep code column if TRUE.} -\item{add_true_vars}{logical, add \code{country_code} and \code{reporting_year}} +\item{add_true_vars}{logical, add \code{country_code} and \code{reporting_year} and +removes var \code{id}} \item{verbose}{Integer, verbosity level.} } From 85aaf04238f7f47289f14ee17a46962bdd9c5857 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Wed, 10 Sep 2025 18:27:59 -0400 Subject: [PATCH 110/203] improve documentation --- R/fgt_cumsum.R | 85 +++++++++++++++++++---------------------- man/build_pair_dict.Rd | 20 ++-------- man/decode_pairs.Rd | 8 ++-- man/encode_pairs.Rd | 5 ++- man/fgt_cumsum.Rd | 24 ++++++++++++ man/format_lfst.Rd | 31 ++++----------- man/get_total_pop.Rd | 9 +++-- man/load_list_refy.Rd | 27 ++----------- man/transform_input.Rd | 2 +- man/update_pair_dict.Rd | 5 ++- 10 files changed, 93 insertions(+), 123 deletions(-) create mode 100644 man/fgt_cumsum.Rd diff --git a/R/fgt_cumsum.R b/R/fgt_cumsum.R index 02666587..687dd177 100644 --- a/R/fgt_cumsum.R +++ b/R/fgt_cumsum.R @@ -1,17 +1,13 @@ # MAIN ------------------------------- -#' format the lfst list to usable data to estimate poverty +#' Format loaded survey list for grouped poverty analysis #' -#' @param lfst list from load_list_refy() +#' Combines a list of survey data.tables into a single data.table and encodes group identifiers using a dictionary. +#' Returns a data.table and a GRP object for efficient grouped operations, used as a preprocessing step for FGT and population calculations. #' -#' @return list with DT and g (GRP object) -#' @keywords internal -#' Format lfst list for poverty estimation -#' -#' Takes a list from load_list_refy() and returns a list with a data.table and a GRP object for grouped operations. -#' -#' @param lfst List from load_list_refy(). -#' @return List with elements: DT (data.table) and g (GRP object). +#' @param lfst Named list of data.tables, as returned by load_list_refy(). +#' @param dict data.table dictionary for id/reporting_level encoding (from build_pair_dict()). +#' @return List with elements: DT (data.table of all surveys, with id_rl) and g (GRP object for grouping by id_rl). #' @keywords internal format_lfst <- \(lfst, dict) { @@ -36,12 +32,14 @@ format_lfst <- \(lfst, dict) { -#' Computes total population by group using the output of format_lfst(). +#' Compute total population by survey and reporting level #' -#' @param LDTg List from format_lfst() with DT and g objects. -#' @param dict data dictionary from build_pair_dict() +#' Sums the weights for each (id, reporting_level) group in the combined survey data. +#' Used as a denominator for FGT and Watts index calculations. #' -#' @return data.table with total population by group. +#' @param LDTg List from format_lfst() with DT and g objects. +#' @param dict data.table dictionary for id/reporting_level encoding (from build_pair_dict()). +#' @return data.table with total population by group (columns: id_rl, W). #' @keywords internal get_total_pop <- \(LDTg, dict) { list2env(LDTg, envir = environment()) @@ -54,7 +52,16 @@ get_total_pop <- \(LDTg, dict) { encode_pairs(dict, drop_labels = TRUE) } - +#' Compute FGT and Watts indices for all groups and poverty lines +#' +#' Calculates headcount, poverty gap, poverty severity, and Watts index for each group and poverty line using cumulative sums. +#' +#' @param LDTg List from format_lfst() with DT and g objects. +#' @param tpop data.table with total population by group (from get_total_pop()). +#' @param povline Numeric vector of poverty lines. +#' @param drop_vars Logical, if TRUE returns only summary columns. +#' @return data.table with FGT and Watts measures by group and poverty line. +#' @keywords internal fgt_cumsum <- \(LDTg, tpop, povline, drop_vars = TRUE) { list2env(LDTg, envir = environment()) @@ -122,16 +129,10 @@ fgt_cumsum <- \(LDTg, tpop, povline, # 1) Build pair dictionary (DT) # # ------------------------------- # -#' Dictionary for fast joins -#' -#' @param lkup lkup object -#' @param fill_gaps TRUE for lineup years, FALSE for survey years -#' -#' @return data.table with dictionary for merges. -#' @keywords internal -#' Build dictionary for fast joins +#' Build dictionary for id/reporting_level encoding #' -#' Creates a data.table dictionary for merging by id and reporting_level. +#' Creates a data.table dictionary for mapping (id, reporting_level) pairs to integer codes for fast joins and decoding. +#' Used for efficient merging and decoding in the FGT pipeline. #' #' @param lkup Lookup object containing refy_lkup and svy_lkup. #' @param fill_gaps Logical, TRUE for lineup years, FALSE for survey years. @@ -164,12 +165,10 @@ build_pair_dict <- function(lkup, fill_gaps = TRUE) { # -------------------------------------------- # # 2) Encode: add integer code via collapse::join # -------------------------------------------- # -# DT: data.table to encode (by reference not guaranteed since join copies x->result) -# dict: data.table from build_pair_dict() -# code_col: name of code column to write -#' Encode pairs with integer code +#' Encode (id, reporting_level) pairs as integer codes #' -#' Adds an integer code column to a data.table by joining with a dictionary. +#' Joins a data.table with a dictionary to add an integer code column for each (id, reporting_level) pair. +#' Used for efficient grouping and decoding in the FGT pipeline. #' #' @param DT data.table to encode. #' @param dict data.table from build_pair_dict(). @@ -230,9 +229,10 @@ encode_pairs <- function(DT, dict, # ------------------------------------------------ # # 3) Decode: join labels by code via collapse::join # # ------------------------------------------------ # -#' Decode integer code to id and reporting level +#' Decode integer code to (id, reporting_level) labels #' -#' Joins labels by code using a dictionary. +#' Joins a data.table with a dictionary to recover id and reporting_level columns from integer codes. +#' Used after FGT calculations to restore human-readable labels. #' #' @param DT data.table to decode. #' @param dict data.table from build_pair_dict(). @@ -240,10 +240,8 @@ encode_pairs <- function(DT, dict, #' @param id_col Name of id column in dict. #' @param level_col Name of reporting level column in dict. #' @param keep_code Logical, keep code column if TRUE. -#' @param add_true_vars logical, add `country_code` and `reporting_year` and -#' removes var `id` +#' @param add_true_vars Logical, add country_code and reporting_year columns and remove id. #' @param verbose Integer, verbosity level. -#' #' @return data.table with id and reporting_level columns added. #' @keywords internal decode_pairs <- function(DT, dict, @@ -288,9 +286,10 @@ decode_pairs <- function(DT, dict, # ----------------------------------------------------- # # 4) Update dict with new pairs (append-only, fast DT) # # ----------------------------------------------------- # -#' Update dictionary with new pairs +#' Update dictionary with new (id, reporting_level) pairs #' -#' Appends new (id, reporting_level) pairs to the dictionary if needed. +#' Appends new (id, reporting_level) pairs to the dictionary if needed, ensuring all groups are encoded. +#' Used to keep the dictionary in sync with new survey data. #' #' @param dict data.table dictionary from build_pair_dict(). #' @param DT data.table with id and reporting_level columns. @@ -318,18 +317,12 @@ update_pair_dict <- function(dict, DT, -#' load refy list -#' -#' @param input_list list. output from [create_full_list] -#' @param path character: directory path -#' -#' @return character vector -#' @keywords internal -#' Load refy list +#' Load survey data from file list #' -#' Loads a list of files and returns a named list of data.tables, each with an id column. +#' Reads a list of survey files (e.g., .fst) and returns a named list of data.tables, each with an id column. +#' Used as the first step in the pipeline after creating the file list. #' -#' @param input_list Character vector of file paths (output from create_full_list). +#' @param input_list Character vector of file paths (from create_full_list()). #' @return Named list of data.tables, each with an id column. #' @keywords internal load_list_refy <- \(input_list){ diff --git a/man/build_pair_dict.Rd b/man/build_pair_dict.Rd index 6fe82339..c0c1d212 100644 --- a/man/build_pair_dict.Rd +++ b/man/build_pair_dict.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/fgt_cumsum.R \name{build_pair_dict} \alias{build_pair_dict} -\title{Dictionary for fast joins} +\title{Build dictionary for id/reporting_level encoding} \usage{ build_pair_dict(lkup, fill_gaps = TRUE) } @@ -12,24 +12,10 @@ build_pair_dict(lkup, fill_gaps = TRUE) \item{fill_gaps}{Logical, TRUE for lineup years, FALSE for survey years.} } \value{ -data.table with dictionary for merges. - data.table with columns id, reporting_level, and code. } \description{ -Dictionary for fast joins +Creates a data.table dictionary for mapping (id, reporting_level) pairs to integer codes for fast joins and decoding. +Used for efficient merging and decoding in the FGT pipeline. } -\keyword{Build} -\keyword{Creates} -\keyword{a} -\keyword{and} -\keyword{by} -\keyword{data.table} -\keyword{dictionary} -\keyword{fast} -\keyword{for} -\keyword{id} \keyword{internal} -\keyword{joins} -\keyword{merging} -\keyword{reporting_level.} diff --git a/man/decode_pairs.Rd b/man/decode_pairs.Rd index c77164bc..cefc628b 100644 --- a/man/decode_pairs.Rd +++ b/man/decode_pairs.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/fgt_cumsum.R \name{decode_pairs} \alias{decode_pairs} -\title{Decode integer code to id and reporting level} +\title{Decode integer code to (id, reporting_level) labels} \usage{ decode_pairs( DT, @@ -28,8 +28,7 @@ decode_pairs( \item{keep_code}{Logical, keep code column if TRUE.} -\item{add_true_vars}{logical, add \code{country_code} and \code{reporting_year} and -removes var \code{id}} +\item{add_true_vars}{Logical, add country_code and reporting_year columns and remove id.} \item{verbose}{Integer, verbosity level.} } @@ -37,6 +36,7 @@ removes var \code{id}} data.table with id and reporting_level columns added. } \description{ -Joins labels by code using a dictionary. +Joins a data.table with a dictionary to recover id and reporting_level columns from integer codes. +Used after FGT calculations to restore human-readable labels. } \keyword{internal} diff --git a/man/encode_pairs.Rd b/man/encode_pairs.Rd index 162a095c..01841436 100644 --- a/man/encode_pairs.Rd +++ b/man/encode_pairs.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/fgt_cumsum.R \name{encode_pairs} \alias{encode_pairs} -\title{Encode pairs with integer code} +\title{Encode (id, reporting_level) pairs as integer codes} \usage{ encode_pairs( DT, @@ -36,6 +36,7 @@ encode_pairs( data.table with code column added. } \description{ -Adds an integer code column to a data.table by joining with a dictionary. +Joins a data.table with a dictionary to add an integer code column for each (id, reporting_level) pair. +Used for efficient grouping and decoding in the FGT pipeline. } \keyword{internal} diff --git a/man/fgt_cumsum.Rd b/man/fgt_cumsum.Rd new file mode 100644 index 00000000..535ff3e9 --- /dev/null +++ b/man/fgt_cumsum.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fgt_cumsum.R +\name{fgt_cumsum} +\alias{fgt_cumsum} +\title{Compute FGT and Watts indices for all groups and poverty lines} +\usage{ +fgt_cumsum(LDTg, tpop, povline, drop_vars = TRUE) +} +\arguments{ +\item{LDTg}{List from format_lfst() with DT and g objects.} + +\item{tpop}{data.table with total population by group (from get_total_pop()).} + +\item{povline}{Numeric vector of poverty lines.} + +\item{drop_vars}{Logical, if TRUE returns only summary columns.} +} +\value{ +data.table with FGT and Watts measures by group and poverty line. +} +\description{ +Calculates headcount, poverty gap, poverty severity, and Watts index for each group and poverty line using cumulative sums. +} +\keyword{internal} diff --git a/man/format_lfst.Rd b/man/format_lfst.Rd index 806e6161..c1268268 100644 --- a/man/format_lfst.Rd +++ b/man/format_lfst.Rd @@ -2,37 +2,20 @@ % Please edit documentation in R/fgt_cumsum.R \name{format_lfst} \alias{format_lfst} -\title{format the lfst list to usable data to estimate poverty} +\title{Format loaded survey list for grouped poverty analysis} \usage{ format_lfst(lfst, dict) } \arguments{ -\item{lfst}{List from load_list_refy().} +\item{lfst}{Named list of data.tables, as returned by load_list_refy().} + +\item{dict}{data.table dictionary for id/reporting_level encoding (from build_pair_dict()).} } \value{ -list with DT and g (GRP object) - -List with elements: DT (data.table) and g (GRP object). +List with elements: DT (data.table of all surveys, with id_rl) and g (GRP object for grouping by id_rl). } \description{ -format the lfst list to usable data to estimate poverty +Combines a list of survey data.tables into a single data.table and encodes group identifiers using a dictionary. +Returns a data.table and a GRP object for efficient grouped operations, used as a preprocessing step for FGT and population calculations. } -\keyword{Format} -\keyword{GRP} -\keyword{Takes} -\keyword{a} -\keyword{and} -\keyword{data.table} -\keyword{estimation} -\keyword{for} -\keyword{from} -\keyword{grouped} \keyword{internal} -\keyword{lfst} -\keyword{list} -\keyword{load_list_refy()} -\keyword{object} -\keyword{operations.} -\keyword{poverty} -\keyword{returns} -\keyword{with} diff --git a/man/get_total_pop.Rd b/man/get_total_pop.Rd index 4808e0e7..a83525bd 100644 --- a/man/get_total_pop.Rd +++ b/man/get_total_pop.Rd @@ -2,19 +2,20 @@ % Please edit documentation in R/fgt_cumsum.R \name{get_total_pop} \alias{get_total_pop} -\title{Computes total population by group using the output of format_lfst().} +\title{Compute total population by survey and reporting level} \usage{ get_total_pop(LDTg, dict) } \arguments{ \item{LDTg}{List from format_lfst() with DT and g objects.} -\item{dict}{data dictionary from build_pair_dict()} +\item{dict}{data.table dictionary for id/reporting_level encoding (from build_pair_dict()).} } \value{ -data.table with total population by group. +data.table with total population by group (columns: id_rl, W). } \description{ -Computes total population by group using the output of format_lfst(). +Sums the weights for each (id, reporting_level) group in the combined survey data. +Used as a denominator for FGT and Watts index calculations. } \keyword{internal} diff --git a/man/load_list_refy.Rd b/man/load_list_refy.Rd index 75fb33df..b91f4d82 100644 --- a/man/load_list_refy.Rd +++ b/man/load_list_refy.Rd @@ -2,37 +2,18 @@ % Please edit documentation in R/fgt_cumsum.R \name{load_list_refy} \alias{load_list_refy} -\title{load refy list} +\title{Load survey data from file list} \usage{ load_list_refy(input_list) } \arguments{ -\item{input_list}{Character vector of file paths (output from create_full_list).} - -\item{path}{character: directory path} +\item{input_list}{Character vector of file paths (from create_full_list()).} } \value{ -character vector - Named list of data.tables, each with an id column. } \description{ -load refy list +Reads a list of survey files (e.g., .fst) and returns a named list of data.tables, each with an id column. +Used as the first step in the pipeline after creating the file list. } -\keyword{Load} -\keyword{Loads} -\keyword{a} -\keyword{an} -\keyword{and} -\keyword{column.} -\keyword{data.tables,} -\keyword{each} -\keyword{files} -\keyword{id} \keyword{internal} -\keyword{list} -\keyword{named} -\keyword{of} -\keyword{refy} -\keyword{returns} -\keyword{with} diff --git a/man/transform_input.Rd b/man/transform_input.Rd index 8415f5ac..14664b98 100644 --- a/man/transform_input.Rd +++ b/man/transform_input.Rd @@ -7,7 +7,7 @@ transform_input(input_list) } \arguments{ -\item{input_list}{Character vector of file paths (output from create_full_list).} +\item{input_list}{Character vector of file paths (from create_full_list()).} } \value{ formated list diff --git a/man/update_pair_dict.Rd b/man/update_pair_dict.Rd index a142e98d..4a41a16c 100644 --- a/man/update_pair_dict.Rd +++ b/man/update_pair_dict.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/fgt_cumsum.R \name{update_pair_dict} \alias{update_pair_dict} -\title{Update dictionary with new pairs} +\title{Update dictionary with new (id, reporting_level) pairs} \usage{ update_pair_dict(dict, DT, id_col = "id", level_col = "reporting_level") } @@ -19,6 +19,7 @@ update_pair_dict(dict, DT, id_col = "id", level_col = "reporting_level") Updated data.table dictionary. } \description{ -Appends new (id, reporting_level) pairs to the dictionary if needed. +Appends new (id, reporting_level) pairs to the dictionary if needed, ensuring all groups are encoded. +Used to keep the dictionary in sync with new survey data. } \keyword{internal} From 37a86ba782ce57204558f03c242d0fb56c1ae9db Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Wed, 10 Sep 2025 21:03:45 -0400 Subject: [PATCH 111/203] change lkup to read from fst --- R/create_lkups.R | 4 +++- R/fg_pip.R | 28 ++++++++++++++++++---------- R/fgt_cumsum.R | 12 +++++------- 3 files changed, 26 insertions(+), 18 deletions(-) diff --git a/R/create_lkups.R b/R/create_lkups.R index 9f2a847e..6b42381d 100644 --- a/R/create_lkups.R +++ b/R/create_lkups.R @@ -186,6 +186,8 @@ create_lkups <- function(data_dir, versions) { # ZP ADD - CREATE OBJECT: refy_lkup + + # CREATE OBJECT: refy_lkup ------------- #___________________________________________________________________________ if (use_new_lineup_version) { refy_lkup_path <- fs::path(data_dir, @@ -279,7 +281,7 @@ create_lkups <- function(data_dir, versions) { paste0(country_code, "_", reporting_year), - ext = "qs") |> + ext = "fst") |> as.character() } ] diff --git a/R/fg_pip.R b/R/fg_pip.R index f3481c5c..81a6bd11 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -58,23 +58,31 @@ fg_pip <- function(country, data_in_cache = data_present_in_master)) } - dict <- build_pair_dict(lkup = lkup, fill_gaps = TRUE) - + # Build a dictionary for encoding (id, reporting_level) pairs as integer codes. + dict <- build_pair_dict(lkup = lkup, + fill_gaps = TRUE) + # Create a list of file paths for all surveys to be loaded, based on the filtered metadata. full_list <- create_full_list(metadata = metadata) - lfst <- - load_list_refy(input_list = full_list) + # Load all survey data files into a named list of data.tables, each with an id column. + lfst <- load_list_refy(input_list = full_list) + # Combine all loaded surveys into a single data.table, encode group identifiers, + # and create a GRP object for efficient grouping. LDTg <- format_lfst(lfst = lfst, dict = dict) - tpop <- get_total_pop(LDTg = LDTg, - dict = dict) - fgt <- fgt_cumsum(LDTg = LDTg, - tpop = tpop, - povline = povline) |> - decode_pairs(dict = dict) + # Compute the total population (sum of weights) for each group (id_rl) in + # the combined survey data. + tpop <- get_total_pop(LDTg = LDTg) + + # Compute FGT and Watts indices for all groups and poverty lines, then decode + # integer codes back to (country_code, reporting_year, reporting_level). + fgt <- fgt_cumsum(LDTg = LDTg, + tpop = tpop, + povline = povline) |> + decode_pairs(dict = dict) # Add just mean and median res <- get_mean_median(fgt, lkup, fill_gaps = TRUE) diff --git a/R/fgt_cumsum.R b/R/fgt_cumsum.R index 687dd177..cb356415 100644 --- a/R/fgt_cumsum.R +++ b/R/fgt_cumsum.R @@ -38,18 +38,16 @@ format_lfst <- \(lfst, dict) { #' Used as a denominator for FGT and Watts index calculations. #' #' @param LDTg List from format_lfst() with DT and g objects. -#' @param dict data.table dictionary for id/reporting_level encoding (from build_pair_dict()). #' @return data.table with total population by group (columns: id_rl, W). #' @keywords internal -get_total_pop <- \(LDTg, dict) { +get_total_pop <- \(LDTg) { list2env(LDTg, envir = environment()) rm(LDTg) add_vars(g[["groups"]], get_vars(DT, c("weight")) |> fsum(g)) |> setnames(old = "weight", - new = "W") |> - encode_pairs(dict, drop_labels = TRUE) + new = "W") } #' Compute FGT and Watts indices for all groups and poverty lines @@ -110,8 +108,8 @@ fgt_cumsum <- \(LDTg, tpop, povline, # 6) Compute measures (vectorized). Small clamps for numerical safety. CS[, `:=`( headcount = cw / W, - poverty_gap = (z * cw - cwy) / (z_s * W), - poverty_severity = (z2 * cw - 2 * z * cwy + cwy2) / (z2_s * W), + poverty_gap = (z * cw - cwy) / (z * W), + poverty_severity = (z2 * cw - 2 * z * cwy + cwy2) / (z2 * W), watts = (logz * cw - cwylog) / W )] @@ -340,7 +338,7 @@ load_list_refy <- \(input_list){ lfst <- lapply(seq_flex(input_list), \(i) { - x <- lup_files[i] + x <- input_list[i] idn <- fs::path_file(x) |> fs::path_ext_remove() fst::read_fst(x, as.data.table = TRUE) |> From b02fbef1fa03ad6ca5c9c6bd10603eaba3bd6d3c Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Wed, 10 Sep 2025 21:18:06 -0400 Subject: [PATCH 112/203] pip(fill_gaps = TRUE) fully working --- R/fg_pip.R | 4 +--- R/fgt_cumsum.R | 4 +++- man/get_total_pop.Rd | 4 +--- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/R/fg_pip.R b/R/fg_pip.R index 81a6bd11..d4323ef1 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -127,9 +127,7 @@ fg_pip <- function(country, verbose = 0, overid = 2) - setnames(out, - "povline", - "poverty_line") + setnames(out, "povline", "poverty_line") # Ensure that out does not have duplicates out <- fg_remove_duplicates(out, diff --git a/R/fgt_cumsum.R b/R/fgt_cumsum.R index cb356415..d34c3ecd 100644 --- a/R/fgt_cumsum.R +++ b/R/fgt_cumsum.R @@ -113,10 +113,12 @@ fgt_cumsum <- \(LDTg, tpop, povline, watts = (logz * cw - cwylog) / W )] + setnames(CS, "z", "povline") if (!drop_vars) { return(CS) } - get_vars(CS, c("id_rl", "headcount", "poverty_gap", "poverty_severity", "watts")) + get_vars(CS, c("id_rl", "povline", "headcount", + "poverty_gap", "poverty_severity", "watts")) } diff --git a/man/get_total_pop.Rd b/man/get_total_pop.Rd index a83525bd..97b36860 100644 --- a/man/get_total_pop.Rd +++ b/man/get_total_pop.Rd @@ -4,12 +4,10 @@ \alias{get_total_pop} \title{Compute total population by survey and reporting level} \usage{ -get_total_pop(LDTg, dict) +get_total_pop(LDTg) } \arguments{ \item{LDTg}{List from format_lfst() with DT and g objects.} - -\item{dict}{data.table dictionary for id/reporting_level encoding (from build_pair_dict()).} } \value{ data.table with total population by group (columns: id_rl, W). From 47434242f79952f26f9d0584ae7fbbdfa1b29dfb Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Wed, 10 Sep 2025 21:18:26 -0400 Subject: [PATCH 113/203] Increment version number to 1.3.19.9003 --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 0f625a0e..b646fcde 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: pipapi Title: API for the Poverty and Inequality Platform -Version: 1.3.19.9002 +Version: 1.3.19.9003 Authors@R: c(person(given = "Tony", family = "Fujs", From f35bfed10783aa509f8424d16b96892701cee730 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Thu, 11 Sep 2025 10:42:09 -0400 Subject: [PATCH 114/203] fix issue with censoring --- R/pip_agg.R | 2 +- inst/plumber/v1/endpoints.R | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/R/pip_agg.R b/R/pip_agg.R index 52b18bcd..0d5485fb 100644 --- a/R/pip_agg.R +++ b/R/pip_agg.R @@ -14,7 +14,7 @@ pip_agg <- function(country = "ALL", welfare_type = c("all", "consumption", "income"), reporting_level = c("all", "national"), lkup, - censor = TRUE, + censor = FALSE, lkup_hash = lkup$cache_data_id$hash_pip_grp, additional_ind = FALSE) { diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R index 77c83c07..5e42b7c7 100644 --- a/inst/plumber/v1/endpoints.R +++ b/inst/plumber/v1/endpoints.R @@ -281,6 +281,7 @@ function(req, res) { res$serializer <- pipapi::assign_serializer(format = params$format) params$format <- NULL params$version <- NULL + params$censor <- TRUE out <- do.call(pipapi::pip_agg, params) out From cebe90c4889aaf417bd6fb9264339abc7b1e9814 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Thu, 11 Sep 2025 10:46:53 -0400 Subject: [PATCH 115/203] fix bug --- R/pip_grp_new.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pip_grp_new.R b/R/pip_grp_new.R index e4880abb..86f68b6d 100644 --- a/R/pip_grp_new.R +++ b/R/pip_grp_new.R @@ -82,7 +82,7 @@ pip_grp_new <- \(country = "ALL", out <- censor_rows(out, lkup[["censored"]], type = "regions") } - + out } #' Subset country_code values based on matches in *_code columns and country_code From a010a8cd63488a9af72a6882161dd92b168fd767 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Thu, 11 Sep 2025 14:22:38 -0400 Subject: [PATCH 116/203] update ui_hp_stacked to account for use_new --- R/pip_grp_new.R | 2 +- R/ui_home_page.R | 49 +++++++++++++++++++++++++++++++++++------------- 2 files changed, 37 insertions(+), 14 deletions(-) diff --git a/R/pip_grp_new.R b/R/pip_grp_new.R index 86f68b6d..ecd7ce22 100644 --- a/R/pip_grp_new.R +++ b/R/pip_grp_new.R @@ -16,7 +16,7 @@ pip_grp_new <- \(country = "ALL", # subgroups aggregations only supported for "all" countries country <- toupper(country) year <- toupper(year) - reporting_level <- "all" + if (!all(country %in% c("ALL", lkup$query_controls$region$values))) { country <- "ALL" } diff --git a/R/ui_home_page.R b/R/ui_home_page.R index 04105c59..58230063 100644 --- a/R/ui_home_page.R +++ b/R/ui_home_page.R @@ -15,20 +15,43 @@ ui_hp_stacked <- function(povline = 1.9, ref_years <- sort(unique(lkup$ref_lkup$reporting_year)) ref_years <- ref_years[!ref_years %in% c(1981:1989)] - out <- pip_grp( - country = "all", - year = ref_years, - povline = povline, - group_by = "wb", - reporting_level = "national", - censor = FALSE, - lkup = lkup - ) + use_new <- lkup$use_new_lineup_version + + # Run correct function + #------------------------------------- + out <- if (use_new) { + x <- pip_grp_new(country = "ALL", + year = ref_years, + povline = povline, + welfare_type = "all", + reporting_level = "national", + lkup = lkup, + censor = FALSE) + + regs <- lkup$aux_files$country_list[, region_code] |> + funique() |> + c("WLD") + + x |> + fsubset(region_code %in% regs) + + } else { + pip_grp( + country = "all", + year = ref_years, + povline = povline, + group_by = "wb", + reporting_level = "national", + censor = FALSE, + lkup = lkup + ) + + } + + out <- get_vars(out, + c("region_code", "reporting_year", + "poverty_line", "pop_in_poverty")) - out <- out[, c( - "region_code", "reporting_year", - "poverty_line", "pop_in_poverty" - )] return(out) } From 48cf146107d8f3b7c13a9c67a1a87ad57e80747d Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Thu, 11 Sep 2025 14:26:16 -0400 Subject: [PATCH 117/203] improve ui_hp_countries --- R/ui_home_page.R | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/R/ui_home_page.R b/R/ui_home_page.R index 58230063..6b230aea 100644 --- a/R/ui_home_page.R +++ b/R/ui_home_page.R @@ -80,13 +80,16 @@ ui_hp_countries <- function(country = c("IDN", "CIV"), ) # Add pop_in_poverty and scale according to pop_units - out$pop_in_poverty <- out$reporting_pop * out$headcount / pop_units - out$reporting_pop <- out$reporting_pop / pop_units + out[, + `:=`( + pop_in_poverty = reporting_pop * headcount / pop_units, + reporting_pop = reporting_pop / pop_units + )] - out <- out[, c( + out <- get_vars(out, c( "region_code", "country_code", "reporting_year", "poverty_line", "reporting_pop", "pop_in_poverty" - )] + )) return(out) } From ae6818c9e3e7577bfc4657475007fd4de30ad2d2 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Thu, 11 Sep 2025 14:54:56 -0400 Subject: [PATCH 118/203] improve ui_pc_charts and ui_pc_regional --- R/pip.R | 2 +- R/ui_poverty_indicators.R | 33 +++++++++++++++++++++------------ inst/plumber/v1/endpoints.R | 3 ++- 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/R/pip.R b/R/pip.R index 0689ef26..44a4d556 100644 --- a/R/pip.R +++ b/R/pip.R @@ -52,7 +52,7 @@ pip <- function(country = "ALL", reporting_level = c("all", "national", "rural", "urban"), ppp = NULL, lkup, - censor = TRUE, + censor = FALSE, lkup_hash = lkup$cache_data_id$hash_pip, additional_ind = FALSE) { diff --git a/R/ui_poverty_indicators.R b/R/ui_poverty_indicators.R index f2758a13..b4e2b26d 100644 --- a/R/ui_poverty_indicators.R +++ b/R/ui_poverty_indicators.R @@ -14,6 +14,7 @@ ui_pc_charts <- function(country = c("AGO"), welfare_type = c("all", "consumption", "income"), reporting_level = c("all", "national", "rural", "urban"), pop_units = 1e6, + censor = TRUE, lkup) { # Set returned columns return_cols <- lkup$return_cols$ui_pc_charts$cols @@ -30,12 +31,17 @@ ui_pc_charts <- function(country = c("AGO"), fill_gaps = fill_gaps, group_by = group_by, reporting_level = reporting_level, - lkup = lkup + lkup = lkup, + censor = censor ) # Add pop_in_poverty and scale according to pop_units - out$pop_in_poverty <- out$reporting_pop * out$headcount / pop_units - out$reporting_pop <- out$reporting_pop / pop_units + out[, + `:=`( + pop_in_poverty = reporting_pop * headcount / pop_units, + reporting_pop = reporting_pop / pop_units + )] + # handle different responses when fill_gaps = TRUE / FALSE # Return all columns when survey years are requested @@ -45,7 +51,7 @@ ui_pc_charts <- function(country = c("AGO"), out <- out[, .SD, .SDcols = return_cols] } else { - out <- out[, .SD, .SDcols = return_cols] + out <- get_vars(out, return_cols) # Set non-interpolated variables to NA if line-up years are requested out[, (inequality_indicators) := NA] out[, survey_comparability := NA] # remove manually survey_comparability @@ -76,16 +82,19 @@ ui_pc_regional <- function(country = "ALL", } out <- pip_agg(country = country, - year = year, - group_by = "wb", - reporting_level = "national", - povline = povline, - lkup = lkup, - censor = TRUE) + year = year, + group_by = "wb", + reporting_level = "national", + povline = povline, + lkup = lkup, + censor = TRUE) # Add pop_in_poverty and scale according to pop_units - out$pop_in_poverty <- out$reporting_pop * out$headcount / pop_units - out$reporting_pop <- out$reporting_pop / pop_units + out[, + `:=`( + pop_in_poverty = reporting_pop * headcount / pop_units, + reporting_pop = reporting_pop / pop_units + )] out <- out[estimate_type == "actual"] diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R index 5e42b7c7..102a7dc6 100644 --- a/inst/plumber/v1/endpoints.R +++ b/inst/plumber/v1/endpoints.R @@ -281,7 +281,6 @@ function(req, res) { res$serializer <- pipapi::assign_serializer(format = params$format) params$format <- NULL params$version <- NULL - params$censor <- TRUE out <- do.call(pipapi::pip_agg, params) out @@ -676,6 +675,7 @@ function(req) { params <- req$argsQuery params$lkup <- lkups$versions_paths[[req$argsQuery$version]] params$version <- NULL + params$censor <- TRUE out <- do.call(pipapi::ui_pc_charts, params) return(out) @@ -700,6 +700,7 @@ function(req) { params$lkup <- lkups$versions_paths[[req$argsQuery$version]] params$pop_units <- 1 params$version <- NULL + params$censor <- TRUE do.call(pipapi::ui_pc_charts, params) From 5be1cdc7005b26277dfb6d8846a8cb5115fdedc4 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Thu, 11 Sep 2025 15:23:05 -0400 Subject: [PATCH 119/203] fix get_mean_median to account for welfare_type in survey years --- R/ui_country_profile.R | 14 ++++---------- R/utils.R | 10 ++++++++-- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/R/ui_country_profile.R b/R/ui_country_profile.R index ad64f538..f730c1f8 100644 --- a/R/ui_country_profile.R +++ b/R/ui_country_profile.R @@ -52,14 +52,6 @@ ui_cp_ki_headcount <- function(country, reporting_level = "all", lkup = lkup) - # Select max year and country - # res <- - # res_all[res_all[, - # .I[which.max(reporting_year)], - # by = country_code]$V1 - # ][ - # country_code == country - # ] res <- res_all[country_code == country ][, @@ -83,8 +75,10 @@ ui_cp_ki_headcount <- function(country, ### TEMP FIX END out <- data.table::data.table( - country_code = country, reporting_year = res$reporting_year, - poverty_line = povline, headcount = res$headcount + country_code = country, + reporting_year = res$reporting_year, + poverty_line = povline, + headcount = res$headcount ) return(out) } diff --git a/R/utils.R b/R/utils.R index 60fa57ac..f4eef7da 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1481,16 +1481,22 @@ get_mean_median <- \(fgt, lkup, fill_gaps) { dist <- get_vars(lkup$lineup_dist_stats, c("country_code", "reporting_year", "reporting_level", "mean", "median")) + by_var <- c('country_code', "reporting_year", "reporting_level") } else { dist <- get_vars(lkup$dist_stats, c("country_code", "reporting_year", - "reporting_level", "mean", "survey_median_ppp")) + "reporting_level", "mean", + "survey_median_ppp", "welfare_type")) setnames(dist, "survey_median_ppp", "median") + by_var <- c('country_code', + "reporting_year", + "reporting_level", + "welfare_type") } join(x = fgt, y = dist, - on = c('country_code', "reporting_year", "reporting_level"), + on = by_var, how = "left", validate = "m:1", # multiple povlines verbose = 0L) From 02c2ca021c777968fa839602ed5e79213b565076 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Thu, 11 Sep 2025 16:22:54 -0400 Subject: [PATCH 120/203] improve country_profile --- R/ui_country_profile.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/ui_country_profile.R b/R/ui_country_profile.R index f730c1f8..20fe7dac 100644 --- a/R/ui_country_profile.R +++ b/R/ui_country_profile.R @@ -311,7 +311,7 @@ ui_cp_download <- function(country = "AGO", hc <- lapply(country, \(.) { ui_cp_ki_headcount(., year, povline, lkup) }) |> - data.table::rbindlist(use.names = TRUE) + rbindlist(use.names = TRUE) df <- lkup[["cp_lkups"]]$flat$flat_cp df <- df[country_code %chin% country] From 391c78195913090391e3a3fc19603f0898abf4be Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Sat, 13 Sep 2025 20:54:36 -0400 Subject: [PATCH 121/203] Increment version number to 1.3.19.9004 --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index b646fcde..2e8242b1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: pipapi Title: API for the Poverty and Inequality Platform -Version: 1.3.19.9003 +Version: 1.3.19.9004 Authors@R: c(person(given = "Tony", family = "Fujs", From 79ff74226ff49f7132a35026aaa9e0bb0d82d969 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Mon, 15 Sep 2025 17:12:44 -0400 Subject: [PATCH 122/203] add popshare to subset_lkup --- R/rg_pip.R | 3 ++- R/utils.R | 9 ++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/R/rg_pip.R b/R/rg_pip.R index 769878ae..279f4d49 100644 --- a/R/rg_pip.R +++ b/R/rg_pip.R @@ -30,7 +30,8 @@ rg_pip <- function(country, data_dir = data_dir, povline = povline, cache_file_path = cache_file_path, - fill_gaps = FALSE + fill_gaps = FALSE, + popshare = popshare ) data_present_in_master <- metadata$data_present_in_master diff --git a/R/utils.R b/R/utils.R index f4eef7da..88011ff4 100644 --- a/R/utils.R +++ b/R/utils.R @@ -16,9 +16,16 @@ subset_lkup <- function(country, data_dir = NULL, povline, cache_file_path, - fill_gaps + fill_gaps, + popshare = NULL ) { + if (!is.null(popshare)) { + return(list(data_present_in_master = NULL, + lkup = lkup, + povline = povline)) + } + # STEP 1 - Keep every row by default keep <- rep(TRUE, nrow(lkup)) # STEP 2 - Select countries From 3d499358e03d5bbcddf7984cf573636852ca3213 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Mon, 15 Sep 2025 18:02:38 -0400 Subject: [PATCH 123/203] update round to 2 decimals --- inst/plumber/v1/endpoints.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R index 102a7dc6..e7345ecb 100644 --- a/inst/plumber/v1/endpoints.R +++ b/inst/plumber/v1/endpoints.R @@ -151,7 +151,7 @@ function(req, res) { # Round poverty line # This is to prevent users to abuse the API by passing too many decimals if (!is.null(req$argsQuery$povline)) { - req$argsQuery$povline <- round(req$argsQuery$povline, digits = 3) + req$argsQuery$povline <- round(req$argsQuery$povline, digits = 2) } } plumber::forward() From ec932256e9249bc04c48210e5c69fc9d78f38d4a Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Mon, 15 Sep 2025 18:05:38 -0400 Subject: [PATCH 124/203] In case of error, make sure you log the endpoint closes #432 --- inst/plumber/v1/plumber.R | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/inst/plumber/v1/plumber.R b/inst/plumber/v1/plumber.R index 9df24ad4..c5441da4 100644 --- a/inst/plumber/v1/plumber.R +++ b/inst/plumber/v1/plumber.R @@ -28,6 +28,12 @@ plumber::pr(endpoints_path) |> plumber::pr_hook("exit", function() { # log_info('Bye bye: {proc.time()[["elapsed"]]}') }) |> + plumber::pr_set_error(function(req, res, err) { + # In case of error, make sure you log the endpoint for #432 + method <- req$REQUEST_METHOD + path <- req$PATH_INFO + cat(sprintf("ERROR at %s %s: %s\n", method, path, err$message)) + }) |> # Set API spec plumber::pr_set_api_spec(api = function(spec) { spec$info$version <- utils::packageVersion("pipapi") |> From 2b40da8aeee31a901f0c94ab7fa0c591edaab0d0 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Mon, 15 Sep 2025 18:07:41 -0400 Subject: [PATCH 125/203] include povline in aggregates list --- R/add_agg_stats.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/add_agg_stats.R b/R/add_agg_stats.R index 5b93067d..bbc81ed9 100644 --- a/R/add_agg_stats.R +++ b/R/add_agg_stats.R @@ -16,7 +16,8 @@ add_agg_stats <- function(df, aggregated_list <- split(aggregated, interaction( aggregated$country_code, - aggregated$reporting_year + aggregated$reporting_year, + aggregated$poverty_line ), drop = TRUE ) From 33b685d3482513387967b9e31d75deaf2dcd33fc Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Mon, 15 Sep 2025 18:08:48 -0400 Subject: [PATCH 126/203] add gc() each time it disconnects --- R/duckdb_func.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/R/duckdb_func.R b/R/duckdb_func.R index 5071e1ca..bc9c89be 100644 --- a/R/duckdb_func.R +++ b/R/duckdb_func.R @@ -265,6 +265,7 @@ update_master_file <- function(dat, ")) duckdb::dbDisconnect(write_con) + invisible(gc()) if (nr > 0 && verbose) message(glue("{target_file} is updated.")) return(nr) @@ -335,6 +336,7 @@ reset_cache <- function(pass = Sys.getenv('PIP_CACHE_LOCAL_KEY'), DBI::dbExecute(write_con, "DELETE from fg_master_file") } duckdb::dbDisconnect(write_con) + invisible(gc()) } create_duckdb_file <- function(cache_file_path) { @@ -360,6 +362,7 @@ create_duckdb_file <- function(cache_file_path) { watts DOUBLE )") DBI::dbDisconnect(con) + invisible(gc()) } @@ -394,6 +397,7 @@ load_inter_cache <- function(lkup = NULL, # connection object if it is not closed More details here # https://app.clickup.com/t/868cdpe3q duckdb::dbDisconnect(con) + invisible(gc()) setDT(master_file) } From 6ae50037c1da428037234f6a9e56aa9b5e0b2823 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Mon, 15 Sep 2025 18:10:11 -0400 Subject: [PATCH 127/203] set pov line to NULL if popshare is defined in fg --- R/fg_pip.R | 1 + R/fg_pip_old.R | 2 ++ 2 files changed, 3 insertions(+) diff --git a/R/fg_pip.R b/R/fg_pip.R index d4323ef1..b0761839 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -19,6 +19,7 @@ fg_pip <- function(country, interpolation_list <- lkup$interpolation_list data_dir <- lkup$data_root refy_lkup <- lkup$refy_lkup # cleaned refy table, unique by country-years but some columns removed in order to do that + if (!is.null(popshare)) povline <- NULL cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb") # fg_pip is called from multiple places like pip, pip_grp_logic. We have connection object created diff --git a/R/fg_pip_old.R b/R/fg_pip_old.R index 2da40213..444df522 100644 --- a/R/fg_pip_old.R +++ b/R/fg_pip_old.R @@ -19,6 +19,8 @@ fg_pip_old <- function(country, data_dir <- lkup$data_root ref_lkup <- lkup$ref_lkup + if (!is.null(popshare)) povline <- NULL + cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb") # fg_pip is called from multiple places like pip, pip_grp_logic. We have connection object created # when calling from `pip`. For other functions we create it here. From 0a24c57fe3f2c76f2e87d6f7519669a52738d7d7 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Mon, 15 Sep 2025 18:13:53 -0400 Subject: [PATCH 128/203] check pip_grp_logic with changes made to old_DEV branch last commit --- R/pip_grp_logic.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pip_grp_logic.R b/R/pip_grp_logic.R index e6f54e9b..cc23d253 100644 --- a/R/pip_grp_logic.R +++ b/R/pip_grp_logic.R @@ -73,7 +73,7 @@ pip_grp_logic <- function(country = "ALL", ## This will then be re-used in various part of the function ## This is to avoid re-computing and re-loading the same data over and over fg_pip_master <- fg_pip_old( - country = c(lcv$md_off_reg, lcv$user_off_reg), + country = c(lcv$md_off_reg, lcv$user_alt_agg, lcv$user_off_reg), year = year, povline = povline, popshare = NULL, From 3dc0dc38026c363e98ad687c0a3d843093662899 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Mon, 15 Sep 2025 18:17:12 -0400 Subject: [PATCH 129/203] limit set of poverty lines to store --- R/pip_new_lineups.R | 10 +++++++++- R/pip_old.R | 10 +++++++++- R/pip_old_lineups.R | 12 +++++++++++- 3 files changed, 29 insertions(+), 3 deletions(-) diff --git a/R/pip_new_lineups.R b/R/pip_new_lineups.R index 5153f037..84a2c141 100644 --- a/R/pip_new_lineups.R +++ b/R/pip_new_lineups.R @@ -315,7 +315,15 @@ treat_cache_and_main <- \(out, cache_file_path, rowbind(cached_data) } - update_master_file(main_data, cache_file_path, fill_gaps) + pl <- c(seq(from = 0.01, to = 5, by = 0.01), + seq(from = 5.1, to = 20, by = 0.1), + seq(from = 21, to = 100, by = 1), + seq(from = 105, to = 900, by = 5)) + # Only update master file if poverty line is part of this pl list + # Using round to avoid precision error with decimals + if (all(round(povline, 2) %in% round(pl, 2))) { + update_master_file(main_data, cache_file_path, fill_gaps) + } } else { out <- cached_data diff --git a/R/pip_old.R b/R/pip_old.R index ae62bb22..5efc61ae 100644 --- a/R/pip_old.R +++ b/R/pip_old.R @@ -147,7 +147,15 @@ pip_old <- function(country = "ALL", # This will be used only for development purpose and we don't have any intention to use it in production. if(!is.null(cached_data)) { # Update cache with data - update_master_file(main_data, cache_file_path, fill_gaps) + pl <- c(seq(from = 0.01, to = 5, by = 0.01), + seq(from = 5.1, to = 20, by = 0.1), + seq(from = 21, to = 100, by = 1), + seq(from = 105, to = 900, by = 5)) + # Only update master file if poverty line is part of this pl list + # Using round to avoid precision error with decimals + if (all(round(povline, 2) %in% round(pl, 2))) { + update_master_file(main_data, cache_file_path, fill_gaps) + } } } else { out <- cached_data diff --git a/R/pip_old_lineups.R b/R/pip_old_lineups.R index fc01620b..be67050c 100644 --- a/R/pip_old_lineups.R +++ b/R/pip_old_lineups.R @@ -145,7 +145,17 @@ pip_old_lineups <- function(country = "ALL", out <- main_data |> rowbind(cached_data) - update_master_file(main_data, cache_file_path, fill_gaps) + + pl <- c(seq(from = 0.01, to = 5, by = 0.01), + seq(from = 5.1, to = 20, by = 0.1), + seq(from = 21, to = 100, by = 1), + seq(from = 105, to = 900, by = 5)) + # Only update master file if poverty line is part of this pl list + # Using round to avoid precision error with decimals + if (all(round(povline, 2) %in% round(pl, 2))) { + update_master_file(main_data, cache_file_path, fill_gaps) + } + } else { out <- cached_data From a099a386b7649a64a5ad96b28790b636097d70a3 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Mon, 15 Sep 2025 18:20:34 -0400 Subject: [PATCH 130/203] store vars in env defined in zzz --- R/pip_new_lineups.R | 5 +---- R/pip_old.R | 5 +---- R/pip_old_lineups.R | 5 +---- R/zzz.R | 8 ++++++++ 4 files changed, 11 insertions(+), 12 deletions(-) diff --git a/R/pip_new_lineups.R b/R/pip_new_lineups.R index 84a2c141..7838e3a1 100644 --- a/R/pip_new_lineups.R +++ b/R/pip_new_lineups.R @@ -315,10 +315,7 @@ treat_cache_and_main <- \(out, cache_file_path, rowbind(cached_data) } - pl <- c(seq(from = 0.01, to = 5, by = 0.01), - seq(from = 5.1, to = 20, by = 0.1), - seq(from = 21, to = 100, by = 1), - seq(from = 105, to = 900, by = 5)) + pl <- get_from_pipapienv("pl_to_store") # Only update master file if poverty line is part of this pl list # Using round to avoid precision error with decimals if (all(round(povline, 2) %in% round(pl, 2))) { diff --git a/R/pip_old.R b/R/pip_old.R index 5efc61ae..34fc5371 100644 --- a/R/pip_old.R +++ b/R/pip_old.R @@ -147,10 +147,7 @@ pip_old <- function(country = "ALL", # This will be used only for development purpose and we don't have any intention to use it in production. if(!is.null(cached_data)) { # Update cache with data - pl <- c(seq(from = 0.01, to = 5, by = 0.01), - seq(from = 5.1, to = 20, by = 0.1), - seq(from = 21, to = 100, by = 1), - seq(from = 105, to = 900, by = 5)) + pl <- get_from_pipapienv("pl_to_store") # Only update master file if poverty line is part of this pl list # Using round to avoid precision error with decimals if (all(round(povline, 2) %in% round(pl, 2))) { diff --git a/R/pip_old_lineups.R b/R/pip_old_lineups.R index be67050c..f7000674 100644 --- a/R/pip_old_lineups.R +++ b/R/pip_old_lineups.R @@ -146,10 +146,7 @@ pip_old_lineups <- function(country = "ALL", rowbind(cached_data) - pl <- c(seq(from = 0.01, to = 5, by = 0.01), - seq(from = 5.1, to = 20, by = 0.1), - seq(from = 21, to = 100, by = 1), - seq(from = 105, to = 900, by = 5)) + pl <- get_from_pipapienv("pl_to_store") # Only update master file if poverty line is part of this pl list # Using round to avoid precision error with decimals if (all(round(povline, 2) %in% round(pl, 2))) { diff --git a/R/zzz.R b/R/zzz.R index d3c42be7..302841c5 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -43,6 +43,14 @@ pipapi_default_options <- list( min(8) set_in_pipapienv("cores_to_use", cores_to_use) + + # pov lines to store + pl <- c(seq(from = 0.01, to = 5, by = 0.01), + seq(from = 5.1, to = 20, by = 0.1), + seq(from = 21, to = 100, by = 1), + seq(from = 105, to = 900, by = 5)) + set_in_pipapienv("pl_to_store", pl) + invisible() } From 64eedb585e109ce450be92b449489d1edb7967bb Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Mon, 15 Sep 2025 18:23:18 -0400 Subject: [PATCH 131/203] update data documentation --- R/pipapi-package.R | 11 ++++++ data-raw/data.R | 72 +++++++++--------------------------- man/empty_response_fg_add.Rd | 16 ++++++++ man/pip.Rd | 2 +- man/pip_agg.Rd | 2 +- man/subset_lkup.Rd | 6 ++- man/ui_pc_charts.Rd | 3 ++ 7 files changed, 54 insertions(+), 58 deletions(-) create mode 100644 man/empty_response_fg_add.Rd diff --git a/R/pipapi-package.R b/R/pipapi-package.R index 7a52927d..c45284df 100644 --- a/R/pipapi-package.R +++ b/R/pipapi-package.R @@ -59,6 +59,17 @@ NULL #' @format Data frame with 0 rows and 53 columns NULL +#' Dataframe for fill gaps empty response with additional variables +#' +#' @docType data +#' @keywords datasets +#' @name empty_response_fg_add +#' @usage data(empty_response_fg_add) +#' @format Data frame with 0 rows and 56 columns +NULL + + + utils::globalVariables( c( ".", diff --git a/data-raw/data.R b/data-raw/data.R index b6d075be..a0a6a16f 100644 --- a/data-raw/data.R +++ b/data-raw/data.R @@ -35,61 +35,22 @@ empty_response_grp <- pip_grp("all", year, lkup = lkup, group_by = "wb") empty_response_grp <- empty_response_grp[-c(1:nrow(empty_response_grp))] -empty_response_fg <- data.table::data.table( - country_code = character(0), - survey_id = character(0), - cache_id = character(0), - wb_region_code = character(0), - reporting_year = numeric(0), - surveyid_year = character(0), - survey_year = numeric(0), - survey_time = character(0), - survey_acronym = character(0), - survey_coverage = character(0), - survey_comparability = numeric(0), - comparable_spell = character(0), - welfare_type = character(0), - reporting_level = character(0), - survey_mean_lcu = numeric(0), - survey_mean_ppp = numeric(0), - survey_median_ppp = numeric(0), - survey_median_lcu = numeric(0), - predicted_mean_ppp = numeric(0), - ppp = numeric(0), - cpi = numeric(0), - reporting_pop = numeric(0), - reporting_gdp = numeric(0), - reporting_pce = numeric(0), - pop_data_level = character(0), - gdp_data_level = character(0), - pce_data_level = character(0), - cpi_data_level = character(0), - ppp_data_level = character(0), - distribution_type = character(0), - gd_type = character(0), - is_interpolated = logical(0), - is_used_for_line_up = logical(0), - is_used_for_aggregation = logical(0), - estimation_type = character(0), - interpolation_id = character(0), - display_cp = numeric(0), - country_name = character(0), - africa_split = character(0), - africa_split_code = character(0), - region_name = character(0), - region_code = character(0), - world = character(0), - world_code = character(0), - path = character(0), - data_interpolation_id = character(0), - poverty_line = numeric(0), - mean = numeric(0), - median = numeric(0), - headcount = numeric(0), - poverty_gap = numeric(0), - poverty_severity = numeric(0), - watts = numeric(0) -) +fg <- fg_pip( + ctr, + year = year, + povline = 3, + welfare_type = "all", + reporting_level = "all", + popshare = NULL, + lkup = lkup +) |> + rbindlist() +empty_response_fg <- fg[-1] + + +add_vars_out_of_pipeline(fg, fill_gaps = TRUE, lkup = lkup) + +empty_response_fg_add <- fg[-1] usethis::use_data( empty_response, @@ -97,5 +58,6 @@ usethis::use_data( reporting_level_list, empty_response_grp, empty_response_fg, + empty_response_fg_add, overwrite = TRUE ) diff --git a/man/empty_response_fg_add.Rd b/man/empty_response_fg_add.Rd new file mode 100644 index 00000000..7bb34bb1 --- /dev/null +++ b/man/empty_response_fg_add.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pipapi-package.R +\docType{data} +\name{empty_response_fg_add} +\alias{empty_response_fg_add} +\title{Dataframe for fill gaps empty response with additional variables} +\format{ +Data frame with 0 rows and 56 columns +} +\usage{ +data(empty_response_fg_add) +} +\description{ +Dataframe for fill gaps empty response with additional variables +} +\keyword{datasets} diff --git a/man/pip.Rd b/man/pip.Rd index 0161eeef..f128449b 100644 --- a/man/pip.Rd +++ b/man/pip.Rd @@ -15,7 +15,7 @@ pip( reporting_level = c("all", "national", "rural", "urban"), ppp = NULL, lkup, - censor = TRUE, + censor = FALSE, lkup_hash = lkup$cache_data_id$hash_pip, additional_ind = FALSE ) diff --git a/man/pip_agg.Rd b/man/pip_agg.Rd index 3eb27292..f47dbba8 100644 --- a/man/pip_agg.Rd +++ b/man/pip_agg.Rd @@ -14,7 +14,7 @@ pip_agg( welfare_type = c("all", "consumption", "income"), reporting_level = c("all", "national"), lkup, - censor = TRUE, + censor = FALSE, lkup_hash = lkup$cache_data_id$hash_pip_grp, additional_ind = FALSE ) diff --git a/man/subset_lkup.Rd b/man/subset_lkup.Rd index d09baf8a..ee5dd4cf 100644 --- a/man/subset_lkup.Rd +++ b/man/subset_lkup.Rd @@ -14,7 +14,8 @@ subset_lkup( data_dir = NULL, povline, cache_file_path, - fill_gaps + fill_gaps, + popshare = NULL ) } \arguments{ @@ -39,6 +40,9 @@ for region selection} \item{fill_gaps}{logical: If set to TRUE, will interpolate / extrapolate values for missing years} + +\item{popshare}{numeric: Proportion of the population living below the +poverty line} } \value{ data.frame diff --git a/man/ui_pc_charts.Rd b/man/ui_pc_charts.Rd index 49715b48..9801ecfe 100644 --- a/man/ui_pc_charts.Rd +++ b/man/ui_pc_charts.Rd @@ -13,6 +13,7 @@ ui_pc_charts( welfare_type = c("all", "consumption", "income"), reporting_level = c("all", "national", "rural", "urban"), pop_units = 1e+06, + censor = TRUE, lkup ) } @@ -36,6 +37,8 @@ sub-groups} \item{pop_units}{numeric: Units used to express population numbers (default to million)} +\item{censor}{logical: Triggers censoring of country/year statistics} + \item{lkup}{list: A list of lkup tables} } \value{ From d057467c15d130e491607ef0d3fa45671f58da94 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Tue, 16 Sep 2025 10:20:33 -0400 Subject: [PATCH 132/203] first attempt of infer_poverty_line --- R/infer_poverty_line.R | 68 ++++++++++++++++++++++++++++++++++++++++++ R/pip_old.R | 3 ++ R/rg_pip.R | 8 +++++ 3 files changed, 79 insertions(+) create mode 100644 R/infer_poverty_line.R diff --git a/R/infer_poverty_line.R b/R/infer_poverty_line.R new file mode 100644 index 00000000..39dd517e --- /dev/null +++ b/R/infer_poverty_line.R @@ -0,0 +1,68 @@ +infer_poverty_line <- function(welfare, weight, popshare = 0.5, + include = FALSE, + method = c("nearest","interp"), + assume_sorted = FALSE) { + method <- match.arg(method) + + # basic checks + if (length(welfare) != length(weight)) stop("welfare and weight must have the same length") + if (anyNA(welfare) || anyNA(weight)) stop("welfare and weight cannot contain NA") + if (any(weight < 0)) stop("weights must be non-negative") + if (!length(popshare)) return(numeric(0)) + + # clamp probs + p <- pmin(pmax(as.numeric(popshare), 0), 1) + + # fast sort (or not) + if (!assume_sorted) { + o <- if (requireNamespace("data.table", quietly = TRUE)) { + data.table::forder(welfare, na.last = FALSE) + } else { + order(welfare, na.last = NA, method = "radix") + } + y <- welfare[o]; w <- weight[o] + } else { + y <- welfare; w <- weight + } + + if (method == "interp") { + # collapse::fquantile: weighted linear interpolation + # - 'sorted' tells fquantile that 'y' is already sorted + # - 'w' supplies weights + # 'include' is not used here: interpolation doesn't have that discrete toggle + return(collapse::fquantile(y, probs = p, w = w, sorted = TRUE)) + } + + # ---- method == "nearest" (matches your function) ---- + # cumulative weight fractions + W <- collapse::fsum(w) + if (W <= 0) stop("sum(weight) must be > 0") + cw <- collapse::fcumsum(w) + prob <- cw / W + n <- length(y) + + # for each p, find the nearest cumulative location (ties -> lower index) + j <- findInterval(p, prob, left.open = FALSE) # j ∈ {0..n} + j[j < 0L] <- 0L; j[j > n] <- n + + prev_idx <- pmax.int(j, 1L) # 1..n + next_idx <- pmin.int(j + 1L, n) # 1..n + d_prev <- p - prob[prev_idx] + d_next <- prob[next_idx] - p + use_next <- (d_next < d_prev) & (j < n) + idx <- ifelse(use_next, next_idx, prev_idx) # final index + + if (!include) { + # take the discrete value at the nearest location + return(y[idx]) + } else { + # average the two neighbors using their weights (your original rule) + idx2 <- pmin.int(idx + 1L, n) + wi <- w[idx] + wi2 <- w[idx2] + s <- wi + wi2 + num <- wi * y[idx] + wi2 * y[idx2] + out <- ifelse((idx == n) | (s <= 0), y[idx], num / s) + return(out) + } +} diff --git a/R/pip_old.R b/R/pip_old.R index 34fc5371..3378876d 100644 --- a/R/pip_old.R +++ b/R/pip_old.R @@ -307,6 +307,9 @@ rg_pip_old <- function(country, valid_regions <- lkup$query_controls$region$values svy_lkup <- lkup$svy_lkup data_dir <- lkup$data_root + # povline is set to NULL if popshare is given + if (!is.null(popshare)) povline <- NULL + cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb") diff --git a/R/rg_pip.R b/R/rg_pip.R index 279f4d49..4cdf0181 100644 --- a/R/rg_pip.R +++ b/R/rg_pip.R @@ -17,6 +17,8 @@ rg_pip <- function(country, valid_regions <- lkup$query_controls$region$values svy_lkup <- lkup$svy_lkup data_dir <- lkup$data_root + # povline is set to NULL if popshare is given + if (!is.null(popshare)) povline <- NULL cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb") @@ -52,6 +54,12 @@ rg_pip <- function(country, # load data lt <- load_data_list(metadata) + # Calculate and update poverty line if popshare is passed + if (!is.null(popshare)) { + povline <- lapply(lt, \(x) { + wbpip:::md_infer_poverty_line(x$welfare, x$weight, popshare) + }) + } # parallelization # res <- get_pov_estimates(lt, povline = povline) From 819caaae295e9e286cdf688518d8625fffa1b225 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Tue, 16 Sep 2025 10:27:39 -0400 Subject: [PATCH 133/203] document --- R/infer_poverty_line.R | 28 +++++++++++++++++++++++++ man/infer_poverty_line.Rd | 44 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) create mode 100644 man/infer_poverty_line.Rd diff --git a/R/infer_poverty_line.R b/R/infer_poverty_line.R index 39dd517e..4ee9cb53 100644 --- a/R/infer_poverty_line.R +++ b/R/infer_poverty_line.R @@ -1,3 +1,31 @@ +#' Infer the poverty line for a given population share +#' +#' Computes the welfare value (poverty line) corresponding to a given population +#' share, using either nearest or interpolated weighted quantile methods. +#' Supports both discrete (nearest) and linear interpolation approaches, and can +#' optionally average neighbors for ties. +#' +#' @param welfare Numeric vector of welfare values (e.g., income or +#' consumption). +#' @param weight Numeric vector of sampling weights (must be non-negative, same +#' length as welfare). +#' @param popshare Numeric vector of population shares (probabilities in [0,1]); +#' default is 0.5 (median). +#' @param include Logical; if TRUE, averages neighbors for ties (only for method +#' = "nearest"). +#' @param method Character; either "nearest" (default, discrete quantile) or +#' "interp" (weighted linear interpolation). +#' @param assume_sorted Logical; if TRUE, assumes welfare and weight are already +#' sorted by welfare. +#' +#' @return Numeric vector of poverty line(s) corresponding to the requested +#' population share(s). +#' @details +#' - If method = "nearest", returns the welfare value at the closest cumulative weight fraction to each popshare. +#' - If method = "interp", uses collapse::fquantile for weighted linear interpolation. +#' - If include = TRUE (and method = "nearest"), averages the two closest neighbors using their weights. +#' - Returns numeric(0) if popshare is empty. +#' @keywords internal infer_poverty_line <- function(welfare, weight, popshare = 0.5, include = FALSE, method = c("nearest","interp"), diff --git a/man/infer_poverty_line.Rd b/man/infer_poverty_line.Rd new file mode 100644 index 00000000..3344c543 --- /dev/null +++ b/man/infer_poverty_line.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/infer_poverty_line.R +\name{infer_poverty_line} +\alias{infer_poverty_line} +\title{Infer the poverty line for a given population share} +\usage{ +infer_poverty_line( + welfare, + weight, + popshare = 0.5, + include = FALSE, + method = c("nearest", "interp"), + assume_sorted = FALSE +) +} +\arguments{ +\item{welfare}{Numeric vector of welfare values (e.g., income or consumption).} + +\item{weight}{Numeric vector of sampling weights (must be non-negative, same length as welfare).} + +\item{popshare}{Numeric vector of population shares (probabilities in \link{0,1}); default is 0.5 (median).} + +\item{include}{Logical; if TRUE, averages neighbors for ties (only for method = "nearest").} + +\item{method}{Character; either "nearest" (default, discrete quantile) or "interp" (weighted linear interpolation).} + +\item{assume_sorted}{Logical; if TRUE, assumes welfare and weight are already sorted by welfare.} +} +\value{ +Numeric vector of poverty line(s) corresponding to the requested population share(s). +} +\description{ +Computes the welfare value (poverty line) corresponding to a given population share, using either nearest or interpolated weighted quantile methods. +Supports both discrete (nearest) and linear interpolation approaches, and can optionally average neighbors for ties. +} +\details{ +\itemize{ +\item If method = "nearest", returns the welfare value at the closest cumulative weight fraction to each popshare. +\item If method = "interp", uses collapse::fquantile for weighted linear interpolation. +\item If include = TRUE (and method = "nearest"), averages the two closest neighbors using their weights. +\item Returns numeric(0) if popshare is empty. +} +} +\keyword{internal} From bd23ba13a56fc75d1343502b92d7656ae6788d36 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Tue, 16 Sep 2025 11:19:26 -0400 Subject: [PATCH 134/203] multiple popshare for svy years. --- R/infer_poverty_line.R | 44 ++++++++++++++++++++++-------------------- R/rg_pip.R | 24 ++++++++++++++++------- R/utils.R | 12 ++++++------ 3 files changed, 46 insertions(+), 34 deletions(-) diff --git a/R/infer_poverty_line.R b/R/infer_poverty_line.R index 4ee9cb53..9b3a53fe 100644 --- a/R/infer_poverty_line.R +++ b/R/infer_poverty_line.R @@ -29,13 +29,13 @@ infer_poverty_line <- function(welfare, weight, popshare = 0.5, include = FALSE, method = c("nearest","interp"), - assume_sorted = FALSE) { + assume_sorted = TRUE) { method <- match.arg(method) - # basic checks - if (length(welfare) != length(weight)) stop("welfare and weight must have the same length") - if (anyNA(welfare) || anyNA(weight)) stop("welfare and weight cannot contain NA") - if (any(weight < 0)) stop("weights must be non-negative") + # defenses + if (length(welfare) != length(weight)) cli::cli_abort("welfare and weight must have the same length") + if (anyNA(welfare) || anyNA(weight)) cli::cli_abort("welfare and weight cannot contain NA") + if (any(weight < 0)) cli::cli_abort("weights must be non-negative") if (!length(popshare)) return(numeric(0)) # clamp probs @@ -43,54 +43,56 @@ infer_poverty_line <- function(welfare, weight, popshare = 0.5, # fast sort (or not) if (!assume_sorted) { - o <- if (requireNamespace("data.table", quietly = TRUE)) { - data.table::forder(welfare, na.last = FALSE) - } else { - order(welfare, na.last = NA, method = "radix") - } - y <- welfare[o]; w <- weight[o] + o <- data.table:::forder(welfare) + y <- welfare[o] + w <- weight[o] } else { - y <- welfare; w <- weight + y <- welfare + w <- weight + o <- seq_along(welfare) } if (method == "interp") { # collapse::fquantile: weighted linear interpolation - # - 'sorted' tells fquantile that 'y' is already sorted - # - 'w' supplies weights # 'include' is not used here: interpolation doesn't have that discrete toggle - return(collapse::fquantile(y, probs = p, w = w, sorted = TRUE)) + return(fquantile(y, + probs = p, + w = w, + o = o, + names = FALSE)) } # ---- method == "nearest" (matches your function) ---- # cumulative weight fractions - W <- collapse::fsum(w) + W <- fsum(w) if (W <= 0) stop("sum(weight) must be > 0") - cw <- collapse::fcumsum(w) + cw <- fcumsum(w) prob <- cw / W n <- length(y) # for each p, find the nearest cumulative location (ties -> lower index) j <- findInterval(p, prob, left.open = FALSE) # j ∈ {0..n} - j[j < 0L] <- 0L; j[j > n] <- n + j[j < 0L] <- 0L # fit into boundries of vector + j[j > n] <- n prev_idx <- pmax.int(j, 1L) # 1..n next_idx <- pmin.int(j + 1L, n) # 1..n d_prev <- p - prob[prev_idx] d_next <- prob[next_idx] - p use_next <- (d_next < d_prev) & (j < n) - idx <- ifelse(use_next, next_idx, prev_idx) # final index + idx <- fifelse(use_next, next_idx, prev_idx) # final index if (!include) { # take the discrete value at the nearest location return(y[idx]) } else { - # average the two neighbors using their weights (your original rule) + # average the two neighbors using their weights (the original rule in wbpip) idx2 <- pmin.int(idx + 1L, n) wi <- w[idx] wi2 <- w[idx2] s <- wi + wi2 num <- wi * y[idx] + wi2 * y[idx2] - out <- ifelse((idx == n) | (s <= 0), y[idx], num / s) + out <- fifelse((idx == n) | (s <= 0), y[idx], num / s) return(out) } } diff --git a/R/rg_pip.R b/R/rg_pip.R index 4cdf0181..905677b0 100644 --- a/R/rg_pip.R +++ b/R/rg_pip.R @@ -19,6 +19,7 @@ rg_pip <- function(country, data_dir <- lkup$data_root # povline is set to NULL if popshare is given if (!is.null(popshare)) povline <- NULL + if (is.list(povline)) povline <- unlist(povline) cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb") @@ -57,19 +58,28 @@ rg_pip <- function(country, # Calculate and update poverty line if popshare is passed if (!is.null(popshare)) { povline <- lapply(lt, \(x) { - wbpip:::md_infer_poverty_line(x$welfare, x$weight, popshare) + # wbpip:::md_infer_poverty_line(x$welfare, x$weight, popshare) + infer_poverty_line(welfare = x$welfare, + weight = x$weight, + popshare = popshare, + include = FALSE, + method = "nearest", + assume_sorted = TRUE) }) } - # parallelization - # res <- get_pov_estimates(lt, povline = povline) - - # Regular lapply - res <- lapply(lt, process_dt, povline = povline) + # if popshare is not null, povline will be list + if (is.list(povline)) { + # If povline is list, it comes from infer_poverty_line when popshare + # is not null. Then, we have one set of lines per survey. + res <- Map(process_dt, lt, povline) + } else { + # if povline is vector, it should be applied to all surveys in lt + res <- lapply(lt, process_dt, povline = povline) + } res <- rbindlist(res, fill = TRUE) - # clean data metadata[, file := basename(path)] diff --git a/R/utils.R b/R/utils.R index 88011ff4..9c09d1a2 100644 --- a/R/utils.R +++ b/R/utils.R @@ -20,12 +20,6 @@ subset_lkup <- function(country, popshare = NULL ) { - if (!is.null(popshare)) { - return(list(data_present_in_master = NULL, - lkup = lkup, - povline = povline)) - } - # STEP 1 - Keep every row by default keep <- rep(TRUE, nrow(lkup)) # STEP 2 - Select countries @@ -53,6 +47,12 @@ subset_lkup <- function(country, lkup <- lkup[keep, ] + if (!is.null(popshare)) { + return(list(data_present_in_master = NULL, + lkup = lkup, + povline = NULL)) + } + # Return with grace return_if_exists(slkup = lkup, povline = povline, From 90deb51beec635181c41cc51bce1bc9abeb2b36b Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Tue, 16 Sep 2025 11:29:14 -0400 Subject: [PATCH 135/203] add prosperity gapto ui_cp_charts --- R/ui_country_profile.R | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/R/ui_country_profile.R b/R/ui_country_profile.R index 20fe7dac..75fa6102 100644 --- a/R/ui_country_profile.R +++ b/R/ui_country_profile.R @@ -118,8 +118,16 @@ ui_cp_charts <- function(country = "AGO", x[country_code == country] }) - dl <- list(append(dl, dl2)) - names(dl) <- country + # Add prosperity gap + pg <- get_aux_table(data_dir = lkup$data_root, + "pg_svy") + + pg <- pg[country_code == country] + dl2[["pg"]] <- pg + + + dl <- list(append(dl, dl2)) |> + setNames(country) return(dl) } From 4bf15af783a3f4d70a6b5320c6a904522c8476b3 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Tue, 16 Sep 2025 11:40:39 -0400 Subject: [PATCH 136/203] allow to select regions from any variable. this must be changed once we allow many other aggregations --- R/utils.R | 80 +++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 57 insertions(+), 23 deletions(-) diff --git a/R/utils.R b/R/utils.R index 9c09d1a2..7c42de11 100644 --- a/R/utils.R +++ b/R/utils.R @@ -20,46 +20,68 @@ subset_lkup <- function(country, popshare = NULL ) { + + lkup <- lkup_filter(lkup, + country, + year, + valid_regions, + reporting_level, + welfare_type, + data_dir) + # If povline is NULL, this happens when popshare is passed + # i.e popshare is not NULL + if (is.null(povline)) { + return(list( + data_present_in_master = NULL, + lkup = lkup, + povline = NULL + )) + } + # Return with grace + return_if_exists( + slkup = lkup, + povline = povline, + cache_file_path = cache_file_path, + fill_gaps = fill_gaps + ) +} + + +#' @keywords internal +lkup_filter <- function(lkup, + country, + year, + valid_regions, + reporting_level, + welfare_type, + data_dir) { # STEP 1 - Keep every row by default keep <- rep(TRUE, nrow(lkup)) # STEP 2 - Select countries keep <- select_country(lkup, keep, country, valid_regions) # STEP 3 - Select years - keep <- select_years(lkup = lkup, - keep = keep, - year = year, - country = country, - data_dir = data_dir, + keep <- select_years(lkup = lkup, + keep = keep, + year = year, + country = country, + data_dir = data_dir, valid_regions = valid_regions) - # # step 4. Select MRV - # keep <- select_MRV(lkup, keep, year, country, valid_regions, data_dir) - # STEP 4 - Select welfare_type if (welfare_type[1] != "all") { keep <- keep & lkup$welfare_type == welfare_type } # STEP 5 - Select reporting_level - keep <- select_reporting_level(lkup = lkup, - keep = keep, + keep <- select_reporting_level(lkup = lkup, + keep = keep, reporting_level = reporting_level[1]) lkup <- lkup[keep, ] - - if (!is.null(popshare)) { - return(list(data_present_in_master = NULL, - lkup = lkup, - povline = NULL)) - } - - # Return with grace - return_if_exists(slkup = lkup, - povline = povline, - cache_file_path = cache_file_path, - fill_gaps = fill_gaps) + return(lkup) } + #' select_country #' Helper function for subset_lkup() #' @inheritParams subset_lkup @@ -72,7 +94,19 @@ select_country <- function(lkup, keep, country, valid_regions) { # Select regions if (any(country %in% valid_regions)) { selected_regions <- country[country %in% valid_regions] - keep_regions <- lkup$region_code %in% selected_regions + # Find all columns ending with _code + code_cols <- grep("_code$", names(lkup), value = TRUE) + code_cols <- code_cols[!code_cols %in% "wb_region_code"] # Temporary solution + # For each code column, check if any value matches selected_regions + keep_regions_list <- lapply(code_cols, \(col) { + lkup[[col]] %in% selected_regions + }) + # Combine with logical OR across all code columns + if (length(keep_regions_list) > 0) { + keep_regions <- Reduce(`|`, keep_regions_list) + } else { + keep_regions <- rep(FALSE, nrow(lkup)) + } } else { keep_regions <- rep(FALSE, length(lkup$country_code)) } From 916780cc944a45980e535b637dc6b9f8e7add4b7 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Tue, 16 Sep 2025 11:43:44 -0400 Subject: [PATCH 137/203] fix bug --- R/pip_grp_new.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pip_grp_new.R b/R/pip_grp_new.R index ecd7ce22..efa25bb8 100644 --- a/R/pip_grp_new.R +++ b/R/pip_grp_new.R @@ -23,7 +23,7 @@ pip_grp_new <- \(country = "ALL", # Select countries to estimate poverty cts <- copy(lkup$aux_files$country_list) - country_code <- if (country != "ALL") { + country_code <- if (!"ALL" %in% country) { get_country_code_subset(dt = cts, country = country) } else { "ALL" From bd65c2ee84ae221a5310235cd9dc04fe70051e30 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Tue, 16 Sep 2025 12:01:38 -0400 Subject: [PATCH 138/203] fix some tests --- tests/testthat/test-fg_pip-local.R | 27 +++++++++++---------------- tests/testthat/test-pip-local.R | 10 ++++++++++ tests/testthat/test-utils.R | 15 ++++++++++----- 3 files changed, 31 insertions(+), 21 deletions(-) diff --git a/tests/testthat/test-fg_pip-local.R b/tests/testthat/test-fg_pip-local.R index 13a56c84..b73460ca 100644 --- a/tests/testthat/test-fg_pip-local.R +++ b/tests/testthat/test-fg_pip-local.R @@ -30,8 +30,7 @@ test_that("Imputation is working for extrapolated aggregated distribution", { welfare_type = "all", reporting_level = "all", ppp = NULL, - lkup = lkup, - con = con + lkup = lkup ) expect_equal(nrow(tmp$main_data), 0) @@ -44,8 +43,7 @@ test_that("Imputation is working for extrapolated aggregated distribution", { welfare_type = "all", reporting_level = "national", ppp = NULL, - lkup = lkup, - con = con + lkup = lkup ) expect_equal(nrow(tmp$main_data), 0) @@ -61,8 +59,7 @@ test_that("Imputation is working for interpolated mixed distribution", { welfare_type = "all", reporting_level = "all", ppp = NULL, - lkup = lkup, - con = con + lkup = lkup ) expect_equal(nrow(tmp$main_data), 0) @@ -75,8 +72,7 @@ test_that("Imputation is working for interpolated mixed distribution", { welfare_type = "all", reporting_level = "national", ppp = NULL, - lkup = lkup, - con = con + lkup = lkup ) expect_equal(nrow(tmp$main_data), 0) @@ -91,11 +87,11 @@ test_that("Imputation is working for interpolated aggregate distribution", { welfare_type = "all", reporting_level = "all", ppp = NULL, - lkup = lkup, - con = con + lkup = lkup ) - expect_equal(nrow(tmp$main_data), 2) + expect_equal(nrow(tmp$main_data), 0) + expect_equal(nrow(tmp$data_in_cache), 2) tmp <- fg_pip( country = "CHN", @@ -105,11 +101,11 @@ test_that("Imputation is working for interpolated aggregate distribution", { welfare_type = "all", reporting_level = "national", ppp = NULL, - lkup = lkup, - con = con + lkup = lkup ) - expect_equal(nrow(tmp$main_data), 2) + expect_equal(nrow(tmp$main_data), 0) + expect_equal(nrow(tmp$data_in_cache), 2) }) @@ -158,8 +154,7 @@ tmp <- fg_pip( welfare_type = "all", reporting_level = "all", ppp = NULL, - lkup = lkup, - con = con + lkup = lkup ) tmp <- tmp$data_in_cache |> as.data.table() # dt <- pip(country = "ALL", diff --git a/tests/testthat/test-pip-local.R b/tests/testthat/test-pip-local.R index 59e8ac89..43cf92ce 100644 --- a/tests/testthat/test-pip-local.R +++ b/tests/testthat/test-pip-local.R @@ -841,3 +841,13 @@ test_that("SPL is the same by reporting level", { }) + +test_that("make sure popshare bug no which was reported no longer exists", { + out <- pip(country = "USA", year = 2022, + popshare = .5, lkup = lkup) + # Ensure poverty line is not the default one + expect_false(out$poverty_line %in% c(1.9, 3)) + # Ensure headcount is closer to 0.5 + expect_equal(out$headcount, 0.5, tolerance = .05) +}) + diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index 7b2b53a4..da735a34 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -66,7 +66,8 @@ test_that("subset_lkup correctly selects all countries", { reporting_level = "all", lkup = ref_lkup, valid_regions = valid_regions, - data_dir = data_dir) + data_dir = data_dir, + povline = NULL) expect_equal(nrow(tmp$lkup), nrow(ref_lkup)) }) @@ -79,7 +80,8 @@ test_that("subset_lkup correctly selects countries", { reporting_level = "all", lkup = ref_lkup, valid_regions = valid_regions, - data_dir = data_dir) + data_dir = data_dir, + povline = NULL) expect_equal(sort(unique(tmp$lkup$country_code)), sort(selection)) }) @@ -92,7 +94,8 @@ test_that("subset_lkup correctly selects single regions", { reporting_level = "all", lkup = ref_lkup, valid_regions = valid_regions, - data_dir = data_dir) + data_dir = data_dir, + povline = NULL) expect_equal(sort(unique(tmp$lkup$region_code)), sort(selection)) }) @@ -105,7 +108,8 @@ test_that("subset_lkup correctly selects multiple regions", { reporting_level = "all", lkup = ref_lkup, valid_regions = valid_regions, - data_dir = data_dir) + data_dir = data_dir, + povline = NULL) expect_equal(sort(unique(tmp$lkup$region_code)), sort(selection)) }) @@ -122,7 +126,8 @@ test_that("subset_lkup correctly selects countries and regions", { reporting_level = "all", lkup = ref_lkup, valid_regions = valid_regions, - data_dir = data_dir) + data_dir = data_dir, + povline = NULL) # Regions are selected expect_true(all(region_selection %in% (unique(tmp$lkup$region_code)))) From 449cf1592c079e58932ceeed086f5c8c39cfc59d Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Tue, 16 Sep 2025 12:09:56 -0400 Subject: [PATCH 139/203] add documentation --- DESCRIPTION | 2 +- NEWS.md | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 2e8242b1..69cf9b62 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: pipapi Title: API for the Poverty and Inequality Platform -Version: 1.3.19.9004 +Version: 1.3.24 Authors@R: c(person(given = "Tony", family = "Fujs", diff --git a/NEWS.md b/NEWS.md index 50441fb5..d13ef03c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,28 @@ # pipapi (development version) +# pipapi 1.3.24 +* Incorporate Lineup distribution and Countries with Missing Data (CMD) distributions +* New way to estimate poverty using cumulative sums +* allow multiple `popshare` + + +# pipapi 1.3.23 +* Add Venn diagram information to Country profiles +* Add prosperity gap to country profiles chart. + + +# pipapi 1.3.22 + +* Fix issue with popshare in fill gaps calls. + +# pipapi 1.3.21 +* Fix issue with popshare in survey year calls +* Fix problem with alternative aggregates like AFW and AFE +* Make sure all tests pass +* add logs in docker container + +# pipapi 1.3.20 + # pipapi 1.3.19 * fix issue with comparability From ed575a6689b22acb8a264b54962f36ecef0509d7 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Tue, 16 Sep 2025 12:11:01 -0400 Subject: [PATCH 140/203] Increment version number to 1.3.24.9000 --- DESCRIPTION | 2 +- NEWS.md | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 69cf9b62..0c170952 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: pipapi Title: API for the Poverty and Inequality Platform -Version: 1.3.24 +Version: 1.3.24.9000 Authors@R: c(person(given = "Tony", family = "Fujs", diff --git a/NEWS.md b/NEWS.md index d13ef03c..4808fc19 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,6 @@ # pipapi (development version) +* add new way to infer poverty lines + # pipapi 1.3.24 * Incorporate Lineup distribution and Countries with Missing Data (CMD) distributions From 549152b9e06779419912d691b394d94569e951e2 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Tue, 16 Sep 2025 12:29:43 -0400 Subject: [PATCH 141/203] remove filter_Lkup from fg_pip() --- R/fg_pip.R | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/R/fg_pip.R b/R/fg_pip.R index b0761839..ad28bf60 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -18,16 +18,13 @@ fg_pip <- function(country, valid_regions <- lkup$query_controls$region$values interpolation_list <- lkup$interpolation_list data_dir <- lkup$data_root - refy_lkup <- lkup$refy_lkup # cleaned refy table, unique by country-years but some columns removed in order to do that + refy_lkup <- lkup$refy_lkup # cleaned refy table, unique by country- + + #povline is set to NULL if popshare is given if (!is.null(popshare)) povline <- NULL + if (is.list(povline)) povline <- unlist(povline) cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb") - # fg_pip is called from multiple places like pip, pip_grp_logic. We have connection object created - # when calling from `pip`. For other functions we create it here. - # if (is.null(con)) { - # cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb") - # con <- duckdb::dbConnect(duckdb::duckdb(), dbdir = cache_file_path, read_only = TRUE) - # } # Handle interpolation metadata <- subset_lkup( @@ -46,10 +43,7 @@ fg_pip <- function(country, data_present_in_master <- metadata$data_present_in_master povline <- metadata$povline metadata <- metadata$lkup - # Remove aggregate distribution if popshare is specified - # TEMPORARY FIX UNTIL popshare is supported for aggregate distributions - metadata <- filter_lkup(metadata = metadata, - popshare = popshare) + setDT(metadata) # Return empty dataframe if no metadata is found (i.e. all in cache) From a0b8eb3f7fa09dfe2b4218c6dec5c4c7d59afec1 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Tue, 16 Sep 2025 12:51:19 -0400 Subject: [PATCH 142/203] add multiple popshare for fillgaps --- R/compute_fgt_new.R | 7 +++-- R/fg_pip.R | 62 ++++++++++++++++++++++++++++++++------------- 2 files changed, 49 insertions(+), 20 deletions(-) diff --git a/R/compute_fgt_new.R b/R/compute_fgt_new.R index c6d52d39..6c68b139 100644 --- a/R/compute_fgt_new.R +++ b/R/compute_fgt_new.R @@ -210,9 +210,12 @@ map_fgt <- \(lt, l_rl_rows, povline) { rbindlist(fill = TRUE) } -process_dt <- function(dt, povline, mean_and_med = FALSE) { +process_dt <- function(dt, povline, + mean_and_med = FALSE, + id_var = "file") { + byvars <- c(id_var, "reporting_level") dt[, compute_fgt_dt(.SD, "welfare", "weight", povline, mean_and_med), - by = .(file, reporting_level)] + by = byvars] } #' load survey year files and store them in a list diff --git a/R/fg_pip.R b/R/fg_pip.R index ad28bf60..ccebf59a 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -42,9 +42,8 @@ fg_pip <- function(country, data_present_in_master <- metadata$data_present_in_master povline <- metadata$povline - metadata <- metadata$lkup - - setDT(metadata) + metadata <- metadata$lkup |> + setDT() # Return empty dataframe if no metadata is found (i.e. all in cache) if (nrow(metadata) == 0) { @@ -62,22 +61,49 @@ fg_pip <- function(country, # Load all survey data files into a named list of data.tables, each with an id column. lfst <- load_list_refy(input_list = full_list) + # Calculate and update poverty line if popshare is passed + # YES. this is INEFFICIENT because welfare cumsum is already created in + # the data, but we don't have time... FIX for the next release + if (!is.null(popshare)) { + povline <- lapply(lfst, \(x) { + # wbpip:::md_infer_poverty_line(x$welfare, x$weight, popshare) + infer_poverty_line(welfare = x$welfare, + weight = x$weight, + popshare = popshare, + include = FALSE, + method = "nearest", + assume_sorted = TRUE) + }) + + fgt <- Map(process_dt, lfst, povline, id_var = "id") |> + rbindlist(fill = TRUE) + + fgt[, `:=`( + country_code = gsub("(.+)(_.+)", "\\1", id), + reporting_year = as.integer(gsub("(.+_)(.+)", "\\2", id)) + )][, + id := NULL] + + + } else { + # Combine all loaded surveys into a single data.table, encode group identifiers, + # and create a GRP object for efficient grouping. + LDTg <- format_lfst(lfst = lfst, + dict = dict) + + # Compute the total population (sum of weights) for each group (id_rl) in + # the combined survey data. + tpop <- get_total_pop(LDTg = LDTg) + + # Compute FGT and Watts indices for all groups and poverty lines, then decode + # integer codes back to (country_code, reporting_year, reporting_level). + fgt <- fgt_cumsum(LDTg = LDTg, + tpop = tpop, + povline = povline) |> + decode_pairs(dict = dict) + + } - # Combine all loaded surveys into a single data.table, encode group identifiers, - # and create a GRP object for efficient grouping. - LDTg <- format_lfst(lfst = lfst, - dict = dict) - - # Compute the total population (sum of weights) for each group (id_rl) in - # the combined survey data. - tpop <- get_total_pop(LDTg = LDTg) - - # Compute FGT and Watts indices for all groups and poverty lines, then decode - # integer codes back to (country_code, reporting_year, reporting_level). - fgt <- fgt_cumsum(LDTg = LDTg, - tpop = tpop, - povline = povline) |> - decode_pairs(dict = dict) # Add just mean and median res <- get_mean_median(fgt, lkup, fill_gaps = TRUE) From 57c642d12d128dc239b34b144819145346ff769f Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Tue, 16 Sep 2025 16:51:10 -0400 Subject: [PATCH 143/203] update empty data --- data-raw/data.R | 2 +- data/empty_response_fg.rda | Bin 670 -> 735 bytes data/empty_response_fg_add.rda | Bin 0 -> 754 bytes 3 files changed, 1 insertion(+), 1 deletion(-) create mode 100644 data/empty_response_fg_add.rda diff --git a/data-raw/data.R b/data-raw/data.R index a0a6a16f..2a87ea42 100644 --- a/data-raw/data.R +++ b/data-raw/data.R @@ -44,7 +44,7 @@ fg <- fg_pip( popshare = NULL, lkup = lkup ) |> - rbindlist() + rbindlist(fill = TRUE) empty_response_fg <- fg[-1] diff --git a/data/empty_response_fg.rda b/data/empty_response_fg.rda index 6785534c663fcb70b60f3c44a565efdf30383c72..31da4426fd98a9165dd7372c46428379c7e6115a 100644 GIT binary patch literal 735 zcmV<50wDcDT4*^jL0KkKSuA~}L;wR7f589$|9}7iIsl9SP(Z)u-=IJM00B?~z5oCK z0}TxT00E!?000a$GynhwfB*mhFwoEd02%-Q006^7KmY(}00000sU*mNjWh~+GM)(& zX)-+GI-e{9BN2CAN_@tUGTHfPwDRaGF?pSb!K}qJ&JWUu=QCzS+Y#j*s+?+C-}SF z%DY>B=4*VHG;C<*UB-A&o-qQVB&v#vf+cv{o}|43DtyIlJE{ye{LUgv|Y;PZB@;=4D3?3pKW$C8%NyXeVgWu7f;iWpN3s%WIbm@J;m zrly6LX1dC1F1tlGsYLfSEl-9|hPR`=n(fx4!-ABhDYb=@f&ZQSi7AsckE_LHIpbn| z+8c|Xu14}Mr}Et9GCxQ5*4o~1ELle~%1&lU+V*Tlr&$^dI+~70O*xctQg6D-b*!@M zD9v}v5BkyA*2!`6Pn$xSCUQ>VcCoq1gkaK2O&NN&w-Z}5j%=Nd=Tv-Otodw48lI~= zS(is!tiqgB^mLtx(3H%6PA)GN!q3C8(3eL}w~CoK6jF*i9}ib4tl*=dojpDY(%xuu z%M(dP<_{&^OEyI{zqX|P)M0Z=8<{CW#VIJ=gQNA>k%;@PB^GAXM)ah?7Rg6>=2;j^ zr7V`B%^Eq|m@Y26Cz-*>?|7W!&A6oAm76y@J)g7lZ%30v(Z%k=hDpSuYo^RC-rm@! z(Y5C8QD!`vjjhFPbT=m3rV&Zlk*K#tS@TcJ2X^x6GviF0uZI@(A48z%Z)!Fq-A*gX zX9pwXjFW?Krr9^fgVY~nU literal 670 zcmV;P0%83^T4*^jL0KkKS+{~~%m4zVe}Mo0|9}7iIsgm+P(Z)u-=F{iPy=2C0009? z6GjtEj7&@klT1xC#zJ9$0B8UVng9R|0e}E#01TP{01W|v0B8UVng9R|0926#&?X@X z;XTwHV zWo=O;`O$x$#_qjuIy zT*;#+qX$Ips6S{VsSzroqNI@(#bJ7o6+Zt<$S)B@ltoYsK!J=1BF&mi@8Y_p540X? zkr0$ZB?*ZFK_F2N0nJNfVDD2S2YcMPLlrS=Q#B?kVp%;Gnwgeh$%4viFksEGC#kJ! zS?bjLJD9L+wI(iwDN0jCCqd%w+>)1KvEb&ioTEhlnU{Tclf|*bx$=D(wYHQ}jteO| z7j`__BRAz4j9%uWr>)R(Q+5j}?^$LElLu#uH|WuJq1v6S%3U&%>`vA@Tk%FFa#Ci$ zn`t#MNaJ@@dt0pixT6+NHJyx%6RpteejO*hIVClZbG@kGQaIw~rQ1p=M_bwE*l2B0 zz?^vZCl=!~gt0S}U~^r`W|2?3B>xm()@NeKqLh?t!0P@qNsp4!QD*dRN-zs*QQCY< zBN0i37G&hwz|n&R+|CW|*CEZYq}r8>7bJST9nGllW|_Y&{F8}B$5omw&AF{l%bJa> zJ}k{`g><{@mqjPRBO={ZXJUM@JGK}o%atzN_Hb@d;Bq-PH5$@wrxp@f!M)gvrv)%# zab2O2*G)r1r7Y1Rj`vY8Qj;W#_`2V!&+WX;nvLtMr3vy7+Z6OSAe)xB~(cJ(j&glOQL+e zmUe7ohtAr`50w)CKYhf9DoGC~ljzB1KB_W$Eb>!p&GlV^nN;=UepfUp*3pYalkmxO z$xVwHEV6&h9&UyWZQ3$+urq9C;?;|$ev~JKh$qhfuTniwCo&BS1*&7+BP;QF<;w;T;uE~ARdPAjK#^4g7atc=DlQ&G=p z#Yak$b_*%ivdk7ylLu}ulQ&~p><6(ujLKayl6y(q$46Eu#Lh}g*ZelLCbmf(H+DU) zxb(x;bUSvVCb!{db29{Wy3ADMr`@#dPNb$o*6DP-8w)(0&ZNN{^}H0jfkh~zvFQ9d z4QB;7snkw<{1X=AGnBD2lxW2FuzHqkifesZllf7L13MZnr3)0KqfAb(+hRsB_gYFU z$*7IVNr@J^j`Q^_j8g=qlGIt7Hztfs1`HQ?oEzQm1Ca53n~M$US+jK`^7y;k(c;ZB za%*|Aai$5#qerUD7X2IBRQNQ0uX2kq@oF}-D{HBuZNV``Cu&Bb+-lC`?8NtOIk8yI z?p9#n^m1=e@Hw2@+Kq`fQBUSKoh!AO9O Date: Tue, 16 Sep 2025 16:51:29 -0400 Subject: [PATCH 144/203] document --- man/infer_poverty_line.Rd | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/man/infer_poverty_line.Rd b/man/infer_poverty_line.Rd index 3344c543..e00f00ef 100644 --- a/man/infer_poverty_line.Rd +++ b/man/infer_poverty_line.Rd @@ -10,28 +10,37 @@ infer_poverty_line( popshare = 0.5, include = FALSE, method = c("nearest", "interp"), - assume_sorted = FALSE + assume_sorted = TRUE ) } \arguments{ -\item{welfare}{Numeric vector of welfare values (e.g., income or consumption).} +\item{welfare}{Numeric vector of welfare values (e.g., income or +consumption).} -\item{weight}{Numeric vector of sampling weights (must be non-negative, same length as welfare).} +\item{weight}{Numeric vector of sampling weights (must be non-negative, same +length as welfare).} -\item{popshare}{Numeric vector of population shares (probabilities in \link{0,1}); default is 0.5 (median).} +\item{popshare}{Numeric vector of population shares (probabilities in \link{0,1}); +default is 0.5 (median).} -\item{include}{Logical; if TRUE, averages neighbors for ties (only for method = "nearest").} +\item{include}{Logical; if TRUE, averages neighbors for ties (only for method += "nearest").} -\item{method}{Character; either "nearest" (default, discrete quantile) or "interp" (weighted linear interpolation).} +\item{method}{Character; either "nearest" (default, discrete quantile) or +"interp" (weighted linear interpolation).} -\item{assume_sorted}{Logical; if TRUE, assumes welfare and weight are already sorted by welfare.} +\item{assume_sorted}{Logical; if TRUE, assumes welfare and weight are already +sorted by welfare.} } \value{ -Numeric vector of poverty line(s) corresponding to the requested population share(s). +Numeric vector of poverty line(s) corresponding to the requested +population share(s). } \description{ -Computes the welfare value (poverty line) corresponding to a given population share, using either nearest or interpolated weighted quantile methods. -Supports both discrete (nearest) and linear interpolation approaches, and can optionally average neighbors for ties. +Computes the welfare value (poverty line) corresponding to a given population +share, using either nearest or interpolated weighted quantile methods. +Supports both discrete (nearest) and linear interpolation approaches, and can +optionally average neighbors for ties. } \details{ \itemize{ From 27deea46b4fb80617c46621a7f8e75cb4b84864d Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Tue, 16 Sep 2025 17:16:40 -0400 Subject: [PATCH 145/203] fix bug --- R/pip_new_lineups.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/pip_new_lineups.R b/R/pip_new_lineups.R index 7838e3a1..f9610cae 100644 --- a/R/pip_new_lineups.R +++ b/R/pip_new_lineups.R @@ -315,7 +315,9 @@ treat_cache_and_main <- \(out, cache_file_path, rowbind(cached_data) } - pl <- get_from_pipapienv("pl_to_store") + pl <- get_from_pipapienv("pl_to_store") + povline <- main_data[, poverty_line] |> + unique() # Only update master file if poverty line is part of this pl list # Using round to avoid precision error with decimals if (all(round(povline, 2) %in% round(pl, 2))) { From 81691c6d8a833610db0cf058fa416976ed71972e Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Tue, 16 Sep 2025 17:16:48 -0400 Subject: [PATCH 146/203] Increment version number to 1.3.24.9001 --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 0c170952..1c7b2be4 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: pipapi Title: API for the Poverty and Inequality Platform -Version: 1.3.24.9000 +Version: 1.3.24.9001 Authors@R: c(person(given = "Tony", family = "Fujs", From 0f59379cba13ca4abcacb972111f16e520dba233 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Tue, 16 Sep 2025 21:44:11 -0400 Subject: [PATCH 147/203] sort for surveys with more than 1 reporting_level --- R/fg_pip.R | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/R/fg_pip.R b/R/fg_pip.R index ccebf59a..bcb049df 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -67,12 +67,18 @@ fg_pip <- function(country, if (!is.null(popshare)) { povline <- lapply(lfst, \(x) { # wbpip:::md_infer_poverty_line(x$welfare, x$weight, popshare) + uni_rl <- funique(x$reporting_level) + if (length(uni_rl) > 1) { + assume_sorted <- FALSE + } else { + assume_sorted <- TRUE + } infer_poverty_line(welfare = x$welfare, weight = x$weight, popshare = popshare, include = FALSE, method = "nearest", - assume_sorted = TRUE) + assume_sorted = assume_sorted) }) fgt <- Map(process_dt, lfst, povline, id_var = "id") |> From c3da472a31dec11a7cc8b793d2e5096cefb848e8 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Tue, 16 Sep 2025 21:44:22 -0400 Subject: [PATCH 148/203] Increment version number to 1.3.24.9002 --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 1c7b2be4..757b624c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: pipapi Title: API for the Poverty and Inequality Platform -Version: 1.3.24.9001 +Version: 1.3.24.9002 Authors@R: c(person(given = "Tony", family = "Fujs", From 3985f5a26995766d4811e7896b463d49cb480849 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Tue, 16 Sep 2025 22:15:41 -0400 Subject: [PATCH 149/203] remove verbose in join --- R/fgt_cumsum.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/fgt_cumsum.R b/R/fgt_cumsum.R index d34c3ecd..8128b64f 100644 --- a/R/fgt_cumsum.R +++ b/R/fgt_cumsum.R @@ -101,7 +101,8 @@ fgt_cumsum <- \(LDTg, tpop, povline, on = "id_rl", how = "left", validate = "m:1", - drop.dup.cols = "y") |> + drop.dup.cols = "y", + verbose = 0) |> setorder(id_rl, index) From 2503a9379fae549771a106bc8501ab2da60915e7 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Tue, 16 Sep 2025 22:15:50 -0400 Subject: [PATCH 150/203] Increment version number to 1.3.24.9003 --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 757b624c..3683c1c2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: pipapi Title: API for the Poverty and Inequality Platform -Version: 1.3.24.9002 +Version: 1.3.24.9003 Authors@R: c(person(given = "Tony", family = "Fujs", From 247caa5f086e2d043d24dac26706a17ee1a245ad Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Wed, 17 Sep 2025 16:39:17 -0400 Subject: [PATCH 151/203] make use of missing data file instead of recreate i --- R/create_lkups.R | 48 ++++++++++++++---------------------------------- 1 file changed, 14 insertions(+), 34 deletions(-) diff --git a/R/create_lkups.R b/R/create_lkups.R index 6b42381d..9d9c9aa7 100644 --- a/R/create_lkups.R +++ b/R/create_lkups.R @@ -214,42 +214,19 @@ create_lkups <- function(data_dir, versions) { ncountries <- nrow(country_list) ly <- lineup_years$lineup_years + cmd <- fs::path(data_dir, + "_aux/missing_data.fst") |> + fst::read_fst(as.data.table = TRUE) |> + fselect(country_code, + reporting_year = year, + welfare_type) - # Add reporting year - cmd <- CJ(country_list$country_code, - ly, c("national", "urban", "rural")) - - setnames(cmd, new = c('country_code', - 'reporting_year', - 'reporting_level')) - - cmd <- joyn::joyn(x = cmd, - y = refy_lkup, - by = c('country_code', - 'reporting_year', - 'reporting_level'), - keep = "anti", - # reportvar = FALSE, - match_type = "1:1") |> - setDT() - - # get number or reporing levels left after anti join - # if less than three, it should NOT be CMD (e.g., ARG or CHN) - cmd[, n := .N, - by = c("country_code", "reporting_year")] - - # Delete unnecessary reporting levels - cmd <- cmd[(reporting_level == "national" & - .joyn == "x" & n == 3) - ] # build some variables cmd[, `:=`( - cache_id = paste(country_code, reporting_year,"NOSVY_D1_CON_CMD", - sep = "_"), survey_coverage = "national", - welfare_type = "consumption", + reporting_level = "national", distribution_type = "CMD distribution", is_interpolated = FALSE, is_used_for_line_up = TRUE, @@ -261,10 +238,13 @@ create_lkups <- function(data_dir, versions) { relative_distance = 1, lineup_approach = "CMD", mult_factor = 1, - .joyn = NULL, - n = NULL - - )] + wt_code = toupper(substr(welfare_type, 1, 3)) + )][, + cache_id := paste(country_code, + reporting_year, + paste0("NOSVY_D1_", wt_code,"_CMD"), + sep = "_") + ][, wt_code := NULL] # Append lineup and CMD info From 7e3cdb8b74969b24e5584a5477421bad07c63ce8 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Thu, 18 Sep 2025 13:01:24 -0400 Subject: [PATCH 152/203] Increment version number to 1.3.24.9004 --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 3683c1c2..b35e67d7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: pipapi Title: API for the Poverty and Inequality Platform -Version: 1.3.24.9003 +Version: 1.3.24.9004 Authors@R: c(person(given = "Tony", family = "Fujs", From f04612b93ff7b0843cd6d4c9caa7f3b4ece5412b Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Thu, 18 Sep 2025 17:00:14 -0400 Subject: [PATCH 153/203] remove heavy objects and apply gc() --- R/fg_pip.R | 3 +++ R/pip_new_lineups.R | 3 ++- R/rg_pip.R | 2 ++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/R/fg_pip.R b/R/fg_pip.R index bcb049df..f58d4890 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -108,7 +108,10 @@ fg_pip <- function(country, povline = povline) |> decode_pairs(dict = dict) + rm(LDTg) } + rm(lfst) + invisible(gc()) # Add just mean and median diff --git a/R/pip_new_lineups.R b/R/pip_new_lineups.R index f9610cae..22fea671 100644 --- a/R/pip_new_lineups.R +++ b/R/pip_new_lineups.R @@ -309,7 +309,7 @@ treat_cache_and_main <- \(out, cache_file_path, if (nrow(main_data) > 0) { if (is.null(cached_data)) { - out <- main_data + out <- copy(main_data) } else { out <- main_data |> rowbind(cached_data) @@ -323,6 +323,7 @@ treat_cache_and_main <- \(out, cache_file_path, if (all(round(povline, 2) %in% round(pl, 2))) { update_master_file(main_data, cache_file_path, fill_gaps) } + rm(main_data) } else { out <- cached_data diff --git a/R/rg_pip.R b/R/rg_pip.R index 905677b0..0b216e2b 100644 --- a/R/rg_pip.R +++ b/R/rg_pip.R @@ -77,6 +77,8 @@ rg_pip <- function(country, # if povline is vector, it should be applied to all surveys in lt res <- lapply(lt, process_dt, povline = povline) } + rm(lt) + invisible(gc()) res <- rbindlist(res, fill = TRUE) From 1b5447b4ef3f41819d0fafcc66d2179ef4765b28 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Thu, 18 Sep 2025 17:46:36 -0400 Subject: [PATCH 154/203] filter properly proverty lines to update master (inter cache) file --- R/duckdb_func.R | 28 +++++++++++++++++++++++++++- R/pip_new_lineups.R | 9 +-------- R/pip_old.R | 14 ++++---------- R/pip_old_lineups.R | 7 ------- 4 files changed, 32 insertions(+), 26 deletions(-) diff --git a/R/duckdb_func.R b/R/duckdb_func.R index bc9c89be..2d8647c6 100644 --- a/R/duckdb_func.R +++ b/R/duckdb_func.R @@ -206,9 +206,35 @@ return_if_exists <- function(slkup, update_master_file <- function(dat, cache_file_path, fill_gaps, - verbose = getOption("pipapi.verbose") + verbose = getOption("pipapi.verbose"), + decimal = 2 ) { + # select the right lines + pl <- get_from_pipapienv("pl_to_store") + + + # Keep only rows with <= 2 decimal places + to_keep <- get_vars(dat, "poverty_line") |> + reg_elem() |> # extract vectos + as.character() |> + sub("^[^.]*\\.?","", x = _) |> # get only the decimal part + (\(x) which(nchar(x) <= decimal))() + + dat <- dat[to_keep] + + povline <- dat[, poverty_line] |> + unique() + + # Keep only those that belong to the list + wpl <- povline[povline %in% round(pl, decimal)] + + if (length(wpl) == 0) return(invisible(FALSE)) + + dat <- dat[poverty_line %in% wpl] + + if (nrow(dat) == 0) return(invisible(FALSE)) + write_con <- connect_with_retry(cache_file_path, read_only = FALSE) if (fill_gaps) { diff --git a/R/pip_new_lineups.R b/R/pip_new_lineups.R index 22fea671..a7c51b3e 100644 --- a/R/pip_new_lineups.R +++ b/R/pip_new_lineups.R @@ -315,14 +315,7 @@ treat_cache_and_main <- \(out, cache_file_path, rowbind(cached_data) } - pl <- get_from_pipapienv("pl_to_store") - povline <- main_data[, poverty_line] |> - unique() - # Only update master file if poverty line is part of this pl list - # Using round to avoid precision error with decimals - if (all(round(povline, 2) %in% round(pl, 2))) { - update_master_file(main_data, cache_file_path, fill_gaps) - } + update_master_file(main_data, cache_file_path, fill_gaps) rm(main_data) } else { diff --git a/R/pip_old.R b/R/pip_old.R index 3378876d..e0194c20 100644 --- a/R/pip_old.R +++ b/R/pip_old.R @@ -141,18 +141,12 @@ pip_old <- function(country = "ALL", if (nrow(main_data) > 0) { out <- main_data |> - collapse::fmutate(path = as.character(path)) |> - collapse::rowbind(cached_data) + fmutate(path = as.character(path)) |> + rowbind(cached_data) # cached_data is NULL when we are querying live data in which case we don't update cache # This will be used only for development purpose and we don't have any intention to use it in production. - if(!is.null(cached_data)) { - # Update cache with data - pl <- get_from_pipapienv("pl_to_store") - # Only update master file if poverty line is part of this pl list - # Using round to avoid precision error with decimals - if (all(round(povline, 2) %in% round(pl, 2))) { - update_master_file(main_data, cache_file_path, fill_gaps) - } + if (!is.null(cached_data)) { + update_master_file(main_data, cache_file_path, fill_gaps) } } else { out <- cached_data diff --git a/R/pip_old_lineups.R b/R/pip_old_lineups.R index f7000674..1bb86dea 100644 --- a/R/pip_old_lineups.R +++ b/R/pip_old_lineups.R @@ -145,14 +145,7 @@ pip_old_lineups <- function(country = "ALL", out <- main_data |> rowbind(cached_data) - - pl <- get_from_pipapienv("pl_to_store") - # Only update master file if poverty line is part of this pl list - # Using round to avoid precision error with decimals - if (all(round(povline, 2) %in% round(pl, 2))) { update_master_file(main_data, cache_file_path, fill_gaps) - } - } else { out <- cached_data From c76bbd394b86a9c7f03973e7b27af5df30257c46 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Thu, 18 Sep 2025 18:30:27 -0400 Subject: [PATCH 155/203] Increment version number to 1.3.24.9005 --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index b35e67d7..a4b955d8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: pipapi Title: API for the Poverty and Inequality Platform -Version: 1.3.24.9004 +Version: 1.3.24.9005 Authors@R: c(person(given = "Tony", family = "Fujs", From 84ecf2877c5e44e7c6dd59631dc0821dc8b15544 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Fri, 19 Sep 2025 08:10:44 -0400 Subject: [PATCH 156/203] remove gc() --- R/duckdb_func.R | 8 ++++---- R/fg_pip.R | 2 +- R/rg_pip.R | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/R/duckdb_func.R b/R/duckdb_func.R index 2d8647c6..12b47e80 100644 --- a/R/duckdb_func.R +++ b/R/duckdb_func.R @@ -291,7 +291,7 @@ update_master_file <- function(dat, ")) duckdb::dbDisconnect(write_con) - invisible(gc()) + if (nr > 0 && verbose) message(glue("{target_file} is updated.")) return(nr) @@ -362,7 +362,7 @@ reset_cache <- function(pass = Sys.getenv('PIP_CACHE_LOCAL_KEY'), DBI::dbExecute(write_con, "DELETE from fg_master_file") } duckdb::dbDisconnect(write_con) - invisible(gc()) + } create_duckdb_file <- function(cache_file_path) { @@ -388,7 +388,7 @@ create_duckdb_file <- function(cache_file_path) { watts DOUBLE )") DBI::dbDisconnect(con) - invisible(gc()) + } @@ -423,7 +423,7 @@ load_inter_cache <- function(lkup = NULL, # connection object if it is not closed More details here # https://app.clickup.com/t/868cdpe3q duckdb::dbDisconnect(con) - invisible(gc()) + setDT(master_file) } diff --git a/R/fg_pip.R b/R/fg_pip.R index f58d4890..726a0779 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -111,7 +111,7 @@ fg_pip <- function(country, rm(LDTg) } rm(lfst) - invisible(gc()) + # Add just mean and median diff --git a/R/rg_pip.R b/R/rg_pip.R index 0b216e2b..a1cce53c 100644 --- a/R/rg_pip.R +++ b/R/rg_pip.R @@ -78,7 +78,7 @@ rg_pip <- function(country, res <- lapply(lt, process_dt, povline = povline) } rm(lt) - invisible(gc()) + res <- rbindlist(res, fill = TRUE) From db6fcd871d297d840791c09792b4c93aaf0c7f33 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Fri, 19 Sep 2025 08:29:22 -0400 Subject: [PATCH 157/203] Increment version number to 1.3.24.9006 --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index a4b955d8..f552ad3f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: pipapi Title: API for the Poverty and Inequality Platform -Version: 1.3.24.9005 +Version: 1.3.24.9006 Authors@R: c(person(given = "Tony", family = "Fujs", From 847fb20dc5d1e2bdae0896de81d467d46f5730d7 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Fri, 19 Sep 2025 08:56:14 -0400 Subject: [PATCH 158/203] fix bug req$args to req$argsQuery --- inst/TMP/TMP_duckdb_cache.R | 59 ++++++++++++++++++++++++++----------- inst/plumber/v1/endpoints.R | 6 ++-- 2 files changed, 44 insertions(+), 21 deletions(-) diff --git a/inst/TMP/TMP_duckdb_cache.R b/inst/TMP/TMP_duckdb_cache.R index e37562d9..daae5baf 100644 --- a/inst/TMP/TMP_duckdb_cache.R +++ b/inst/TMP/TMP_duckdb_cache.R @@ -1,38 +1,61 @@ +# SETUP ------------- + devtools::load_all(".") +library(fastverse) force <- FALSE -if (!"lkups" %in% ls() || isTRUE(force)) { - data_dir <- Sys.getenv("PIPAPI_DATA_ROOT_FOLDER_LOCAL") |> - fs::path() - fs::dir_ls(data_dir, recurse = FALSE) -} +root_dir <- Sys.getenv("PIPAPI_DATA_ROOT_FOLDER_LOCAL") |> + fs::path() +fs::dir_ls(root_dir, recurse = FALSE) +# latest_version <- - pipapi:::available_versions(data_dir) |> + available_versions(root_dir) |> max() -latest_version <- NULL -latest_version <- "20240627_2017_01_02_PROD" -lkups <- create_versioned_lkups(data_dir, +# lkups <- create_versioned_lkups(root_dir, +# vintage_pattern = "^202509.+2017.+(PROD)$") +lkups <- create_versioned_lkups(root_dir, vintage_pattern = latest_version) -lkup <- lkups$versions_paths[[lkups$latest_release]] +# lkup <- lkups$versions_paths[[lkups$versions[[2]]]] +ver_to_use <- lkups$latest_release # this is important. You need this object below +lkup <- lkups$versions_paths[[ver_to_use]] -reset_cache(lkup = lkup) +# DEGUB ------------- -# 1. -pip(country = "all", year = 2000, lkup = lkup) -# 2. -pip(country = "AGO", year = 2000, lkup = lkup) +options(pipapi.query_live_data = FALSE) +getOption("pipapi.query_live_data") +reset_cache(lkup = lkup) -pip(country = "all", year = "all", lkup = lkup) +tictoc::tic() +sv <- pip(country = "ALL", + year = "ALL", + povline = lkup$pl_lkup$poverty_line, + lkup = lkup, + fill_gaps = FALSE) +tictoc::toc() -pip(country = "IND", year = 2018, lkup = lkup) +tictoc::tic() +fg <- pip(country = "ALL", + year = "ALL", + povline = lkup$pl_lkup$poverty_line, + lkup = lkup, + fill_gaps = TRUE) +tictoc::toc() -pip(country = "IND", year = "all", lkup = lkup) + +# copy cache to TFS folder +ori_cache <- fs::path(lkup$data_root, "cache.duckdb") +dest_cache <- Sys.getenv("PIPAPI_DATA_ROOT_FOLDER_SERVER") |> + fs::path(ver_to_use, "cache.duckdb") + +if (fs::file_exists(ori_cache)) { + fs::file_copy(ori_cache, dest_cache, overwrite = TRUE) +} diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R index e7345ecb..575c9a08 100644 --- a/inst/plumber/v1/endpoints.R +++ b/inst/plumber/v1/endpoints.R @@ -299,7 +299,7 @@ function(req, res) { params <- req$argsQuery res$serializer <- pipapi::assign_serializer(format = params$format) - if (is.null(req$args$table)) { + if (is.null(req$argsQuery$table)) { # return all available tables if none selected list_of_tables <- lkups$versions_paths[[params$version]]$aux_tables out <- data.frame(tables = list_of_tables) @@ -796,7 +796,7 @@ function(req, res) { params <- req$argsQuery exclude <- req$argsQuery$exclude res$serializer <- pipapi::assign_serializer(format = params$format) - if (is.null(req$args$table)) { + if (is.null(req$argsQuery$table)) { # return all available tables if none selected list_of_tables <- lkups$versions_paths[[params$version]]$aux_tables out <- data.frame(tables = list_of_tables) @@ -808,7 +808,7 @@ function(req, res) { params$exclude <- NULL out <- do.call(pipapi::get_aux_table_ui, params) - if (req$args$table == "countries" && exclude == TRUE) { + if (req$argsQuery$table == "countries" && exclude == TRUE) { # hardcoded to_remove <- c("MDG", "UKR") out <- out[!(country_code %in% to_remove)] From 3b433c50e302a8134b8dabb3f0bb64fa1bf780e6 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Fri, 19 Sep 2025 08:56:24 -0400 Subject: [PATCH 159/203] Increment version number to 1.3.24.9007 --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index f552ad3f..9d167253 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: pipapi Title: API for the Poverty and Inequality Platform -Version: 1.3.24.9006 +Version: 1.3.24.9007 Authors@R: c(person(given = "Tony", family = "Fujs", From 5afa3d51da9c39cf9a4b775b407edd04716763d7 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Fri, 19 Sep 2025 10:11:01 -0400 Subject: [PATCH 160/203] handle errors --- R/zzz.R | 8 ++++---- inst/TMP/TMP_duckdb_cache.R | 7 +++---- inst/plumber/v1/plumber.R | 14 ++++++++++++-- 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/R/zzz.R b/R/zzz.R index 302841c5..0f481c3d 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -37,10 +37,10 @@ pipapi_default_options <- list( # set multi threats - available_cores <- parallel::detectCores() - 1 - - cores_to_use <- max(available_cores, 1) |> - min(8) + # available_cores <- parallel::detectCores() - 1 + # + # cores_to_use <- max(available_cores, 1) |> + # min(8) set_in_pipapienv("cores_to_use", cores_to_use) diff --git a/inst/TMP/TMP_duckdb_cache.R b/inst/TMP/TMP_duckdb_cache.R index daae5baf..b1e7137a 100644 --- a/inst/TMP/TMP_duckdb_cache.R +++ b/inst/TMP/TMP_duckdb_cache.R @@ -2,7 +2,6 @@ devtools::load_all(".") library(fastverse) -force <- FALSE root_dir <- Sys.getenv("PIPAPI_DATA_ROOT_FOLDER_LOCAL") |> fs::path() @@ -13,10 +12,10 @@ latest_version <- available_versions(root_dir) |> max() -# lkups <- create_versioned_lkups(root_dir, -# vintage_pattern = "^202509.+2017.+(PROD)$") lkups <- create_versioned_lkups(root_dir, - vintage_pattern = latest_version) + vintage_pattern = "^202509.+2017.+(PROD)$") +# lkups <- create_versioned_lkups(root_dir, +# vintage_pattern = latest_version) # lkup <- lkups$versions_paths[[lkups$versions[[2]]]] ver_to_use <- lkups$latest_release # this is important. You need this object below diff --git a/inst/plumber/v1/plumber.R b/inst/plumber/v1/plumber.R index c5441da4..32056e79 100644 --- a/inst/plumber/v1/plumber.R +++ b/inst/plumber/v1/plumber.R @@ -29,10 +29,20 @@ plumber::pr(endpoints_path) |> # log_info('Bye bye: {proc.time()[["elapsed"]]}') }) |> plumber::pr_set_error(function(req, res, err) { - # In case of error, make sure you log the endpoint for #432 + # Log error with request info method <- req$REQUEST_METHOD path <- req$PATH_INFO - cat(sprintf("ERROR at %s %s: %s\n", method, path, err$message)) + msg <- sprintf("ERROR at %s %s: %s\n", method, path, err$message) + cat(msg) + # Always return a JSON error response + res$status <- 500 + res$body <- jsonlite::toJSON(list( + error = "Internal Server Error", + message = err$message, + path = path, + method = method + ), auto_unbox = TRUE) + res }) |> # Set API spec plumber::pr_set_api_spec(api = function(spec) { From 86a66da541797d049fda580c287756b5490ba24d Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Fri, 19 Sep 2025 10:16:14 -0400 Subject: [PATCH 161/203] add some tryCatch in some endpoints --- inst/plumber/v1/endpoints.R | 42 +++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R index 575c9a08..00e5f112 100644 --- a/inst/plumber/v1/endpoints.R +++ b/inst/plumber/v1/endpoints.R @@ -248,15 +248,20 @@ function(req, res) { #* Default is FALSE function(req, res) { - # Process request - params <- req$argsQuery - params$lkup <- lkups$versions_paths[[params$version]] - res$serializer <- pipapi::assign_serializer(format = params$format) - params$format <- NULL - params$version <- NULL + # Defensive error handling for pip endpoint + tryCatch({ + params <- req$argsQuery + params$lkup <- lkups$versions_paths[[params$version]] + res$serializer <- pipapi::assign_serializer(format = params$format) + params$format <- NULL + params$version <- NULL - out <- do.call(pipapi::pip, params) - out + out <- do.call(pipapi::pip, params) + out + }, error = function(e) { + res$status <- 500 + list(error = "Error in /api/v1/pip", message = e$message) + }) } ### pip-grp ---------- @@ -275,15 +280,20 @@ function(req, res) { #* @param additional_ind:[bool] Additional indicators based on standard PIP output. #* Default is FALSE function(req, res) { - # Process request - params <- req$argsQuery - params$lkup <- lkups$versions_paths[[params$version]] - res$serializer <- pipapi::assign_serializer(format = params$format) - params$format <- NULL - params$version <- NULL + # Defensive error handling for pip-grp endpoint + tryCatch({ + params <- req$argsQuery + params$lkup <- lkups$versions_paths[[params$version]] + res$serializer <- pipapi::assign_serializer(format = params$format) + params$format <- NULL + params$version <- NULL - out <- do.call(pipapi::pip_agg, params) - out + out <- do.call(pipapi::pip_agg, params) + out + }, error = function(e) { + res$status <- 500 + list(error = "Error in /api/v1/pip-grp", message = e$message) + }) } ### aux ------------------ From 71afe001eccbe418a5b24c726b03cae51322987e Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Fri, 19 Sep 2025 11:02:00 -0400 Subject: [PATCH 162/203] Add a tiny request-ID + timings filter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit every request prints one access line; you also get a serialize line. If you later see “request started” (client side) but no access log, the hang is inside handler/filters. If access prints but no serialize, the hang is in serialization --- R/zzz.R | 2 +- inst/plumber/v1/endpoints.R | 42 +++++++++++++++++++++++++++++++++++++ man/update_master_file.Rd | 3 ++- 3 files changed, 45 insertions(+), 2 deletions(-) diff --git a/R/zzz.R b/R/zzz.R index 0f481c3d..094f9dda 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -41,7 +41,7 @@ pipapi_default_options <- list( # # cores_to_use <- max(available_cores, 1) |> # min(8) - set_in_pipapienv("cores_to_use", cores_to_use) + # set_in_pipapienv("cores_to_use", cores_to_use) # pov lines to store diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R index 00e5f112..a8bafbb5 100644 --- a/inst/plumber/v1/endpoints.R +++ b/inst/plumber/v1/endpoints.R @@ -3,6 +3,48 @@ #* pip.worldbank.org library(pipapi) +# ---- tiny telemetry (ID + timings) -------------------------------------- +`%||%` <- function(a, b) if (is.null(a)) b else a +.req_id <- function() paste0(as.integer((Sys.time() |> as.POSIXct()) * 1e3), "-", sample.int(1e9,1)) +.now <- function() proc.time()[["elapsed"]] + +#* Always-on request context (one access line per request) +#* @filter ctx +function(req, res) { + req$.id <- .req_id() + req$.start <- .now() + req$.path <- req$PATH_INFO %||% "" + req$.meth <- req$REQUEST_METHOD %||% "" + res$setHeader("X-Request-ID", req$.id) + + # mark serialization time + req$.ser0 <- NA_real_ + on.exit({ + total <- .now() - req$.start + cat(sprintf( + '{"type":"access","id":"%s","method":"%s","path":"%s","status":%s,"dur_s":%.6f}\n', + req$.id, req$.meth, req$.path, as.character(res$status %||% NA_integer_), total + ), file = stderr()) + }, add = TRUE) + + forward() +} + +# plumber-wide preserialize/postserialize hooks (stay in this file) +#* @plumber +function(pr) { + pr |> + pr_hook("preserialize", function(req, res) req$.ser0 <- .now()) |> + pr_hook("postserialize", function(req, res) { + if (!is.na(req$.ser0)) { + ser <- .now() - req$.ser0 + cat(sprintf( + '{"type":"serialize","id":"%s","path":"%s","dur_s":%.6f}\n', + req$.id, req$PATH_INFO, ser + ), file = stderr()) + } + }) +} # API filters ------------------------------------------------------------- ## Validate version parameter ---- diff --git a/man/update_master_file.Rd b/man/update_master_file.Rd index 39b1d3b9..968d3de4 100644 --- a/man/update_master_file.Rd +++ b/man/update_master_file.Rd @@ -8,7 +8,8 @@ update_master_file( dat, cache_file_path, fill_gaps, - verbose = getOption("pipapi.verbose") + verbose = getOption("pipapi.verbose"), + decimal = 2 ) } \arguments{ From fd91f928218d87cbdc3df1f99c8e55cc53069fe0 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Fri, 19 Sep 2025 12:07:32 -0400 Subject: [PATCH 163/203] add Request timed out to pip and pip-grp endpoints --- inst/plumber/v1/endpoints.R | 60 ++++++++++++++++++++++++++++++++----- 1 file changed, 52 insertions(+), 8 deletions(-) diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R index a8bafbb5..ae32d142 100644 --- a/inst/plumber/v1/endpoints.R +++ b/inst/plumber/v1/endpoints.R @@ -21,10 +21,17 @@ function(req, res) { req$.ser0 <- NA_real_ on.exit({ total <- .now() - req$.start - cat(sprintf( - '{"type":"access","id":"%s","method":"%s","path":"%s","status":%s,"dur_s":%.6f}\n', - req$.id, req$.meth, req$.path, as.character(res$status %||% NA_integer_), total - ), file = stderr()) + cat( + sprintf( + '{"type":"access","id":"%s","method":"%s","path":"%s","status":%s,"dur_s":%.6f}\n', + req$.id, + req$.meth, + req$.path, + as.character(res$status %||% NA_integer_), + total + ), + file = stderr() + ) }, add = TRUE) forward() @@ -46,6 +53,17 @@ function(pr) { }) } + +# ---- kill runaway requests ---------------------------------------------- +with_req_timeout <- function(expr, + secs = as.numeric(Sys.getenv("PLUMBER_REQ_TIMEOUT","30"))) { + if (!is.finite(secs) || secs <= 0) return(force(expr)) + R.utils::withTimeout(force(expr), timeout = secs, onTimeout = "silent") # uses setTimeLimit() +} + + + + # API filters ------------------------------------------------------------- ## Validate version parameter ---- @@ -298,11 +316,24 @@ function(req, res) { params$format <- NULL params$version <- NULL - out <- do.call(pipapi::pip, params) + out <- do.call(pipapi::pip, params) |> + with_req_timeout() + if (is.null(out)) { + res$status <- 503 + return(list( + error = "Request timed out", + request_id = req$.id, # added in the ctx filter + endpoint = "/api/v1/pip" + )) + } + out }, error = function(e) { res$status <- 500 - list(error = "Error in /api/v1/pip", message = e$message) + list(error = "Error in /api/v1/pip", + message = e$message, + request_id = tryCatch(req$.id, error = function(.) NA) + ) }) } @@ -330,11 +361,24 @@ function(req, res) { params$format <- NULL params$version <- NULL - out <- do.call(pipapi::pip_agg, params) + out <- do.call(pipapi::pip_agg, params) |> + with_req_timeout() + if (is.null(out)) { + res$status <- 503 + return(list( + error = "Request timed out", + request_id = req$.id, # added in the ctx filter + endpoint = "/api/v1/pip-grp" + )) + } + out }, error = function(e) { res$status <- 500 - list(error = "Error in /api/v1/pip-grp", message = e$message) + list(error = "Error in /api/v1/pip-grp", + message = e$message, + request_id = tryCatch(req$.id, error = function(.) NA) + ) }) } From c495e0771bf5e0d5c7daaeac5e808cd22048748c Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Fri, 19 Sep 2025 12:12:05 -0400 Subject: [PATCH 164/203] implement timeout wrapper to more endpoint --- inst/plumber/v1/endpoints.R | 171 ++++++++++++++++++++++++++++-------- 1 file changed, 133 insertions(+), 38 deletions(-) diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R index ae32d142..8208a819 100644 --- a/inst/plumber/v1/endpoints.R +++ b/inst/plumber/v1/endpoints.R @@ -726,13 +726,28 @@ function(req) { #* @param ppp_version:[chr] ppp year to be used #* @param version:[chr] Data version. Defaults to most recent version. See api/v1/versions #* @serializer json -function(req) { - params <- req$argsQuery - params$lkup <- lkups$versions_paths[[req$argsQuery$version]] - params$version <- NULL - - do.call(pipapi::ui_hp_stacked, params) - +function(req, res) { + tryCatch({ + params <- req$argsQuery + params$lkup <- lkups$versions_paths[[req$argsQuery$version]] + params$version <- NULL + out <- do.call(pipapi::ui_hp_stacked, params) |> + with_req_timeout() + if (is.null(out)) { + res$status <- 503 + return(list( + error = "Request timed out", + request_id = req$.id, + endpoint = "/api/v1/hp-stacked" + )) + } + out + }, error = function(e) { + res$status <- 500 + list(error = "Error in /api/v1/hp-stacked", + message = e$message, + request_id = tryCatch(req$.id, error = function(.) NA)) + }) } ### hp-countries ------------- @@ -743,11 +758,28 @@ function(req) { #* @param ppp_version:[chr] ppp year to be used #* @param version:[chr] Data version. Defaults to most recent version. See api/v1/versions #* @serializer json -function(req) { - params <- req$argsQuery - params$lkup <- lkups$versions_paths[[req$argsQuery$version]] - params$version <- NULL - do.call(pipapi::ui_hp_countries, params) +function(req, res) { + tryCatch({ + params <- req$argsQuery + params$lkup <- lkups$versions_paths[[req$argsQuery$version]] + params$version <- NULL + out <- do.call(pipapi::ui_hp_countries, params) |> + with_req_timeout() + if (is.null(out)) { + res$status <- 503 + return(list( + error = "Request timed out", + request_id = req$.id, + endpoint = "/api/v1/hp-countries" + )) + } + out + }, error = function(e) { + res$status <- 500 + list(error = "Error in /api/v1/hp-countries", + message = e$message, + request_id = tryCatch(req$.id, error = function(.) NA)) + }) } @@ -767,14 +799,29 @@ function(req) { #* @param ppp_version:[chr] ppp year to be used #* @param version:[chr] Data version. Defaults to most recent version. See api/v1/versions #* @serializer json list(na = "null") -function(req) { - params <- req$argsQuery - params$lkup <- lkups$versions_paths[[req$argsQuery$version]] - params$version <- NULL - params$censor <- TRUE - - out <- do.call(pipapi::ui_pc_charts, params) - return(out) +function(req, res) { + tryCatch({ + params <- req$argsQuery + params$lkup <- lkups$versions_paths[[req$argsQuery$version]] + params$version <- NULL + params$censor <- TRUE + out <- do.call(pipapi::ui_pc_charts, params) |> + with_req_timeout() + if (is.null(out)) { + res$status <- 503 + return(list( + error = "Request timed out", + request_id = req$.id, + endpoint = "/api/v1/pc-charts" + )) + } + out + }, error = function(e) { + res$status <- 500 + list(error = "Error in /api/v1/pc-charts", + message = e$message, + request_id = tryCatch(req$.id, error = function(.) NA)) + }) } ### pc-download ----------- @@ -812,13 +859,28 @@ function(req) { #* @param ppp_version:[chr] ppp year to be used #* @param version:[chr] Data version. Defaults to most recent version. See api/v1/versions #* @serializer json -function(req) { - params <- req$argsQuery - params$lkup <- lkups$versions_paths[[req$argsQuery$version]] - params$version <- NULL - - do.call(pipapi::ui_pc_regional, params) - +function(req, res) { + tryCatch({ + params <- req$argsQuery + params$lkup <- lkups$versions_paths[[req$argsQuery$version]] + params$version <- NULL + out <- do.call(pipapi::ui_pc_regional, params) |> + with_req_timeout() + if (is.null(out)) { + res$status <- 503 + return(list( + error = "Request timed out", + request_id = req$.id, + endpoint = "/api/v1/pc-regional-aggregates" + )) + } + out + }, error = function(e) { + res$status <- 500 + list(error = "Error in /api/v1/pc-regional-aggregates", + message = e$message, + request_id = tryCatch(req$.id, error = function(.) NA)) + }) } @@ -833,11 +895,28 @@ function(req) { #* @param ppp_version:[chr] ppp year to be used #* @param version:[chr] Data version. Defaults to most recent version. See api/v1/versions #* @serializer json list(na="null") -function(req) { - params <- req$argsQuery - params$lkup <- lkups$versions_paths[[req$argsQuery$version]] - params$version <- NULL - do.call(pipapi::ui_cp_key_indicators, params) +function(req, res) { + tryCatch({ + params <- req$argsQuery + params$lkup <- lkups$versions_paths[[req$argsQuery$version]] + params$version <- NULL + out <- do.call(pipapi::ui_cp_key_indicators, params) |> + with_req_timeout() + if (is.null(out)) { + res$status <- 503 + return(list( + error = "Request timed out", + request_id = req$.id, + endpoint = "/api/v1/cp-key-indicators" + )) + } + out + }, error = function(e) { + res$status <- 500 + list(error = "Error in /api/v1/cp-key-indicators", + message = e$message, + request_id = tryCatch(req$.id, error = function(.) NA)) + }) } @@ -850,12 +929,28 @@ function(req) { #* @param ppp_version:[chr] ppp year to be used #* @param version:[chr] Data version. Defaults to most recent version. See api/v1/versions #* @serializer json -function(req) { - params <- req$argsQuery - params$lkup <- lkups$versions_paths[[req$argsQuery$version]] - - params$version <- NULL - do.call(pipapi::ui_cp_charts, params) +function(req, res) { + tryCatch({ + params <- req$argsQuery + params$lkup <- lkups$versions_paths[[req$argsQuery$version]] + params$version <- NULL + out <- do.call(pipapi::ui_cp_charts, params) |> + with_req_timeout() + if (is.null(out)) { + res$status <- 503 + return(list( + error = "Request timed out", + request_id = req$.id, + endpoint = "/api/v1/cp-charts" + )) + } + out + }, error = function(e) { + res$status <- 500 + list(error = "Error in /api/v1/cp-charts", + message = e$message, + request_id = tryCatch(req$.id, error = function(.) NA)) + }) } ### cp-download ----------- From 6fa1450bf5a6621deb7a0ed0fe37a1ad1707866f Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Fri, 19 Sep 2025 12:46:16 -0400 Subject: [PATCH 165/203] revamp plumber --- inst/plumber/v1/endpoints.R | 62 ++++++++++++++++++++++------ inst/plumber/v1/plumber.R | 80 ++++++++++++++++++------------------- 2 files changed, 89 insertions(+), 53 deletions(-) diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R index 8208a819..a780c921 100644 --- a/inst/plumber/v1/endpoints.R +++ b/inst/plumber/v1/endpoints.R @@ -4,9 +4,39 @@ library(pipapi) # ---- tiny telemetry (ID + timings) -------------------------------------- -`%||%` <- function(a, b) if (is.null(a)) b else a -.req_id <- function() paste0(as.integer((Sys.time() |> as.POSIXct()) * 1e3), "-", sample.int(1e9,1)) -.now <- function() proc.time()[["elapsed"]] +`%||%` <- function(a, b) { + if (is.null(a)) + b + else + a +} + +# Generate a unique request ID +.req_id <- function() { + # current time in milliseconds as a character string + ts_ms <- format(as.numeric(Sys.time()) * 1000, scientific = FALSE) + + # add random component + rnd <- sample.int(1e9, 1) + + # combine + id <- paste0(ts_ms, "-", rnd) + + return(id) +} + +# Capture current elapsed CPU time (since R started) +.now <- function() { + pt <- proc.time() + + # Extract elapsed time (in seconds) + elapsed <- pt[["elapsed"]] + + return(elapsed) +} + + + #* Always-on request context (one access line per request) #* @filter ctx @@ -55,10 +85,18 @@ function(pr) { # ---- kill runaway requests ---------------------------------------------- -with_req_timeout <- function(expr, - secs = as.numeric(Sys.getenv("PLUMBER_REQ_TIMEOUT","30"))) { - if (!is.finite(secs) || secs <= 0) return(force(expr)) - R.utils::withTimeout(force(expr), timeout = secs, onTimeout = "silent") # uses setTimeLimit() +with_req_timeout <- + function(expr, + secs = Sys.getenv("PLUMBER_REQ_TIMEOUT", "150") |> + as.numeric() + ){ + + if (!is.finite(secs) || secs <= 0) + return(force(expr)) + + R.utils::withTimeout(force(expr), + timeout = secs, + onTimeout = "silent") # uses setTimeLimit() } @@ -731,7 +769,7 @@ function(req, res) { params <- req$argsQuery params$lkup <- lkups$versions_paths[[req$argsQuery$version]] params$version <- NULL - out <- do.call(pipapi::ui_hp_stacked, params) |> + out <- do.call(pipapi::ui_hp_stacked, params) |> with_req_timeout() if (is.null(out)) { res$status <- 503 @@ -763,7 +801,7 @@ function(req, res) { params <- req$argsQuery params$lkup <- lkups$versions_paths[[req$argsQuery$version]] params$version <- NULL - out <- do.call(pipapi::ui_hp_countries, params) |> + out <- do.call(pipapi::ui_hp_countries, params) |> with_req_timeout() if (is.null(out)) { res$status <- 503 @@ -864,7 +902,7 @@ function(req, res) { params <- req$argsQuery params$lkup <- lkups$versions_paths[[req$argsQuery$version]] params$version <- NULL - out <- do.call(pipapi::ui_pc_regional, params) |> + out <- do.call(pipapi::ui_pc_regional, params) |> with_req_timeout() if (is.null(out)) { res$status <- 503 @@ -900,7 +938,7 @@ function(req, res) { params <- req$argsQuery params$lkup <- lkups$versions_paths[[req$argsQuery$version]] params$version <- NULL - out <- do.call(pipapi::ui_cp_key_indicators, params) |> + out <- do.call(pipapi::ui_cp_key_indicators, params) |> with_req_timeout() if (is.null(out)) { res$status <- 503 @@ -934,7 +972,7 @@ function(req, res) { params <- req$argsQuery params$lkup <- lkups$versions_paths[[req$argsQuery$version]] params$version <- NULL - out <- do.call(pipapi::ui_cp_charts, params) |> + out <- do.call(pipapi::ui_cp_charts, params) |> with_req_timeout() if (is.null(out)) { res$status <- 503 diff --git a/inst/plumber/v1/plumber.R b/inst/plumber/v1/plumber.R index 32056e79..118e9617 100644 --- a/inst/plumber/v1/plumber.R +++ b/inst/plumber/v1/plumber.R @@ -1,54 +1,52 @@ +# ---- process-level thread caps (Step 3) ----------------------------- +# Avoid oversubscribing on multi-core servers (common cause of hangs under load) +Sys.setenv( + OPENBLAS_NUM_THREADS = "1", + MKL_NUM_THREADS = "1", + OMP_NUM_THREADS = "1" +) + +ncores <- parallel::detectCores(logical = FALSE) + +data.table::setDTthreads(max(1L, ncores)) +collapse::set_collapse(nthreads = max(1L, ncores)) +fst::threads_fst(max(1L, ncores)) + +# ---- plumber setup --------------------------------------------------- library(plumber) endpoints_path <- system.file("plumber/v1/endpoints.R", package = "pipapi") -api_spec_path <- system.file("plumber/v1/openapi.yaml", package = "pipapi") -# convert_empty <- pipapi:::convert_empty - -plumber::pr(endpoints_path) |> - # pre-route log - plumber::pr_hook("preroute", function() { - # log_separator() - # tictoc::tic("route") # Start timer for log info - }) |> - # post-route log - plumber::pr_hook("postroute", function(req, res) { - # end_route <- tictoc::toc(quiet = TRUE) - # log_info('route: {convert_empty(req$REMOTE_ADDR)} {convert_empty(req$REQUEST_METHOD)} {convert_empty(req$PATH_INFO)} {convert_empty(req$QUERY_STRING)} {convert_empty(res$status)} {round(end_route$toc - end_route$tic, digits = getOption("digits", 6))}') - }) |> - # pre-serialization log - plumber::pr_hook("preserialize", function() { - # tictoc::tic("serialize") - }) |> - # post-serialization log - plumber::pr_hook("postserialize", function(req) { - # end_serial <- tictoc::toc(quiet = TRUE) - # log_info('serialize: {convert_empty(req$PATH_INFO)} {round(end_serial$toc - end_serial$tic, digits = getOption("digits", 6))}') - # log_separator() - }) |> - plumber::pr_hook("exit", function() { - # log_info('Bye bye: {proc.time()[["elapsed"]]}') - }) |> +api_spec_path <- system.file("plumber/v1/openapi.yaml", package = "pipapi") + +pr <- plumber::pr(endpoints_path) |> + # (Step 4) simplified error handler plumber::pr_set_error(function(req, res, err) { - # Log error with request info + rid <- tryCatch(req$.id, error = function(e) NA) method <- req$REQUEST_METHOD - path <- req$PATH_INFO - msg <- sprintf("ERROR at %s %s: %s\n", method, path, err$message) - cat(msg) - # Always return a JSON error response + path <- req$PATH_INFO + + # structured log to stderr (good for docker logs) + cat(sprintf( + '{"level":"error","id":"%s","method":"%s","path":"%s","msg":%s}\n', + as.character(rid), method, path, + jsonlite::toJSON(err$message, auto_unbox = TRUE) + ), file = stderr()) + res$status <- 500 res$body <- jsonlite::toJSON(list( - error = "Internal Server Error", - message = err$message, - path = path, - method = method + error = "Internal Server Error", + message = err$message, + path = path, + method = method, + request_id = rid ), auto_unbox = TRUE) res }) |> - # Set API spec + # Inject version into OpenAPI spec plumber::pr_set_api_spec(api = function(spec) { - spec$info$version <- utils::packageVersion("pipapi") |> - as.character() + spec$info$version <- as.character(utils::packageVersion("pipapi")) spec }) |> - plumber::pr_set_api_spec( - yaml::read_yaml(api_spec_path)) + plumber::pr_set_api_spec(yaml::read_yaml(api_spec_path)) + +pr From 912a0a707b0c4be1ba1c88e39781ecf3d23b90cc Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Fri, 19 Sep 2025 14:10:43 -0400 Subject: [PATCH 166/203] new plumber setup. --- inst/plumber/v1/endpoints.R | 91 ++++++++++++------------ inst/plumber/v1/plumber.R | 135 ++++++++++++++++++++++++++++-------- 2 files changed, 153 insertions(+), 73 deletions(-) diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R index a780c921..dde6835f 100644 --- a/inst/plumber/v1/endpoints.R +++ b/inst/plumber/v1/endpoints.R @@ -4,86 +4,91 @@ library(pipapi) # ---- tiny telemetry (ID + timings) -------------------------------------- -`%||%` <- function(a, b) { - if (is.null(a)) - b - else - a +# Monotonic wall time for durations (in seconds) +.now <- function() { + pt <- proc.time() + elapsed <- pt[["elapsed"]] + return(elapsed) } -# Generate a unique request ID +# Generate a request id AND return a decoded structure +# - id_raw: "milliseconds-since-epoch-random" +# - timestamp: POSIXct in UTC +# - random: integer .req_id <- function() { - # current time in milliseconds as a character string - ts_ms <- format(as.numeric(Sys.time()) * 1000, scientific = FALSE) + # 1) milliseconds since epoch (as numeric -> character; avoid 32-bit overflow) + ts_ms_num <- as.numeric(Sys.time()) * 1000 + ts_ms_chr <- format(ts_ms_num, scientific = FALSE, trim = TRUE) - # add random component + # 2) random component rnd <- sample.int(1e9, 1) - # combine - id <- paste0(ts_ms, "-", rnd) - - return(id) -} - -# Capture current elapsed CPU time (since R started) -.now <- function() { - pt <- proc.time() + # 3) build id string + id_raw <- paste0(ts_ms_chr, "-", rnd) - # Extract elapsed time (in seconds) - elapsed <- pt[["elapsed"]] + # 4) decoded timestamp (POSIXct, UTC) + ts_posix <- as.POSIXct(as.numeric(ts_ms_chr) / 1000, + origin = "1970-01-01", tz = "UTC") - return(elapsed) + out <- list( + id_raw = id_raw, + timestamp = ts_posix, + random = rnd + ) + return(out) } +`%||%` <- function(a, b) if (!is.null(a)) a else b - - -#* Always-on request context (one access line per request) +# ---- Always-on request context (one log line per request) ---- #* @filter ctx function(req, res) { - req$.id <- .req_id() + rid <- .req_id() # <- returns list(id_raw, timestamp, random) + + # put a STRING into places that need strings + req$.id <- rid$id_raw # string token for logs & error bodies + res$setHeader("X-Request-ID", req$.id) + + # keep the decoded pieces if you want (optional) + req$.id_time <- rid$timestamp # POSIXct (UTC) + req$.id_rand <- rid$random # integer + req$.start <- .now() req$.path <- req$PATH_INFO %||% "" req$.meth <- req$REQUEST_METHOD %||% "" - res$setHeader("X-Request-ID", req$.id) - # mark serialization time - req$.ser0 <- NA_real_ on.exit({ total <- .now() - req$.start - cat( - sprintf( - '{"type":"access","id":"%s","method":"%s","path":"%s","status":%s,"dur_s":%.6f}\n', - req$.id, - req$.meth, - req$.path, - as.character(res$status %||% NA_integer_), - total - ), - file = stderr() - ) + cat(sprintf( + '{"type":"access","id":"%s","method":"%s","path":"%s","status":%s,"dur_s":%.6f}\n', + req$.id, req$.meth, req$.path, as.character(res$status %||% NA_integer_), total + ), file = stderr()) }, add = TRUE) forward() } -# plumber-wide preserialize/postserialize hooks (stay in this file) + +# ---- Serialization timing hooks (stay with the filter for coherence) ---- #* @plumber function(pr) { pr |> - pr_hook("preserialize", function(req, res) req$.ser0 <- .now()) |> + pr_hook("preserialize", function(req, res) { + req$.ser0 <- .now() + }) |> pr_hook("postserialize", function(req, res) { - if (!is.na(req$.ser0)) { + if (!is.null(req$.ser0) && !is.na(req$.ser0)) { ser <- .now() - req$.ser0 cat(sprintf( '{"type":"serialize","id":"%s","path":"%s","dur_s":%.6f}\n', - req$.id, req$PATH_INFO, ser + req$.id %||% "", req$.path %||% "", ser ), file = stderr()) } }) } + # ---- kill runaway requests ---------------------------------------------- with_req_timeout <- function(expr, diff --git a/inst/plumber/v1/plumber.R b/inst/plumber/v1/plumber.R index 118e9617..67a9ebf2 100644 --- a/inst/plumber/v1/plumber.R +++ b/inst/plumber/v1/plumber.R @@ -12,41 +12,116 @@ data.table::setDTthreads(max(1L, ncores)) collapse::set_collapse(nthreads = max(1L, ncores)) fst::threads_fst(max(1L, ncores)) -# ---- plumber setup --------------------------------------------------- +# ---- build router -------------------------------------------------------- library(plumber) endpoints_path <- system.file("plumber/v1/endpoints.R", package = "pipapi") -api_spec_path <- system.file("plumber/v1/openapi.yaml", package = "pipapi") +api_spec_path <- system.file("plumber/v1/openapi.yaml", package = "pipapi") pr <- plumber::pr(endpoints_path) |> - # (Step 4) simplified error handler - plumber::pr_set_error(function(req, res, err) { - rid <- tryCatch(req$.id, error = function(e) NA) - method <- req$REQUEST_METHOD - path <- req$PATH_INFO - - # structured log to stderr (good for docker logs) - cat(sprintf( - '{"level":"error","id":"%s","method":"%s","path":"%s","msg":%s}\n', - as.character(rid), method, path, - jsonlite::toJSON(err$message, auto_unbox = TRUE) - ), file = stderr()) - - res$status <- 500 - res$body <- jsonlite::toJSON(list( - error = "Internal Server Error", - message = err$message, - path = path, - method = method, - request_id = rid - ), auto_unbox = TRUE) - res - }) |> - # Inject version into OpenAPI spec - plumber::pr_set_api_spec(api = function(spec) { - spec$info$version <- as.character(utils::packageVersion("pipapi")) - spec - }) |> + + # ---- Pre-route: attach request id + start time ---- +plumber::pr_hook("preroute", function(req, res) { + rid <- .req_id() + + # store a STRING in req$.id and in header + req$.id <- rid$id_raw + req$.id_time <- rid$timestamp # optional metadata + req$.id_rand <- rid$random # optional metadata + res$setHeader("X-Request-ID", req$.id) + + # basic context + req$.start <- .now() + req$.path <- req$PATH_INFO %||% "" + req$.meth <- req$REQUEST_METHOD %||% "" + + # mark serialization time start + req$.ser0 <- NA_real_ + + forward() +}) |> + + # ---- Post-route: log route duration (handler time) ---- +plumber::pr_hook("postroute", function(req, res) { + if (!is.null(req$.start)) { + dur <- .now() - req$.start + cat( + sprintf( + '{"type":"route","id":"%s","method":"%s","path":"%s","status":%s,"dur_s":%.6f}\n', + req$.id %||% "", + req$.meth %||% "", + req$.path %||% "", + as.character(res$status %||% NA_integer_), + dur + ), + file = stderr() + ) + } +}) |> + + # ---- Pre-serialization: mark when we start serializing ---- +plumber::pr_hook("preserialize", function(req, res) { + req$.ser0 <- .now() +}) |> + + # ---- Post-serialization: log serialization duration ---- +plumber::pr_hook("postserialize", function(req, res) { + if (!is.null(req$.ser0) && !is.na(req$.ser0)) { + ser_dur <- .now() - req$.ser0 + cat( + sprintf( + '{"type":"serialize","id":"%s","path":"%s","dur_s":%.6f}\n', + req$.id %||% "", + req$.path %||% "", + ser_dur + ), + file = stderr() + ) + } +}) |> + + # ---- Exit hook: when process shuts down ---- +plumber::pr_hook("exit", function() { + cat( + sprintf( + '{"type":"exit","uptime_s":%.2f}\n', + proc.time()[["elapsed"]] + ), + file = stderr() + ) +}) |> + + # ---- Global error handler ---- +plumber::pr_set_error(function(req, res, err) { + method <- req$REQUEST_METHOD %||% "" + path <- req$PATH_INFO %||% "" + rid <- req$.id %||% "NA" + + cat( + sprintf( + '{"type":"error","id":"%s","method":"%s","path":"%s","msg":%s}\n', + rid, method, path, jsonlite::toJSON(err$message, auto_unbox = TRUE) + ), + file = stderr() + ) + + res$status <- 500 + res$body <- jsonlite::toJSON(list( + error = "Internal Server Error", + message = err$message, + path = path, + method = method, + request_id = rid + ), auto_unbox = TRUE) + + res +}) |> + + # ---- API Spec (with dynamic version injection) ---- +plumber::pr_set_api_spec(api = function(spec) { + spec$info$version <- as.character(utils::packageVersion("pipapi")) + spec +}) |> plumber::pr_set_api_spec(yaml::read_yaml(api_spec_path)) pr From d9a10a66c1d873dcae659a879830da4c1eb6aa47 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Fri, 19 Sep 2025 14:13:01 -0400 Subject: [PATCH 167/203] remove the preroute as it is in endpoints.R --- inst/plumber/v1/plumber.R | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/inst/plumber/v1/plumber.R b/inst/plumber/v1/plumber.R index 67a9ebf2..b1fea35f 100644 --- a/inst/plumber/v1/plumber.R +++ b/inst/plumber/v1/plumber.R @@ -20,27 +20,6 @@ api_spec_path <- system.file("plumber/v1/openapi.yaml", package = "pipapi") pr <- plumber::pr(endpoints_path) |> - # ---- Pre-route: attach request id + start time ---- -plumber::pr_hook("preroute", function(req, res) { - rid <- .req_id() - - # store a STRING in req$.id and in header - req$.id <- rid$id_raw - req$.id_time <- rid$timestamp # optional metadata - req$.id_rand <- rid$random # optional metadata - res$setHeader("X-Request-ID", req$.id) - - # basic context - req$.start <- .now() - req$.path <- req$PATH_INFO %||% "" - req$.meth <- req$REQUEST_METHOD %||% "" - - # mark serialization time start - req$.ser0 <- NA_real_ - - forward() -}) |> - # ---- Post-route: log route duration (handler time) ---- plumber::pr_hook("postroute", function(req, res) { if (!is.null(req$.start)) { From 7efee79b5adfd42df09854740c2bd625e57e0ae3 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Fri, 19 Sep 2025 14:13:57 -0400 Subject: [PATCH 168/203] Increment version number to 1.3.24.9008 --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 9d167253..2fd73a17 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: pipapi Title: API for the Poverty and Inequality Platform -Version: 1.3.24.9007 +Version: 1.3.24.9008 Authors@R: c(person(given = "Tony", family = "Fujs", From baa6be879efae3d7a392a58870f47829867d0297 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Fri, 19 Sep 2025 15:42:02 -0400 Subject: [PATCH 169/203] fix bugs --- inst/plumber/v1/endpoints.R | 67 ++++++++++++++----------------------- inst/plumber/v1/plumber.R | 44 +++++++----------------- 2 files changed, 38 insertions(+), 73 deletions(-) diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R index dde6835f..8135a822 100644 --- a/inst/plumber/v1/endpoints.R +++ b/inst/plumber/v1/endpoints.R @@ -3,12 +3,12 @@ #* pip.worldbank.org library(pipapi) + # ---- tiny telemetry (ID + timings) -------------------------------------- + # Monotonic wall time for durations (in seconds) .now <- function() { - pt <- proc.time() - elapsed <- pt[["elapsed"]] - return(elapsed) + proc.time()[["elapsed"]] } # Generate a request id AND return a decoded structure @@ -16,26 +16,17 @@ library(pipapi) # - timestamp: POSIXct in UTC # - random: integer .req_id <- function() { - # 1) milliseconds since epoch (as numeric -> character; avoid 32-bit overflow) ts_ms_num <- as.numeric(Sys.time()) * 1000 ts_ms_chr <- format(ts_ms_num, scientific = FALSE, trim = TRUE) + rnd <- sample.int(1e9, 1) + id_raw <- paste0(ts_ms_chr, "-", rnd) - # 2) random component - rnd <- sample.int(1e9, 1) - - # 3) build id string - id_raw <- paste0(ts_ms_chr, "-", rnd) - - # 4) decoded timestamp (POSIXct, UTC) - ts_posix <- as.POSIXct(as.numeric(ts_ms_chr) / 1000, - origin = "1970-01-01", tz = "UTC") - - out <- list( + list( id_raw = id_raw, - timestamp = ts_posix, + timestamp = as.POSIXct(as.numeric(ts_ms_chr) / 1000, + origin = "1970-01-01", tz = "UTC"), random = rnd ) - return(out) } `%||%` <- function(a, b) if (!is.null(a)) a else b @@ -43,15 +34,14 @@ library(pipapi) # ---- Always-on request context (one log line per request) ---- #* @filter ctx function(req, res) { - rid <- .req_id() # <- returns list(id_raw, timestamp, random) + rid <- .req_id() # list(id_raw, timestamp, random) - # put a STRING into places that need strings - req$.id <- rid$id_raw # string token for logs & error bodies + req$.id <- rid$id_raw res$setHeader("X-Request-ID", req$.id) - # keep the decoded pieces if you want (optional) - req$.id_time <- rid$timestamp # POSIXct (UTC) - req$.id_rand <- rid$random # integer + # keep decoded pieces if helpful + req$.id_time <- rid$timestamp + req$.id_rand <- rid$random req$.start <- .now() req$.path <- req$PATH_INFO %||% "" @@ -68,8 +58,7 @@ function(req, res) { forward() } - -# ---- Serialization timing hooks (stay with the filter for coherence) ---- +# ---- Serialization timing hooks (kept here to avoid duplication) ---- #* @plumber function(pr) { pr |> @@ -87,28 +76,24 @@ function(pr) { }) } - - -# ---- kill runaway requests ---------------------------------------------- +# ---- bounded execution helper ------------------------------------------- with_req_timeout <- function(expr, - secs = Sys.getenv("PLUMBER_REQ_TIMEOUT", "150") |> - as.numeric() - ){ - - if (!is.finite(secs) || secs <= 0) - return(force(expr)) - - R.utils::withTimeout(force(expr), - timeout = secs, - onTimeout = "silent") # uses setTimeLimit() -} - + secs = as.numeric(Sys.getenv("PLUMBER_REQ_TIMEOUT", "150"))) { + if (!is.finite(secs) || secs <= 0) return(force(expr)) + R.utils::withTimeout( + expr = force(expr), + timeout = secs, + onTimeout = "silent" # uses setTimeLimit() + ) + } +# ======================================================================== # API filters ------------------------------------------------------------- -## Validate version parameter ---- +# ======================================================================== + #* Ensure that version parameter is correct #* @filter validate_version diff --git a/inst/plumber/v1/plumber.R b/inst/plumber/v1/plumber.R index b1fea35f..0b36deab 100644 --- a/inst/plumber/v1/plumber.R +++ b/inst/plumber/v1/plumber.R @@ -1,5 +1,5 @@ -# ---- process-level thread caps (Step 3) ----------------------------- -# Avoid oversubscribing on multi-core servers (common cause of hangs under load) +# ---- process-level thread caps ------------------------------------------ +# (Avoid oversubscription; helps with stability under load) Sys.setenv( OPENBLAS_NUM_THREADS = "1", MKL_NUM_THREADS = "1", @@ -7,11 +7,14 @@ Sys.setenv( ) ncores <- parallel::detectCores(logical = FALSE) - data.table::setDTthreads(max(1L, ncores)) -collapse::set_collapse(nthreads = max(1L, ncores)) +collapse::set_collapse(nthreads = max(1L, ncores)) fst::threads_fst(max(1L, ncores)) +# local fallbacks used only in this file (do not depend on endpoints.R) +`%||%` <- function(a, b) if (!is.null(a)) a else b +.now_p <- function() proc.time()[["elapsed"]] + # ---- build router -------------------------------------------------------- library(plumber) @@ -20,10 +23,10 @@ api_spec_path <- system.file("plumber/v1/openapi.yaml", package = "pipapi") pr <- plumber::pr(endpoints_path) |> - # ---- Post-route: log route duration (handler time) ---- + # ---- Post-route: log handler duration (separate from total access time) ---- plumber::pr_hook("postroute", function(req, res) { if (!is.null(req$.start)) { - dur <- .now() - req$.start + dur <- .now_p() - req$.start cat( sprintf( '{"type":"route","id":"%s","method":"%s","path":"%s","status":%s,"dur_s":%.6f}\n', @@ -38,27 +41,6 @@ plumber::pr_hook("postroute", function(req, res) { } }) |> - # ---- Pre-serialization: mark when we start serializing ---- -plumber::pr_hook("preserialize", function(req, res) { - req$.ser0 <- .now() -}) |> - - # ---- Post-serialization: log serialization duration ---- -plumber::pr_hook("postserialize", function(req, res) { - if (!is.null(req$.ser0) && !is.na(req$.ser0)) { - ser_dur <- .now() - req$.ser0 - cat( - sprintf( - '{"type":"serialize","id":"%s","path":"%s","dur_s":%.6f}\n', - req$.id %||% "", - req$.path %||% "", - ser_dur - ), - file = stderr() - ) - } -}) |> - # ---- Exit hook: when process shuts down ---- plumber::pr_hook("exit", function() { cat( @@ -70,7 +52,7 @@ plumber::pr_hook("exit", function() { ) }) |> - # ---- Global error handler ---- + # ---- Global error handler (must return a serializable object) ---- plumber::pr_set_error(function(req, res, err) { method <- req$REQUEST_METHOD %||% "" path <- req$PATH_INFO %||% "" @@ -85,15 +67,13 @@ plumber::pr_set_error(function(req, res, err) { ) res$status <- 500 - res$body <- jsonlite::toJSON(list( + list( error = "Internal Server Error", message = err$message, path = path, method = method, request_id = rid - ), auto_unbox = TRUE) - - res + ) }) |> # ---- API Spec (with dynamic version injection) ---- From 0c16445e5a5a41bd3f9017c027c72ab83a72c087 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Fri, 19 Sep 2025 15:42:13 -0400 Subject: [PATCH 170/203] Increment version number to 1.3.24.9009 --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 2fd73a17..fd2143d2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: pipapi Title: API for the Poverty and Inequality Platform -Version: 1.3.24.9008 +Version: 1.3.24.9009 Authors@R: c(person(given = "Tony", family = "Fujs", From 980f47f13675b9f10fe52dcfa839c8335b7315b3 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Mon, 22 Sep 2025 11:59:00 -0400 Subject: [PATCH 171/203] add R.utils package --- DESCRIPTION | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index fd2143d2..a8267a9a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -71,7 +71,8 @@ Imports: duckdb, jsonlite, digest, - parallel + parallel, + R.utils Remotes: PIP-Technical-Team/wbpip@DEV Depends: From 011f4d3b52a7e3dfd123e37199ee3ada0c1ba825 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Mon, 22 Sep 2025 11:59:13 -0400 Subject: [PATCH 172/203] Increment version number to 1.3.24.9010 --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index a8267a9a..ef21f1b2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: pipapi Title: API for the Poverty and Inequality Platform -Version: 1.3.24.9009 +Version: 1.3.24.9010 Authors@R: c(person(given = "Tony", family = "Fujs", From 28fca6c5c8eec5a539747de912aa0e96f95d67fc Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Mon, 22 Sep 2025 18:37:19 -0400 Subject: [PATCH 173/203] add function safe_endpoint --- NAMESPACE | 1 + R/create_lkups.R | 28 ++++++++++-- R/utils-plumber.R | 91 +++++++++++++++++++++++++++++++++++++ inst/plumber/v1/endpoints.R | 2 +- man/safe_endpoint.Rd | 67 +++++++++++++++++++++++++++ 5 files changed, 184 insertions(+), 5 deletions(-) create mode 100644 man/safe_endpoint.Rd diff --git a/NAMESPACE b/NAMESPACE index 96b78000..c3c4bef4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -39,6 +39,7 @@ export(pip_old_lineups) export(pipgd_lorenz_curve) export(return_correct_version) export(return_if_exists) +export(safe_endpoint) export(select_off_alt_agg) export(select_reporting_level) export(select_user_aggs) diff --git a/R/create_lkups.R b/R/create_lkups.R index 9d9c9aa7..32c82d53 100644 --- a/R/create_lkups.R +++ b/R/create_lkups.R @@ -298,14 +298,34 @@ create_lkups <- function(data_dir, versions) { update_values = TRUE) - # merge population - popl <- pivot(pop, + # TEMP: fix ARG population --- START + pw <- pivot(pop, ids = c("country_code", "data_level"), names = list(variable = "reporting_year", value = "reporting_pop"), how = "longer") |> - ftransform(reporting_year = as_integer_factor(reporting_year)) |> - frename(data_level = reporting_level) + pivot(how = "wider", + ids = c("country_code", "reporting_year"), + values = "reporting_pop", + names = "data_level") |> + setorder(country_code, reporting_year) + + pw[country_code != "CHN", `:=`( + urban = national, + rural = national + )] + + # TEMP: fix ARG population --- END + + popl <- pivot(pw, + ids = c("country_code", "reporting_year"), + names = list(variable = "reporting_level", + value = "reporting_pop"), + how = "longer") |> + ftransform(reporting_year = as_integer_factor(reporting_year), + reporting_level = as_character_factor(reporting_level)) |> + setkey(NULL) + refy_lkup <- joyn::joyn(refy_lkup, popl, by = c('country_code', diff --git a/R/utils-plumber.R b/R/utils-plumber.R index 215310b6..acfe0ecc 100644 --- a/R/utils-plumber.R +++ b/R/utils-plumber.R @@ -565,3 +565,94 @@ change_grouped_stats_to_csv <- function(out) { out$deciles <- NULL data.frame(out) } + + + + + +#' Wrap a Plumber endpoint with standardized error handling +#' +#' `safe_endpoint()` wraps an endpoint handler in a `tryCatch`, ensuring +#' consistent error handling across the API. On success, the original +#' handler's result is returned. On error, a structured JSON object is +#' returned with useful metadata (status, message, request ID, endpoint), +#' and optionally additional debug details. +#' +#' Debug mode can be enabled by either: +#' \itemize{ +#' \item Passing `debug = TRUE` explicitly, or +#' \item Setting the environment variable `PIPAPI_DEBUG=TRUE`. +#' } +#' When debug mode is active, the error payload also includes the error +#' class, call, query parameters, and a truncated traceback. +#' +#' @param fun A function `(req, res)` containing the endpoint logic. +#' This is where you parse request arguments and call the relevant +#' internal functions. +#' @param endpoint Character string giving the endpoint path +#' (e.g., `"/api/v1/pip"`). Used in error payloads so clients know +#' which endpoint failed. +#' @param debug Logical; if `NULL` (default), inherits from the +#' environment variable `PIPAPI_DEBUG`. When `TRUE`, include extended +#' diagnostic details in the error response. +#' +#' @return A function `(req, res)` suitable for use in Plumber routes. +#' On error, sets `res$status <- 500` and returns a JSON object with: +#' \describe{ +#' \item{error}{A short description ("Error in /api/v1/...")} +#' \item{message}{Either the actual error message (debug mode) or +#' `"Internal Server Error"`} +#' \item{request_id}{The Plumber request ID, if available} +#' \item{endpoint}{The endpoint string supplied} +#' \item{class}{Error class (debug mode only)} +#' \item{call}{The call that generated the error (debug mode only)} +#' \item{query}{The query parameters (debug mode only)} +#' \item{trace}{Traceback captured by `rlang::trace_back()` (debug mode only)} +#' } +#' +#' @examples +#' \dontrun{ +#' # Example: wrap a handler for /api/v1/pip +#' #* @get /api/v1/pip +#' function(req, res) { +#' safe_endpoint(function(req, res) { +#' params <- req$argsQuery +#' params$lkup <- lkups$versions_paths[[req$argsQuery$version]] +#' params$version <- NULL +#' do.call(pipapi::ui_pip, params) +#' }, endpoint = "/api/v1/pip")(req, res) +#' } +#' } +#' +#' @export +safe_endpoint <- function(fun, endpoint, debug = NULL) { + if (is.null(debug)) { + debug <- identical(Sys.getenv("PIPAPI_DEBUG"), "TRUE") + } + + function(req, res) { + tryCatch( + { + fun(req, res) + }, + error = function(e) { + res$status <- 500L + out <- list( + error = paste("Error in", endpoint), + message = if (debug) conditionMessage(e) else "Internal Server Error", + request_id = tryCatch(req$.id, error = \(.) NA), + endpoint = endpoint + ) + if (debug) { + out$class <- class(e)[[1]] + out$call <- as.character(conditionCall(e)) + out$query <- req$argsQuery + out$trace <- utils::capture.output( + rlang::trace_back(bottom = 10, simplify = "branch") + ) + } + out + } + ) + } +} diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R index 8135a822..1b47256b 100644 --- a/inst/plumber/v1/endpoints.R +++ b/inst/plumber/v1/endpoints.R @@ -977,7 +977,7 @@ function(req, res) { res$status <- 500 list(error = "Error in /api/v1/cp-charts", message = e$message, - request_id = tryCatch(req$.id, error = function(.) NA)) + request_id = tryCatch(req$.id, error = \(.) NA)) }) } diff --git a/man/safe_endpoint.Rd b/man/safe_endpoint.Rd new file mode 100644 index 00000000..e754bdc4 --- /dev/null +++ b/man/safe_endpoint.Rd @@ -0,0 +1,67 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-plumber.R +\name{safe_endpoint} +\alias{safe_endpoint} +\title{Wrap a Plumber endpoint with standardized error handling} +\usage{ +safe_endpoint(fun, endpoint, debug = NULL) +} +\arguments{ +\item{fun}{A function \verb{(req, res)} containing the endpoint logic. +This is where you parse request arguments and call the relevant +internal functions.} + +\item{endpoint}{Character string giving the endpoint path +(e.g., \code{"/api/v1/pip"}). Used in error payloads so clients know +which endpoint failed.} + +\item{debug}{Logical; if \code{NULL} (default), inherits from the +environment variable \code{PIPAPI_DEBUG}. When \code{TRUE}, include extended +diagnostic details in the error response.} +} +\value{ +A function \verb{(req, res)} suitable for use in Plumber routes. +On error, sets \code{res$status <- 500} and returns a JSON object with: +\describe{ +\item{error}{A short description ("Error in /api/v1/...")} +\item{message}{Either the actual error message (debug mode) or +\code{"Internal Server Error"}} +\item{request_id}{The Plumber request ID, if available} +\item{endpoint}{The endpoint string supplied} +\item{class}{Error class (debug mode only)} +\item{call}{The call that generated the error (debug mode only)} +\item{query}{The query parameters (debug mode only)} +\item{trace}{Traceback captured by \code{rlang::trace_back()} (debug mode only)} +} +} +\description{ +\code{safe_endpoint()} wraps an endpoint handler in a \code{tryCatch}, ensuring +consistent error handling across the API. On success, the original +handler's result is returned. On error, a structured JSON object is +returned with useful metadata (status, message, request ID, endpoint), +and optionally additional debug details. +} +\details{ +Debug mode can be enabled by either: +\itemize{ +\item Passing \code{debug = TRUE} explicitly, or +\item Setting the environment variable \code{PIPAPI_DEBUG=TRUE}. +} +When debug mode is active, the error payload also includes the error +class, call, query parameters, and a truncated traceback. +} +\examples{ +\dontrun{ +# Example: wrap a handler for /api/v1/pip +#* @get /api/v1/pip +function(req, res) { + safe_endpoint(function(req, res) { + params <- req$argsQuery + params$lkup <- lkups$versions_paths[[req$argsQuery$version]] + params$version <- NULL + do.call(pipapi::ui_pip, params) + }, endpoint = "/api/v1/pip")(req, res) +} +} + +} From d9e9420e2ed21ca3cb49704cf86853435fa515d6 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Mon, 22 Sep 2025 21:10:50 -0400 Subject: [PATCH 174/203] impore with_req_timeout --- NAMESPACE | 1 + R/utils-plumber.R | 40 +++++++++++++++++++++++++++++++++++++ inst/plumber/v1/endpoints.R | 14 +------------ 3 files changed, 42 insertions(+), 13 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index c3c4bef4..af6ba74d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -59,6 +59,7 @@ export(use_new_lineup_version) export(valid_years) export(validate_input_grouped_stats) export(version_dataframe) +export(with_req_timeout) export(wld_lineup_year) import(collapse, except = fdroplevels) import(data.table) diff --git a/R/utils-plumber.R b/R/utils-plumber.R index acfe0ecc..13d784e1 100644 --- a/R/utils-plumber.R +++ b/R/utils-plumber.R @@ -656,3 +656,43 @@ safe_endpoint <- function(fun, endpoint, debug = NULL) { ) } } + +# ---- bounded execution helper ------------------------------------------- +#' Evaluate an expression with a timeout +#' +#' Wraps [R.utils::withTimeout()] but returns a structured failure +#' object instead of stopping the whole process. This allows +#' `safe_endpoint()` to handle timeouts like normal errors without +#' killing the API process. +#' +#' @param expr Expression to evaluate. +#' @param secs Timeout in seconds (default: from env var `PLUMBER_REQ_TIMEOUT`, +#' or 150 if unset). +#' +#' @return Result of `expr` if it finishes in time; otherwise a list +#' with `ok = FALSE`, `error = "timeout"`, and `elapsed` seconds. +#' @export +with_req_timeout <- function(expr, + secs = as.numeric(Sys.getenv("PLUMBER_REQ_TIMEOUT", "150"))) { + if (!is.finite(secs) || secs <= 0) return(force(expr)) + + start <- proc.time()[["elapsed"]] + tryCatch( + { + R.utils::withTimeout( + expr = force(expr), + timeout = secs, + onTimeout = "error" + ) + + }, + TimeoutException = \(e) { + elapsed <- proc.time()[["elapsed"]] - start + list( + ok = FALSE, + error = sprintf("Request exceeded timeout of %s seconds", secs), + elapsed = elapsed + ) + } + ) +} diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R index 1b47256b..a9382b45 100644 --- a/inst/plumber/v1/endpoints.R +++ b/inst/plumber/v1/endpoints.R @@ -76,19 +76,7 @@ function(pr) { }) } -# ---- bounded execution helper ------------------------------------------- -with_req_timeout <- - function(expr, - secs = as.numeric(Sys.getenv("PLUMBER_REQ_TIMEOUT", "150"))) { - if (!is.finite(secs) || secs <= 0) return(force(expr)) - - R.utils::withTimeout( - expr = force(expr), - timeout = secs, - onTimeout = "silent" # uses setTimeLimit() - ) - } # ======================================================================== # API filters ------------------------------------------------------------- @@ -760,7 +748,7 @@ function(req, res) { params$lkup <- lkups$versions_paths[[req$argsQuery$version]] params$version <- NULL out <- do.call(pipapi::ui_hp_stacked, params) |> - with_req_timeout() + with_req_timeout() if (is.null(out)) { res$status <- 503 return(list( From 8c8151db2035379af1c15d82024edf86131d73b2 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Mon, 22 Sep 2025 21:13:23 -0400 Subject: [PATCH 175/203] test safe_endpoint with cp-charts --- inst/plumber/v1/endpoints.R | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R index a9382b45..6d185d65 100644 --- a/inst/plumber/v1/endpoints.R +++ b/inst/plumber/v1/endpoints.R @@ -945,29 +945,17 @@ function(req, res) { #* @param ppp_version:[chr] ppp year to be used #* @param version:[chr] Data version. Defaults to most recent version. See api/v1/versions #* @serializer json -function(req, res) { - tryCatch({ - params <- req$argsQuery - params$lkup <- lkups$versions_paths[[req$argsQuery$version]] - params$version <- NULL - out <- do.call(pipapi::ui_cp_charts, params) |> +cp_charts <- safe_endpoint(function(req, res) { + params <- req$argsQuery + params$lkup <- lkups$versions_paths[[req$argsQuery$version]] + params$version <- NULL + + # wrap the heavy work in with_req_timeout + do.call(pipapi::ui_cp_charts, params) |> with_req_timeout() - if (is.null(out)) { - res$status <- 503 - return(list( - error = "Request timed out", - request_id = req$.id, - endpoint = "/api/v1/cp-charts" - )) - } - out - }, error = function(e) { - res$status <- 500 - list(error = "Error in /api/v1/cp-charts", - message = e$message, - request_id = tryCatch(req$.id, error = \(.) NA)) - }) -} + + }, + endpoint = "/api/v1/cp-charts") ### cp-download ----------- #* Return Country Profile - Downloads From 46373a4c8cf29e9dbaefa84493aafed14b3c4feb Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Mon, 22 Sep 2025 21:13:37 -0400 Subject: [PATCH 176/203] Increment version number to 1.3.24.9011 --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index ef21f1b2..f16b98a1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: pipapi Title: API for the Poverty and Inequality Platform -Version: 1.3.24.9010 +Version: 1.3.24.9011 Authors@R: c(person(given = "Tony", family = "Fujs", From 219167e0f6ff0e92ba77967d2f412611ed09f2f9 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Tue, 23 Sep 2025 09:55:54 -0400 Subject: [PATCH 177/203] document with_req_timeout --- man/with_req_timeout.Rd | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 man/with_req_timeout.Rd diff --git a/man/with_req_timeout.Rd b/man/with_req_timeout.Rd new file mode 100644 index 00000000..e6bc2c4c --- /dev/null +++ b/man/with_req_timeout.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-plumber.R +\name{with_req_timeout} +\alias{with_req_timeout} +\title{Evaluate an expression with a timeout} +\usage{ +with_req_timeout( + expr, + secs = as.numeric(Sys.getenv("PLUMBER_REQ_TIMEOUT", "150")) +) +} +\arguments{ +\item{expr}{Expression to evaluate.} + +\item{secs}{Timeout in seconds (default: from env var \code{PLUMBER_REQ_TIMEOUT}, +or 150 if unset).} +} +\value{ +Result of \code{expr} if it finishes in time; otherwise a list +with \code{ok = FALSE}, \code{error = "timeout"}, and \code{elapsed} seconds. +} +\description{ +Wraps \code{\link[R.utils:withTimeout]{R.utils::withTimeout()}} but returns a structured failure +object instead of stopping the whole process. This allows +\code{safe_endpoint()} to handle timeouts like normal errors without +killing the API process. +} From 57b25480cc6703f87386d8126f3b479184b5b721 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Wed, 24 Sep 2025 09:47:09 -0400 Subject: [PATCH 178/203] update news --- NEWS.md | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 4808fc19..46f378fb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,29 @@ # pipapi (development version) -* add new way to infer poverty lines +* add function safe_endpoint +* remove the preroute as it is in endpoints.R +* new plumber setup. +* revamp plumber +* implement timeout wrapper to more endpoint +* add Request timed out to pip and pip-grp endpoints +* Add a tiny request-ID + timings filter +* add some tryCatch in some endpoints +* handle errors +* fix bug req$args to req$argsQuery +* remove gc() +* filter properly proverty lines to update master (inter cache) file +* make use of missing data file instead of recreate i +* remove verbose in join +* sort for surveys with more than 1 reporting_level +* update empty data +* add multiple popshare for fillgaps +* remove filter_Lkup from fg_pip() +* allow to select regions from any variable. this must be changed once we allow many other aggregations +* add prosperity gap to ui_cp_charts +* multiple popshare for svy years. +* improved version of infer_poverty_line() +* update data documentation +* store vars in env defined in zzz +* Fix several bugs # pipapi 1.3.24 From 6b6643d7953dd5cf84fdfe9620d1e9c90b29a284 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Wed, 24 Sep 2025 09:47:23 -0400 Subject: [PATCH 179/203] Increment version number to 1.4.0 --- DESCRIPTION | 2 +- NEWS.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index f16b98a1..8130c10b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: pipapi Title: API for the Poverty and Inequality Platform -Version: 1.3.24.9011 +Version: 1.4.0 Authors@R: c(person(given = "Tony", family = "Fujs", diff --git a/NEWS.md b/NEWS.md index 46f378fb..79aac109 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,4 @@ -# pipapi (development version) +# pipapi 1.4.0 * add function safe_endpoint * remove the preroute as it is in endpoints.R * new plumber setup. From a01a71ecb7b70aa2d534cb63c5a8d54a35f7a7e4 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Wed, 24 Sep 2025 11:23:36 -0400 Subject: [PATCH 180/203] remove old regions from pc_charts --- R/ui_poverty_indicators.R | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/R/ui_poverty_indicators.R b/R/ui_poverty_indicators.R index b4e2b26d..899e4a34 100644 --- a/R/ui_poverty_indicators.R +++ b/R/ui_poverty_indicators.R @@ -96,7 +96,18 @@ ui_pc_regional <- function(country = "ALL", reporting_pop = reporting_pop / pop_units )] - out <- out[estimate_type == "actual"] + # TEMP START: remove old aggregations -------------- + cl <- lkup$aux_files$country_list + + regs <- cl[, .(region_code, africa_split_code)] |> + unlist() |> # convert to vector + na_omit() |> + unique() |> + unname() |> + c("WLD") # add the world + # TEMP END: remove old aggregations -------------- + + out <- out[estimate_type == "actual" & region_code %in% regs] return(out) } From 48b1a2ca5dc3c404bb915c7f61c26ab8c3fc168f Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Wed, 24 Sep 2025 11:23:47 -0400 Subject: [PATCH 181/203] Increment version number to 1.4.0.9000 --- DESCRIPTION | 2 +- NEWS.md | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 8130c10b..c60d4f17 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: pipapi Title: API for the Poverty and Inequality Platform -Version: 1.4.0 +Version: 1.4.0.9000 Authors@R: c(person(given = "Tony", family = "Fujs", diff --git a/NEWS.md b/NEWS.md index 79aac109..31ff8530 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,5 @@ +# pipapi (development version) + # pipapi 1.4.0 * add function safe_endpoint * remove the preroute as it is in endpoints.R From 243ef1d1b185171923a425e693c2329ad5150ca8 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Wed, 24 Sep 2025 11:28:40 -0400 Subject: [PATCH 182/203] fix descriptions --- NEWS.md | 51 ++++++++++++++++++++++++++------------------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/NEWS.md b/NEWS.md index 31ff8530..5eab34d6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,31 +1,32 @@ # pipapi (development version) +* Removed old regions ggregate from `pc_chart`. # pipapi 1.4.0 -* add function safe_endpoint -* remove the preroute as it is in endpoints.R -* new plumber setup. -* revamp plumber -* implement timeout wrapper to more endpoint -* add Request timed out to pip and pip-grp endpoints -* Add a tiny request-ID + timings filter -* add some tryCatch in some endpoints -* handle errors -* fix bug req$args to req$argsQuery -* remove gc() -* filter properly proverty lines to update master (inter cache) file -* make use of missing data file instead of recreate i -* remove verbose in join -* sort for surveys with more than 1 reporting_level -* update empty data -* add multiple popshare for fillgaps -* remove filter_Lkup from fg_pip() -* allow to select regions from any variable. this must be changed once we allow many other aggregations -* add prosperity gap to ui_cp_charts -* multiple popshare for svy years. -* improved version of infer_poverty_line() -* update data documentation -* store vars in env defined in zzz -* Fix several bugs +* Added `safe_endpoint` function for safer endpoint handling. +* Removed `preroute` as it is now managed in `endpoints.R`. +* Introduced a new and improved Plumber setup. +* Refactored Plumber configuration for better maintainability. +* Implemented a timeout wrapper for more endpoints. +* Added "Request timed out" responses to `pip` and `pip-grp` endpoints. +* Introduced a lightweight request ID and timing filter for better request tracking. +* Added `tryCatch` blocks to improve error handling in several endpoints. +* Enhanced error handling throughout the API. +* Fixed bug by changing `req$args` to `req$argsQuery`. +* Removed unnecessary calls to `gc()`. +* Improved filtering of poverty lines when updating the master (intermediate cache) file. +* Utilized the missing data file instead of recreating it. +* Removed verbose output in join operations. +* Ensured correct sorting for surveys with multiple `reporting_level` values. +* Updated handling of empty data responses. +* Enabled support for multiple `popshare` values in fill gaps operations. +* Removed `filter_Lkup` from `fg_pip()`. +* Allowed region selection from any variable (to be revised when supporting more aggregations). +* Added prosperity gap metric to `ui_cp_charts`. +* Enabled multiple `popshare` values for survey years. +* Improved the `infer_poverty_line()` function. +* Updated data documentation for clarity and completeness. +* Stored variables in the environment defined in `zzz`. +* Fixed several bugs across the codebase. # pipapi 1.3.24 From ff3fd8a2c3d0f759c51ed3d3baec518528764001 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Sat, 27 Sep 2025 14:30:40 -0400 Subject: [PATCH 183/203] remove MDG censoring --- inst/plumber/v1/endpoints.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R index 6d185d65..428a82db 100644 --- a/inst/plumber/v1/endpoints.R +++ b/inst/plumber/v1/endpoints.R @@ -1005,7 +1005,7 @@ function(req, res) { if (req$argsQuery$table == "countries" && exclude == TRUE) { # hardcoded - to_remove <- c("MDG", "UKR") + to_remove <- c("UKR") out <- out[!(country_code %in% to_remove)] } } From 74b77ab49edce14745d66f24058df0ac6f3c771c Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Sat, 27 Sep 2025 14:43:57 -0400 Subject: [PATCH 184/203] excliude regions that should not go in UI --- R/get_aux_table.R | 26 +++++++++++++++++++++++++- inst/plumber/v1/endpoints.R | 8 -------- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/R/get_aux_table.R b/R/get_aux_table.R index 5f2bae95..e26db872 100644 --- a/R/get_aux_table.R +++ b/R/get_aux_table.R @@ -45,15 +45,39 @@ get_aux_table <- function(data_dir = NULL, #' Helper function to the UI #' @param data_dir character: Data directory #' @param table character: Name of auxiliary table +#' @param esclude logical: whether or not to exclude some countries or regions... #' #' @return data.frame #' @export #' -get_aux_table_ui <- function(data_dir, table) { +get_aux_table_ui <- function(data_dir, + table, + exclude = TRUE) { out <- get_aux_table(data_dir = data_dir, table = table, long_format = FALSE) + if (table == "regions") { + # TEMP START: remove old aggregations -------------- + cl <- lkup$aux_files$country_list + + regs <- cl[, .(region_code, africa_split_code)] |> + unlist() |> # convert to vector + na_omit() |> + unique() |> + unname() |> + c("WLD") # add the world + # TEMP END: remove old aggregations -------------- + + out <- out[region_code %in% regs] + + } else if (table == "countries" && exclude == TRUE) { + # hardcoded + to_remove <- c("UKR") + out <- out[!(country_code %in% to_remove)] + } + + return(out) } diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R index 428a82db..359167d2 100644 --- a/inst/plumber/v1/endpoints.R +++ b/inst/plumber/v1/endpoints.R @@ -989,7 +989,6 @@ function(req, res) { #* @param exclude:[bool] exclude countries. only applies for "countries" table function(req, res) { params <- req$argsQuery - exclude <- req$argsQuery$exclude res$serializer <- pipapi::assign_serializer(format = params$format) if (is.null(req$argsQuery$table)) { # return all available tables if none selected @@ -1000,14 +999,7 @@ function(req, res) { params$data_dir <- lkups$versions_paths[[params$version]]$data_root params$format <- NULL params$version <- NULL - params$exclude <- NULL out <- do.call(pipapi::get_aux_table_ui, params) - - if (req$argsQuery$table == "countries" && exclude == TRUE) { - # hardcoded - to_remove <- c("UKR") - out <- out[!(country_code %in% to_remove)] - } } out } From 2555f1a65c0b6e5b513754c8a9813421ccd79641 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Sat, 27 Sep 2025 14:44:52 -0400 Subject: [PATCH 185/203] document --- man/get_aux_table_ui.Rd | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/man/get_aux_table_ui.Rd b/man/get_aux_table_ui.Rd index 8d5edc4b..21d1a485 100644 --- a/man/get_aux_table_ui.Rd +++ b/man/get_aux_table_ui.Rd @@ -5,12 +5,14 @@ \title{Return specified auxiliary data in wide format Helper function to the UI} \usage{ -get_aux_table_ui(data_dir, table) +get_aux_table_ui(data_dir, table, exclude = TRUE) } \arguments{ \item{data_dir}{character: Data directory} \item{table}{character: Name of auxiliary table} + +\item{esclude}{logical: whether or not to exclude some countries or regions...} } \value{ data.frame From fa4639935433cfba64bc0e4f4c5805d8901fd808 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Sat, 27 Sep 2025 14:45:39 -0400 Subject: [PATCH 186/203] update news --- NEWS.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 5eab34d6..71da4a50 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,7 @@ # pipapi (development version) -* Removed old regions ggregate from `pc_chart`. +* Removed old regions aggregate from `pc_chart`. +* Removed old regions aggregate from `get_aux_table_ui`. +* Bring back MDG. # pipapi 1.4.0 * Added `safe_endpoint` function for safer endpoint handling. From 54e64bf7feac76e47be5985b785cf2e6f45c7bd6 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Sat, 27 Sep 2025 14:45:48 -0400 Subject: [PATCH 187/203] Increment version number to 1.4.0.9001 --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index c60d4f17..a0e8b314 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: pipapi Title: API for the Poverty and Inequality Platform -Version: 1.4.0.9000 +Version: 1.4.0.9001 Authors@R: c(person(given = "Tony", family = "Fujs", From 1235870a7ef32e30ad180982cc0717ba5c1af69f Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Sat, 27 Sep 2025 15:02:56 -0400 Subject: [PATCH 188/203] add hashed to all UI endpoint --- R/create_lkups.R | 9 +++++++-- R/ui_home_page.R | 4 +++- R/ui_poverty_indicators.R | 8 ++++++-- inst/plumber/v1/endpoints.R | 3 ++- 4 files changed, 18 insertions(+), 6 deletions(-) diff --git a/R/create_lkups.R b/R/create_lkups.R index 32c82d53..9bb03efd 100644 --- a/R/create_lkups.R +++ b/R/create_lkups.R @@ -654,12 +654,17 @@ create_lkups <- function(data_dir, versions) { ## `pipapi` runs onto, such as path to data files, which will not be the same ## on my laptop, and on the PROD VM. These variables therefore need to be removed ## prior to the creation of the cache_data_id - hash_svy_lkup <- svy_lkup + hash_svy_lkup <- data.table::copy(svy_lkup) hash_svy_lkup$path <- NULL - hash_ref_lkup <- ref_lkup + if (use_new_lineup_version) { + hash_ref_lkup <- data.table::copy(refy_lkup) + } else { + hash_ref_lkup <- data.table::copy(ref_lkup) + } hash_ref_lkup$path <- NULL + query_controls_hash <- query_controls query_controls_hash$version <- NULL diff --git a/R/ui_home_page.R b/R/ui_home_page.R index 6b230aea..0cc201a6 100644 --- a/R/ui_home_page.R +++ b/R/ui_home_page.R @@ -68,7 +68,9 @@ ui_hp_stacked <- function(povline = 1.9, ui_hp_countries <- function(country = c("IDN", "CIV"), povline = 1.9, pop_units = 1e6, - lkup) { + lkup, + lkup_hash = lkup$cache_data_id$hash_pip + ) { out <- pip( country = country, year = "all", diff --git a/R/ui_poverty_indicators.R b/R/ui_poverty_indicators.R index 899e4a34..4821aa7a 100644 --- a/R/ui_poverty_indicators.R +++ b/R/ui_poverty_indicators.R @@ -15,7 +15,9 @@ ui_pc_charts <- function(country = c("AGO"), reporting_level = c("all", "national", "rural", "urban"), pop_units = 1e6, censor = TRUE, - lkup) { + lkup, + lkup_hash = lkup$cache_data_id$hash_pip_grp + ) { # Set returned columns return_cols <- lkup$return_cols$ui_pc_charts$cols inequality_indicators <- lkup$return_cols$ui_pc_charts$inequality_indicators @@ -73,7 +75,9 @@ ui_pc_regional <- function(country = "ALL", year = "ALL", povline = 1.9, pop_units = 1e6, - lkup) { + lkup, + lkup_hash = lkup$cache_data_id$hash_pip_grp + ) { # TEMPORARY UNTIL SELECTION MECHANISM IS BEING IMPROVED country <- toupper(country) diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R index 359167d2..e0fc4651 100644 --- a/inst/plumber/v1/endpoints.R +++ b/inst/plumber/v1/endpoints.R @@ -881,7 +881,8 @@ function(req, res) { params$lkup <- lkups$versions_paths[[req$argsQuery$version]] params$version <- NULL out <- do.call(pipapi::ui_pc_regional, params) |> - with_req_timeout() + with_req_timeout() + if (is.null(out)) { res$status <- 503 return(list( From ecaff99f00205bf69329eebcee7f469d8fad501a Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Sat, 27 Sep 2025 15:04:17 -0400 Subject: [PATCH 189/203] Increment version number to 1.4.0.9002 --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index a0e8b314..40237391 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: pipapi Title: API for the Poverty and Inequality Platform -Version: 1.4.0.9001 +Version: 1.4.0.9002 Authors@R: c(person(given = "Tony", family = "Fujs", From 368462dcfa16bc3c82fa205d5dde87191968b7a5 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Sat, 27 Sep 2025 15:33:46 -0400 Subject: [PATCH 190/203] temp fix with indicators file --- NEWS.md | 1 + inst/plumber/v1/endpoints.R | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 71da4a50..74fe6b6c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,7 @@ * Removed old regions aggregate from `pc_chart`. * Removed old regions aggregate from `get_aux_table_ui`. * Bring back MDG. +* Add hash to all UI enpdoints # pipapi 1.4.0 * Added `safe_endpoint` function for safer endpoint handling. diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R index e0fc4651..307e0d84 100644 --- a/inst/plumber/v1/endpoints.R +++ b/inst/plumber/v1/endpoints.R @@ -718,8 +718,12 @@ function(req) { #* @param version:[chr] Data version. Defaults to most recent version. See api/v1/versions #* @serializer json list(na="null") function(req) { - pipapi::get_aux_table(data_dir = lkups$versions_paths[[req$argsQuery$version]]$data_root, + out <- pipapi::get_aux_table(data_dir = lkups$versions_paths[[req$argsQuery$version]]$data_root, table = "indicators") + + # TEMP patch + out |> + collapse::funique(cols = "indicator_code") } ### decomposition-vars -------------- From 38de1febf15385ab0eadd68be897602870a801ff Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Sat, 27 Sep 2025 15:33:55 -0400 Subject: [PATCH 191/203] Increment version number to 1.4.0.9003 --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 40237391..c1ba3e99 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: pipapi Title: API for the Poverty and Inequality Platform -Version: 1.4.0.9002 +Version: 1.4.0.9003 Authors@R: c(person(given = "Tony", family = "Fujs", From a66f362a3faf3ca0f6bfe6d565fa8c394017177b Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Sat, 27 Sep 2025 21:34:36 -0400 Subject: [PATCH 192/203] fiw distribution_type in fg gaps and lkups --- R/create_lkups.R | 43 +++++++++++++++++++++++++++++++++++++------ R/fg_pip.R | 19 +++++++++++++++---- 2 files changed, 52 insertions(+), 10 deletions(-) diff --git a/R/create_lkups.R b/R/create_lkups.R index 9bb03efd..a6086bd6 100644 --- a/R/create_lkups.R +++ b/R/create_lkups.R @@ -198,9 +198,43 @@ create_lkups <- function(data_dir, versions) { refy_lkup <- fst::read_fst(refy_lkup_path, as.data.table = TRUE) + ## TEMP START: add distribution type ----------- + dt <- ref_lkup[, .(country_code, + reporting_year, + welfare_type, + reporting_level, + distribution_type)] + + + dt[, + y := as.integer(length(unique(distribution_type)) == 1), + by = .(country_code, + reporting_year, + welfare_type, + reporting_level) + ] + dt[y == 0, + distribution_type := "mixed" + ][, y := NULL] + + dt <- funique(dt) + refy_lkup <- joyn::joyn(refy_lkup, dt, + by = c("country_code", + "reporting_year", + "welfare_type", + "reporting_level"), + match_type = "1:1", + keep = "left", + update_values = TRUE, + reportvar = FALSE) + + + ## TEMP END: add distribution type ----------- + + # ZP ADD - CREATE OBJECT: lineup years - #___________________________________________________________________________ + #______________________________________________________________ lineup_years_path <- fs::path(data_dir, "estimations/lineup_years.fst") @@ -298,7 +332,7 @@ create_lkups <- function(data_dir, versions) { update_values = TRUE) - # TEMP: fix ARG population --- START + ## TEMP START: fix ARG population ---- pw <- pivot(pop, ids = c("country_code", "data_level"), names = list(variable = "reporting_year", @@ -315,7 +349,7 @@ create_lkups <- function(data_dir, versions) { rural = national )] - # TEMP: fix ARG population --- END + ## TEMP END: fix ARG population ------ popl <- pivot(pw, ids = c("country_code", "reporting_year"), @@ -337,9 +371,6 @@ create_lkups <- function(data_dir, versions) { update_values = TRUE) - - - # --- END inclussion of CMD data. refy_lkup <- refy_lkup[reporting_year %in% lineup_years$lineup_years, ] diff --git a/R/fg_pip.R b/R/fg_pip.R index 726a0779..6adb29eb 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -111,7 +111,7 @@ fg_pip <- function(country, rm(LDTg) } rm(lfst) - + # Add just mean and median @@ -121,9 +121,20 @@ fg_pip <- function(country, # try metadata unique code tmp_metadata <- copy(metadata) # I think we can avoid this inefficiency. # Handle multiple distribution types (for aggregated distributions) - if (length(unique(tmp_metadata$distribution_type)) > 1) { - tmp_metadata[, distribution_type := "mixed"] - } + + tmp_metadata[, + y := as.integer(length(unique(distribution_type)) == 1), + by = .(country_code, + reporting_year, + welfare_type, + reporting_level) + ] + + tmp_metadata[y == 0, + distribution_type := "mixed" + ][, + y := NULL] + # convert survey_comparability to NA # NOTE: This should not be necessary. for the new lineup distribution # metadata should come without this variable. From 88d1df84458b8d096c7b4cef5cf777b629cede52 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Sat, 27 Sep 2025 22:05:41 -0400 Subject: [PATCH 193/203] formatting --- R/fg_pip.R | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/R/fg_pip.R b/R/fg_pip.R index 6adb29eb..70111a53 100644 --- a/R/fg_pip.R +++ b/R/fg_pip.R @@ -142,14 +142,18 @@ fg_pip <- function(country, # get all vars meta_vars <- setdiff(names(tmp_metadata), "reporting_year") # transform to NA when necessary - i.e. when interpolated (two rows per reporting_year) - tmp_metadata[, (meta_vars) := lapply(.SD, \(x) { - if (uniqueN(x) == 1) { - x - } else { - NA - }}), - by = c("reporting_year", "country_code", "reporting_level", "welfare_type"), - .SDcols = meta_vars] + tmp_metadata[, + (meta_vars) := lapply(.SD, \(x) { + if (uniqueN(x) == 1) { + x + } else { + NA + }}), + by = c("reporting_year", + "country_code", + "reporting_level", + "welfare_type"), + .SDcols = meta_vars] # Remove duplicate rows by reporting_year (keep only one row per # reporting_year) @@ -158,11 +162,12 @@ fg_pip <- function(country, out <- join(res, tmp_metadata_unique, - on = c("country_code", "reporting_year", - "reporting_level"), - how = "left", # ZP: change from full to left, - # this rm nowcast years - i.e. years not included - # as lineup years + on = c("country_code", + "reporting_year", + "reporting_level"), + how = "left", # ZP: change from full to left, + # this rm nowcast years - i.e. years not included + # as lineup years validate = "m:1", drop.dup.cols = TRUE, verbose = 0, From a40eb05a2ab1b477aa88985df6f862e6342e217e Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Sat, 27 Sep 2025 22:06:04 -0400 Subject: [PATCH 194/203] Increment version number to 1.4.0.9004 --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index c1ba3e99..b4207281 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: pipapi Title: API for the Poverty and Inequality Platform -Version: 1.4.0.9003 +Version: 1.4.0.9004 Authors@R: c(person(given = "Tony", family = "Fujs", From 4a7d196f1d3f43629bc9ec735dad6a51b43ff8e1 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Sat, 27 Sep 2025 23:08:43 -0400 Subject: [PATCH 195/203] add cache_data_id to pip_grp_new --- R/pip_grp_new.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/pip_grp_new.R b/R/pip_grp_new.R index efa25bb8..bf45c3e0 100644 --- a/R/pip_grp_new.R +++ b/R/pip_grp_new.R @@ -7,7 +7,8 @@ pip_grp_new <- \(country = "ALL", reporting_level = c("all", "national"), lkup, censor = TRUE, - additional_ind = FALSE) { + additional_ind = FALSE, + lkup_hash = lkup$cache_data_id$hash_pip_grp) { welfare_type <- match.arg(welfare_type) reporting_level <- match.arg(reporting_level) From 63064662d1588feeb0ef495ec7f4968ee05c6667 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Sat, 27 Sep 2025 23:09:04 -0400 Subject: [PATCH 196/203] Increment version number to 1.4.0.9005 --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index b4207281..b4c9ec4f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: pipapi Title: API for the Poverty and Inequality Platform -Version: 1.4.0.9004 +Version: 1.4.0.9005 Authors@R: c(person(given = "Tony", family = "Fujs", From 4bd5a614b4b29e719b78db60543ac4a51f67bf6a Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Mon, 29 Sep 2025 11:00:35 -0400 Subject: [PATCH 197/203] remove patch in indicators files --- inst/plumber/v1/endpoints.R | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R index 307e0d84..98485220 100644 --- a/inst/plumber/v1/endpoints.R +++ b/inst/plumber/v1/endpoints.R @@ -720,10 +720,7 @@ function(req) { function(req) { out <- pipapi::get_aux_table(data_dir = lkups$versions_paths[[req$argsQuery$version]]$data_root, table = "indicators") - - # TEMP patch - out |> - collapse::funique(cols = "indicator_code") + out } ### decomposition-vars -------------- From 81498da6529dd42a5b9b6f37d2da55b774fd96fa Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Mon, 29 Sep 2025 11:17:00 -0400 Subject: [PATCH 198/203] Increment version number to 1.4.0.9006 --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index b4c9ec4f..7660d51c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: pipapi Title: API for the Poverty and Inequality Platform -Version: 1.4.0.9005 +Version: 1.4.0.9006 Authors@R: c(person(given = "Tony", family = "Fujs", From 8ff8c4ba9605194fd130d6f1558a47fc955e2b0a Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Mon, 29 Sep 2025 13:36:46 -0400 Subject: [PATCH 199/203] fix issue with regions table in the UI --- R/get_aux_table.R | 3 ++- inst/plumber/v1/endpoints.R | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/R/get_aux_table.R b/R/get_aux_table.R index e26db872..5e517666 100644 --- a/R/get_aux_table.R +++ b/R/get_aux_table.R @@ -52,7 +52,8 @@ get_aux_table <- function(data_dir = NULL, #' get_aux_table_ui <- function(data_dir, table, - exclude = TRUE) { + exclude = TRUE, + lkup) { out <- get_aux_table(data_dir = data_dir, table = table, diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R index 98485220..e887034d 100644 --- a/inst/plumber/v1/endpoints.R +++ b/inst/plumber/v1/endpoints.R @@ -1001,6 +1001,7 @@ function(req, res) { params$data_dir <- lkups$versions_paths[[params$version]]$data_root params$format <- NULL params$version <- NULL + params$lkup <- lkups$versions_paths[[params$version]] out <- do.call(pipapi::get_aux_table_ui, params) } out From 53aadbd6f160d9223fce28b68197797beb38cd15 Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Mon, 29 Sep 2025 13:36:58 -0400 Subject: [PATCH 200/203] Increment version number to 1.4.0.9007 --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 7660d51c..18e9e001 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: pipapi Title: API for the Poverty and Inequality Platform -Version: 1.4.0.9006 +Version: 1.4.0.9007 Authors@R: c(person(given = "Tony", family = "Fujs", From 6e20c180f1b3fd23d6d98abe607260d2b1a5a91f Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Mon, 29 Sep 2025 14:27:17 -0400 Subject: [PATCH 201/203] fix bug --- inst/TMP/TMP_API_launcher.R | 9 +++------ inst/plumber/v1/endpoints.R | 2 +- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/inst/TMP/TMP_API_launcher.R b/inst/TMP/TMP_API_launcher.R index 72e28195..7ce361b3 100644 --- a/inst/TMP/TMP_API_launcher.R +++ b/inst/TMP/TMP_API_launcher.R @@ -4,12 +4,9 @@ library(pipapi) # devtools::load_all(".") if (Sys.info()[["user"]] == "wb384996") { - force <- FALSE - if (!"lkups" %in% ls() || isTRUE(force)) { - data_dir <- Sys.getenv("PIPAPI_DATA_ROOT_FOLDER_LOCAL") |> - fs::path() - fs::dir_ls(data_dir, recurse = FALSE) - } + data_dir <- Sys.getenv("PIPAPI_DATA_ROOT_FOLDER_LOCAL") |> + fs::path() + fs::dir_ls(data_dir, recurse = FALSE) latest_version <- diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R index e887034d..7acae90b 100644 --- a/inst/plumber/v1/endpoints.R +++ b/inst/plumber/v1/endpoints.R @@ -999,9 +999,9 @@ function(req, res) { } else { # Return only requested table params$data_dir <- lkups$versions_paths[[params$version]]$data_root + params$lkup <- lkups$versions_paths[[params$version]] params$format <- NULL params$version <- NULL - params$lkup <- lkups$versions_paths[[params$version]] out <- do.call(pipapi::get_aux_table_ui, params) } out From c2d0a3340ae9c2a3771fd939eb3b5c3be0ca31ab Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Mon, 29 Sep 2025 14:27:29 -0400 Subject: [PATCH 202/203] Increment version number to 1.4.0.9008 --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 18e9e001..d724e400 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: pipapi Title: API for the Poverty and Inequality Platform -Version: 1.4.0.9007 +Version: 1.4.0.9008 Authors@R: c(person(given = "Tony", family = "Fujs", From a521953b53f222206657671219c5020c97ea856a Mon Sep 17 00:00:00 2001 From: "R.Andres Castaneda" Date: Mon, 29 Sep 2025 16:14:21 -0400 Subject: [PATCH 203/203] Increment version number to 1.4.1 --- DESCRIPTION | 2 +- NEWS.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index d724e400..9aedcc13 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: pipapi Title: API for the Poverty and Inequality Platform -Version: 1.4.0.9008 +Version: 1.4.1 Authors@R: c(person(given = "Tony", family = "Fujs", diff --git a/NEWS.md b/NEWS.md index 74fe6b6c..4c3be5b2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,4 @@ -# pipapi (development version) +# pipapi 1.4.1 * Removed old regions aggregate from `pc_chart`. * Removed old regions aggregate from `get_aux_table_ui`. * Bring back MDG.