From ddebbfd712b947089a1a9be117b8759ab9ae4b97 Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Tue, 27 May 2025 16:05:02 -0400
Subject: [PATCH 001/203] add refy_lkup

---
 R/create_lkups.R | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/R/create_lkups.R b/R/create_lkups.R
index add360c4..052d663d 100644
--- a/R/create_lkups.R
+++ b/R/create_lkups.R
@@ -175,6 +175,40 @@ create_lkups <- function(data_dir, versions) {
                                             collapse = "|"),
              by = .(interpolation_id)]
 
+  # ZP ADD - CREATE OBJECT: refy_lkup
+  #___________________________________________________________________________
+  refy_lkup_path <- fs::path(data_dir,
+                             "estimations/prod_svy_estimation.fst")
+  refy_lkup      <- fst::read_fst(refy_lkup_path,
+                                  as.data.table = TRUE)
+  refy_lkup      <- refy_lkup[cache_id %in% paths_ids]
+  refy_lkup[ ,
+             path := {
+               fs::path(data_dir,
+                        "lineup_data",
+                        paste0(country_code,
+                               "_",
+                               reporting_year),
+                        ext = "qs") |>
+                 as.character()
+               }
+             ]
+
+  if ("region_code" %in% names(svy_lkup)) {
+    refy_lkup[,
+              region_code := NULL]
+  }
+  refy_lkup <- joyn::joyn(x         = refy_lkup,
+                          y         = countries,
+                          by        = 'country_code',
+                          keep      = "left",
+                          reportvar = FALSE)
+
+
+
+  #___________________________________________________________________________
+
+
 
   # CREATE OBJECT: interpolation_list ----
   # This is to facilitate interpolation computations
@@ -521,6 +555,7 @@ create_lkups <- function(data_dir, versions) {
   lkup <- list(
     svy_lkup           = svy_lkup,
     ref_lkup           = ref_lkup,
+    refy_lkup          = refy_lkup,
     dist_stats         = dist_stats,
     pop_region         = pop_region,
     cp_lkups           = cp_lkups,

From 2a60af20c6e1ecf5e4d63b1ea5bbb8dd433cebb7 Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Tue, 27 May 2025 17:03:56 -0400
Subject: [PATCH 002/203] use unique version of refy df for refy

---
 R/create_lkups.R | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/R/create_lkups.R b/R/create_lkups.R
index 052d663d..60ca600e 100644
--- a/R/create_lkups.R
+++ b/R/create_lkups.R
@@ -178,10 +178,12 @@ create_lkups <- function(data_dir, versions) {
   # ZP ADD - CREATE OBJECT: refy_lkup
   #___________________________________________________________________________
   refy_lkup_path <- fs::path(data_dir,
-                             "estimations/prod_svy_estimation.fst")
+                             "estimations/prod_refy_estimation.fst")
+
+  # NOTE: THIS `prod_refy_estimation.fst` is the refy table
   refy_lkup      <- fst::read_fst(refy_lkup_path,
                                   as.data.table = TRUE)
-  refy_lkup      <- refy_lkup[cache_id %in% paths_ids]
+  print(dim(refy_lkup))
   refy_lkup[ ,
              path := {
                fs::path(data_dir,
@@ -194,17 +196,20 @@ create_lkups <- function(data_dir, versions) {
                }
              ]
 
-  if ("region_code" %in% names(svy_lkup)) {
+  print(dim(refy_lkup))
+  if ("region_code" %in% names(refy_lkup)) {
     refy_lkup[,
               region_code := NULL]
   }
+
   refy_lkup <- joyn::joyn(x         = refy_lkup,
                           y         = countries,
                           by        = 'country_code',
                           keep      = "left",
-                          reportvar = FALSE)
-
+                          reportvar = FALSE,
+                          match_type = "m:1")
 
+  print(dim(refy_lkup))
 
   #___________________________________________________________________________
 

From 1eeaa0fa61dd92f31eb8498d45dd9f90bd3f252f Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Wed, 11 Jun 2025 17:57:03 -0400
Subject: [PATCH 003/203] rm print

---
 R/create_lkups.R | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/R/create_lkups.R b/R/create_lkups.R
index 60ca600e..a1aa7e6c 100644
--- a/R/create_lkups.R
+++ b/R/create_lkups.R
@@ -180,10 +180,10 @@ create_lkups <- function(data_dir, versions) {
   refy_lkup_path <- fs::path(data_dir,
                              "estimations/prod_refy_estimation.fst")
 
-  # NOTE: THIS `prod_refy_estimation.fst` is the refy table
+  # NOTE: THIS `prod_refy_estimation.fst` is the refy table (not really)
   refy_lkup      <- fst::read_fst(refy_lkup_path,
                                   as.data.table = TRUE)
-  print(dim(refy_lkup))
+
   refy_lkup[ ,
              path := {
                fs::path(data_dir,
@@ -196,21 +196,18 @@ create_lkups <- function(data_dir, versions) {
                }
              ]
 
-  print(dim(refy_lkup))
   if ("region_code" %in% names(refy_lkup)) {
     refy_lkup[,
               region_code := NULL]
   }
 
-  refy_lkup <- joyn::joyn(x         = refy_lkup,
-                          y         = countries,
-                          by        = 'country_code',
-                          keep      = "left",
-                          reportvar = FALSE,
+  refy_lkup <- joyn::joyn(x          = refy_lkup,
+                          y          = countries,
+                          by         = 'country_code',
+                          keep       = "left",
+                          reportvar  = FALSE,
                           match_type = "m:1")
 
-  print(dim(refy_lkup))
-
   #___________________________________________________________________________
 
 
From 78ff6a5d04e4a3d0009e5c49be38bbec9d7664e1 Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Wed, 11 Jun 2025 18:01:32 -0400
Subject: [PATCH 004/203] testing fg_pip pipload additions

---
 R/fg_pip.R | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 95 insertions(+), 3 deletions(-)

diff --git a/R/fg_pip.R b/R/fg_pip.R
index 364d029e..2667e098 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -18,6 +18,7 @@ fg_pip <- function(country,
   interpolation_list  <- lkup$interpolation_list
   data_dir            <- lkup$data_root
   ref_lkup            <- lkup$ref_lkup
+  refy_lkup           <- lkup$refy_lkup
 
   cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb")
   # fg_pip is called from multiple places like pip, pip_grp_logic. We have connection object created
@@ -49,12 +50,103 @@ fg_pip <- function(country,
                           popshare = popshare)
   setDT(metadata)
 
-
   # Return empty dataframe if no metadata is found
-  if (nrow(metadata) == 0) {
-    return(list(main_data = pipapi::empty_response_fg, data_in_cache = data_present_in_master))
+  # if (nrow(metadata) == 0) {
+  #   return(list(main_data = pipapi::empty_response_fg, data_in_cache = data_present_in_master))
+  # }
+
+
+  # ZP Add: load refy data
+  #-------------------------
+  # Extract unique combinations of country-year
+  if (any(c("ALL", "WLD") %in% country)) {
+    cntry <- ref_lkup$country_code |>
+      unique()
+  } else {
+    cntry <- refy_lkup[country_code %in% country,
+                       .(country_code)] |>
+      funique()
+  }
+  if (any(c("ALL") %in% year)) {
+    yr <- ref_lkup$reporting_year |>
+      unique()
+  } else {
+    yr <- refy_lkup[reporting_year %in% year,
+                       .(reporting_year)] |>
+      funique()
   }
 
+  lt <-
+    pipload::load_list_refy(input_list = list(country_code = cntry,
+                                              year         = yr),
+                            path = fs::path(data_dir,
+                                            "lineup_data"))
+  lt <- lapply(lt,
+               FUN = \(x) {
+                  x <- x |>
+                   pipload::attr_to_column("reporting_level_rows") |> # only rep level????
+                   pipload::attr_to_column("country_code") |>
+                   pipload::attr_to_column("reporting_year") |>
+                   fmutate(file = paste0(country_code,
+                                         "_",
+                                         reporting_year))
+
+                  if ("welfare_refy" %in% names(x)) {
+                    setnames(x,
+                             old = c("welfare_refy",
+                                     "weight_refy"),
+                             new = c("welfare",
+                                     "weight"))
+                  }
+
+                  x
+               })
+
+  rlang::env_poke(env   = globalenv(),
+                  nm    = "pipload_list",
+                  value = lt)
+
+  # ZP Add: do fgt estimations using `res <- lapply(lt, process_dt, povline = povline)`
+  #-------------------------
+  res <- lapply(lt, process_dt, povline = povline)
+  res <- rbindlist(res, fill = TRUE)
+
+  # TO BE REMOVED, ONLY FOR TESTING!!!
+  rlang::env_poke(env   = globalenv(),
+                  nm    = "res_povest",
+                  value = res)
+
+  # ZP Add: join to metadata
+  #-------------------------
+  metadata[,
+           file := basename(path)]
+
+  out <- join(res,
+              metadata,
+              on       = c("file",
+                           "reporting_level"),
+              how      = "full",
+              validate = "m:1",
+              verbose  = 0)
+
+  out[, `:=`(
+    #mean   = survey_mean_ppp,
+    #median = survey_median_ppp,
+    file   = NULL
+  )]
+
+  setnames(out,
+           "povline",
+           "poverty_line")
+
+
+
+
+
+
+
+
+
   unique_survey_files <- unique(metadata$data_interpolation_id)
 
   # Interpolation list

From 0ee3477894f908b91f1066a4f3ab3452d38b1f1c Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Wed, 11 Jun 2025 18:39:26 -0400
Subject: [PATCH 005/203] add more error messages

---
 R/duckdb_func.R | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/R/duckdb_func.R b/R/duckdb_func.R
index 70779668..8dc4dc13 100644
--- a/R/duckdb_func.R
+++ b/R/duckdb_func.R
@@ -263,8 +263,11 @@ connect_with_retry <- function(db_path = NULL,
         message("Attempt ", attempt,
                " failed: ", conditionMessage(e))
       }
+      # if (attempt == max_attempts) {
+      #   stop("Failed to connect after ", max_attempts, " attempts.")
+      # }
       if (attempt == max_attempts) {
-        stop("Failed to connect after ", max_attempts, " attempts.")
+        stop("Failed to connect after ", max_attempts, " attempts.\nLast error: ", conditionMessage(e))
       }
       Sys.sleep(delay_sec)
       attempt <<- attempt + 1

From 33134eb11a25387f9c11388a55ae0946148cd001 Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Tue, 17 Jun 2025 11:55:48 -0400
Subject: [PATCH 006/203] temp store metadata

---
 R/fg_pip.R | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/R/fg_pip.R b/R/fg_pip.R
index 2288b6b3..cc5a7ecf 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -38,8 +38,7 @@ fg_pip <- function(country,
     data_dir        = data_dir,
     povline         = povline,
     cache_file_path = cache_file_path,
-    fill_gaps = TRUE
-  )
+    fill_gaps       = TRUE)
 
   data_present_in_master <- metadata$data_present_in_master
   povline  <- metadata$povline
@@ -51,9 +50,10 @@ fg_pip <- function(country,
   setDT(metadata)
 
   # Return empty dataframe if no metadata is found
-  # if (nrow(metadata) == 0) {
-  #   return(list(main_data = pipapi::empty_response_fg, data_in_cache = data_present_in_master))
-  # }
+  if (nrow(metadata) == 0) {
+    return(list(main_data     = pipapi::empty_response_fg,
+                data_in_cache = data_present_in_master))
+  }
 
 
   # ZP Add: load refy data
@@ -108,8 +108,11 @@ fg_pip <- function(country,
 
   # ZP Add: do fgt estimations using `res <- lapply(lt, process_dt, povline = povline)`
   #-------------------------
-  res <- lapply(lt, process_dt, povline = povline)
-  res <- rbindlist(res, fill = TRUE)
+  res <- lapply(lt,
+                process_dt,
+                povline = povline)
+  res <- rbindlist(res,
+                   fill = TRUE)
 
   # TO BE REMOVED, ONLY FOR TESTING!!!
   rlang::env_poke(env   = globalenv(),
@@ -120,6 +123,10 @@ fg_pip <- function(country,
   #-------------------------
   metadata[,
            file := basename(path)]
+  # TO BE REMOVED, ONLY FOR TESTING!!!
+  rlang::env_poke(env   = globalenv(),
+                  nm    = "metadata_check",
+                  value = metadata)
 
   out <- join(res,
               metadata,

From 105859b73f269ef17a897dd8dd0f7f404b0ed59e Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Tue, 17 Jun 2025 23:27:26 -0400
Subject: [PATCH 007/203] fix joining of metadata

---
 R/fg_pip.R | 162 ++++++++++++++---------------------------------------
 1 file changed, 42 insertions(+), 120 deletions(-)

diff --git a/R/fg_pip.R b/R/fg_pip.R
index cc5a7ecf..58858511 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -87,6 +87,10 @@ fg_pip <- function(country,
                    pipload::attr_to_column("reporting_level_rows") |> # only rep level????
                    pipload::attr_to_column("country_code") |>
                    pipload::attr_to_column("reporting_year") |>
+                   pipload::attr_to_column("mean",
+                                           dist_stats = TRUE) |>
+                   pipload::attr_to_column("median",
+                                           dist_stats = TRUE) |>
                    fmutate(file = paste0(country_code,
                                          "_",
                                          reporting_year))
@@ -110,7 +114,8 @@ fg_pip <- function(country,
   #-------------------------
   res <- lapply(lt,
                 process_dt,
-                povline = povline)
+                povline      = povline,
+                mean_and_med = TRUE)
   res <- rbindlist(res,
                    fill = TRUE)
 
@@ -127,9 +132,40 @@ fg_pip <- function(country,
   rlang::env_poke(env   = globalenv(),
                   nm    = "metadata_check",
                   value = metadata)
+  # try metadata unique code
+  tmp_metadata <- metadata
+  # Handle multiple distribution types (for aggregated distributions)
+  if (length(unique(tmp_metadata$distribution_type)) > 1) {
+    tmp_metadata[, distribution_type := "mixed"]
+  }
+  # convert survey_comparability to NA
+  # NOTE: This should not be necessary. for the new lineup distribution
+  # metadata should come without this variable.
+  tmp_metadata[, survey_comparability := NA]
+  # get all vars
+  meta_vars <- setdiff(names(tmp_metadata), "reporting_year")
+  # transform to NA when necessary - i.e. when interpolated (two rows per reporting_year)
+  tmp_metadata[, (meta_vars) := lapply(.SD, \(x) {
+    if (uniqueN(x) == 1) {
+      x
+    } else {
+      NA
+    }}),
+    by = reporting_year, .SDcols = meta_vars]
+
+  # Remove duplicate rows by reporting_year (keep only one row per
+  # reporting_year)
+  tmp_metadata_unique <- unique(tmp_metadata, by = "reporting_year")
+  tmp_metadata_unique[,
+                      file := paste0(country_code,
+                                     "_",
+                                     reporting_year)]
+  rlang::env_poke(env   = globalenv(),
+                  nm    = "tmp_metadata_unique_check",
+                  value = tmp_metadata_unique)
 
   out <- join(res,
-              metadata,
+              tmp_metadata_unique,
               on       = c("file",
                            "reporting_level"),
               how      = "full",
@@ -142,128 +178,12 @@ fg_pip <- function(country,
     file   = NULL
   )]
 
+
   setnames(out,
            "povline",
            "poverty_line")
 
 
-
-
-
-
-
-
-
-  unique_survey_files <- unique(metadata$data_interpolation_id)
-
-  # Interpolation list
-  interpolation_list <- interpolation_list[names(interpolation_list) %in% unique_survey_files]
-
-  # Unique set of survey data to be read
-  out <- vector(mode = "list", length = length(unique_survey_files))
-
-  #NEW: iterate over survey files
-  for (svy_id in seq_along(unique_survey_files)) {
-    # Extract country-years for which stats will be computed from the same files
-    # tmp_metadata <- interpolation_list[[unique_survey_files[svy_id]]]$tmp_metadata
-    iteration           <- interpolation_list[[unique_survey_files[svy_id]]]
-    svy_data <- get_svy_data(svy_id          = iteration$cache_ids,
-                             reporting_level = iteration$reporting_level,
-                             path            = iteration$paths)
-
-    # Extract unique combinations of country-year
-    ctry_years <- subset_ctry_years(country       = country,
-                                    year          = year,
-                                    lkup          = iteration$ctry_years,
-                                    valid_regions = valid_regions,
-                                    data_dir      = data_dir)
-
-    # Join because some data might be coming from cache so it might be absent in
-    # metadata
-    ctry_years <- collapse::join(ctry_years, metadata |>
-                                collapse::fselect(intersect(names(ctry_years),
-                                                            names(metadata))),
-                                verbose = 0,
-                                how = "inner",
-                                overid = 2)
-
-    results_subset <- vector(mode = "list", length = nrow(ctry_years))
-
-    for (ctry_year_id in seq_along(ctry_years$interpolation_id)) {
-      # Extract records to be used for a single country-year estimation
-      interp_id    <- ctry_years[["interpolation_id"]][ctry_year_id]
-      tmp_metadata <- metadata[metadata$interpolation_id == interp_id, ]
-
-      report_year <- ctry_years[["reporting_year"]][ctry_year_id]
-
-      # Compute estimated statistics using the fill_gap method
-      tmp_stats <- wbpip:::prod_fg_compute_pip_stats(
-        request_year           = report_year,
-        data                   = svy_data,
-        predicted_request_mean = tmp_metadata[["predicted_mean_ppp"]],
-        svy_mean_lcu           = tmp_metadata[["survey_mean_lcu"]],
-        svy_median_lcu         = tmp_metadata$survey_median_lcu,
-        svy_median_ppp         = tmp_metadata$survey_median_ppp,
-        survey_year            = tmp_metadata[["survey_year"]],
-        default_ppp            = tmp_metadata[["ppp"]],
-        ppp                    = ppp,
-        distribution_type      = tmp_metadata[["distribution_type"]],
-        poverty_line           = povline,
-        popshare               = popshare
-      )
-
-      # Handle multiple distribution types (for aggregated distributions)
-      if (length(unique(tmp_metadata$distribution_type)) > 1) {
-        tmp_metadata[, distribution_type := "mixed"]
-      }
-      #
-      # tmp_metadata <- unique(tmp_metadata)
-      # Add stats columns to data frame
-
-      # Convert Statas into Data.table
-      ts_DT <- as.data.table(tmp_stats)
-      # Add reporting year to merge
-      ts_DT[, reporting_year := report_year]
-
-      # convert survey_comparability to NA
-      # NOTE: This should not be necessary. for the new lineup distribution
-      # metadata should come without this variable.
-      tmp_metadata[, survey_comparability := NA]
-
-      # get all vars
-      meta_vars <- setdiff(names(tmp_metadata), "reporting_year")
-      # transform to NA when necessary
-      tmp_metadata[, (meta_vars) := lapply(.SD, \(x) {
-        if (uniqueN(x) == 1) {
-          x
-          } else {
-            NA
-            }}),
-        by = reporting_year, .SDcols = meta_vars]
-
-      # Remove duplicate rows by reporting_year (keep only one row per
-      # reporting_year)
-      tmp_metadata_unique <- unique(tmp_metadata, by = "reporting_year")
-
-      # Now join as usual
-
-      ts_md <- join(ts_DT,
-                    tmp_metadata_unique,
-                    on = "reporting_year",
-                    how = "left",
-                    verbose = 0,
-                    overid = 2)
-
-      results_subset[[ctry_year_id]] <- ts_md
-    }
-    out[[svy_id]] <- results_subset
-  }
-  out <- unlist(out, recursive = FALSE)
-  out <- data.table::rbindlist(out)
-
-  # Remove median
-  # out[, median := NULL]
-
   # Ensure that out does not have duplicates
   out <- fg_remove_duplicates(out)
 
@@ -278,7 +198,9 @@ fg_pip <- function(country,
     out[, max_year := NULL]
   }
 
-  return(list(main_data = out, data_in_cache = data_present_in_master))
+  return(list(main_data     = out,
+              data_in_cache = data_present_in_master))
+
 }
 
 #' Remove duplicated rows created during the interpolation process

From 928606e7c304fecacf93478a40bbd442f2f036c6 Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Tue, 17 Jun 2025 23:27:51 -0400
Subject: [PATCH 008/203] add mean and med to compute_fgt_dt

---
 R/rg_pip.R | 44 ++++++++++++++++++++++++++++++--------------
 1 file changed, 30 insertions(+), 14 deletions(-)

diff --git a/R/rg_pip.R b/R/rg_pip.R
index 5ff8552b..019fde2e 100644
--- a/R/rg_pip.R
+++ b/R/rg_pip.R
@@ -95,11 +95,11 @@ rg_pip <- function(country,
 #'
 #' @return data.table with estimates poverty estimates
 #' @keywords internal
-compute_fgt_dt <- function(dt, welfare, weight, povlines) {
-  w <- dt[[welfare]]
-  wt <- dt[[weight]]
-  n <- length(w)
-  m <- length(povlines)
+compute_fgt_dt <- function(dt, welfare, weight, povlines, mean_and_med = FALSE) {
+  w   <- dt[[welfare]]
+  wt  <- dt[[weight]]
+  n   <- length(w)
+  m   <- length(povlines)
 
   # Pre-allocate result matrix
   res <- matrix(NA_real_, nrow = m, ncol = 3)
@@ -128,18 +128,34 @@ compute_fgt_dt <- function(dt, welfare, weight, povlines) {
       watts_vec[i] <- 0
     }
   }
-  data.table(
-    povline = povlines,
-    headcount = res[, 1],
-    poverty_gap = res[, 2],
-    poverty_severity = res[, 3],
-    watts = watts_vec
-  )
+
+  if (mean_and_med) {
+    mn  <- funique(dt$mean)
+    med <- funique(dt$median)
+    out <- data.table(
+      povline          = povlines,
+      headcount        = res[, 1],
+      poverty_gap      = res[, 2],
+      poverty_severity = res[, 3],
+      watts            = watts_vec,
+      mean             = mn,
+      median           = med)
+  } else {
+    out <- data.table(
+      povline          = povlines,
+      headcount        = res[, 1],
+      poverty_gap      = res[, 2],
+      poverty_severity = res[, 3],
+      watts            = watts_vec)
+  }
+
+  out
+
 }
 
 
-process_dt <- function(dt, povline) {
-  dt[, compute_fgt_dt(.SD, "welfare", "weight", povline),
+process_dt <- function(dt, povline, mean_and_med = FALSE) {
+  dt[, compute_fgt_dt(.SD, "welfare", "weight", povline, mean_and_med),
      by = .(file, reporting_level)]
 }
 

From 74e4093bef45ff9a51595f42344ea586bc73b079 Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Wed, 18 Jun 2025 17:27:58 -0400
Subject: [PATCH 009/203] comment out env checks

---
 R/fg_pip.R | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/R/fg_pip.R b/R/fg_pip.R
index 58858511..36eec629 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -106,9 +106,9 @@ fg_pip <- function(country,
                   x
                })
 
-  rlang::env_poke(env   = globalenv(),
-                  nm    = "pipload_list",
-                  value = lt)
+  # rlang::env_poke(env   = globalenv(),
+  #                 nm    = "pipload_list",
+  #                 value = lt)
 
   # ZP Add: do fgt estimations using `res <- lapply(lt, process_dt, povline = povline)`
   #-------------------------
@@ -120,18 +120,18 @@ fg_pip <- function(country,
                    fill = TRUE)
 
   # TO BE REMOVED, ONLY FOR TESTING!!!
-  rlang::env_poke(env   = globalenv(),
-                  nm    = "res_povest",
-                  value = res)
+  # rlang::env_poke(env   = globalenv(),
+  #                 nm    = "res_povest",
+  #                 value = res)
 
   # ZP Add: join to metadata
   #-------------------------
   metadata[,
            file := basename(path)]
   # TO BE REMOVED, ONLY FOR TESTING!!!
-  rlang::env_poke(env   = globalenv(),
-                  nm    = "metadata_check",
-                  value = metadata)
+  # rlang::env_poke(env   = globalenv(),
+  #                 nm    = "metadata_check",
+  #                 value = metadata)
   # try metadata unique code
   tmp_metadata <- metadata
   # Handle multiple distribution types (for aggregated distributions)
@@ -160,9 +160,9 @@ fg_pip <- function(country,
                       file := paste0(country_code,
                                      "_",
                                      reporting_year)]
-  rlang::env_poke(env   = globalenv(),
-                  nm    = "tmp_metadata_unique_check",
-                  value = tmp_metadata_unique)
+  # rlang::env_poke(env   = globalenv(),
+  #                 nm    = "tmp_metadata_unique_check",
+  #                 value = tmp_metadata_unique)
 
   out <- join(res,
               tmp_metadata_unique,

From 4574c8f6fcf27e310d53d9035c1ca7f28de8b385 Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Mon, 23 Jun 2025 16:34:39 -0400
Subject: [PATCH 010/203] fix full_list for file ingestion

---
 R/fg_pip.R | 113 ++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 91 insertions(+), 22 deletions(-)

diff --git a/R/fg_pip.R b/R/fg_pip.R
index 36eec629..d4038d3d 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -17,8 +17,8 @@ fg_pip <- function(country,
   valid_regions       <- lkup$query_controls$region$values
   interpolation_list  <- lkup$interpolation_list
   data_dir            <- lkup$data_root
-  ref_lkup            <- lkup$ref_lkup
-  refy_lkup           <- lkup$refy_lkup
+  ref_lkup            <- lkup$ref_lkup # the normal refy table, some country-years have two rows (interpolation)
+  refy_lkup           <- lkup$refy_lkup # cleaned refy table, unique by country-years but some columns removed in order to do that
 
   cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb")
   # fg_pip is called from multiple places like pip, pip_grp_logic. We have connection object created
@@ -33,7 +33,7 @@ fg_pip <- function(country,
     year            = year,
     welfare_type    = welfare_type,
     reporting_level = reporting_level,
-    lkup            = ref_lkup,
+    lkup            = ref_lkup, # only place this is used, for 'interpolation_id'
     valid_regions   = valid_regions,
     data_dir        = data_dir,
     povline         = povline,
@@ -51,6 +51,7 @@ fg_pip <- function(country,
 
   # Return empty dataframe if no metadata is found
   if (nrow(metadata) == 0) {
+    print("no metadata")
     return(list(main_data     = pipapi::empty_response_fg,
                 data_in_cache = data_present_in_master))
   }
@@ -59,28 +60,90 @@ fg_pip <- function(country,
   # ZP Add: load refy data
   #-------------------------
   # Extract unique combinations of country-year
+  # if (any(c("ALL", "WLD") %in% country)) {
+  #   cntry <- refy_lkup$country_code |>
+  #     unique()
+  #   print("A")
+  #   #cntry[!cntry %in% c("SSD", "SVK", "TLS", "VEN", "XKX")] # to be removed
+  # } else {
+  #   cntry <- refy_lkup[country_code %in% country,
+  #                      .(country_code)] |>
+  #     funique()
+  #   print("B")
+  # }
+  # if (any(c("ALL") %in% year)) {
+  #   yr <- refy_lkup$reporting_year |>
+  #     unique()
+  #   print("C")
+  # } else {
+  #   yr <- refy_lkup[reporting_year %in% year,
+  #                      .(reporting_year)] |>
+  #     funique()
+  #   print("D")
+  # }
+  #
+  # print(as.vector(cntry))
+  # print(yr)
+  # lt <-
+  #   pipload::load_list_refy(input_list = list(country_code = cntry,
+  #                                             year         = yr),
+  #                           path = fs::path(data_dir,
+  #                                           "lineup_data"))
+
+
+  #'     # ZP Add: load refy data
+  #-------------------------
+  # Extract unique combinations of country-year
   if (any(c("ALL", "WLD") %in% country)) {
-    cntry <- ref_lkup$country_code |>
+    cntry <- refy_lkup$country_code |>
       unique()
+    print("A")
+    #cntry[!cntry %in% c("SSD", "SVK", "TLS", "VEN", "XKX")] # to be removed
   } else {
     cntry <- refy_lkup[country_code %in% country,
                        .(country_code)] |>
       funique()
+    print("B")
   }
   if (any(c("ALL") %in% year)) {
-    yr <- ref_lkup$reporting_year |>
+    yr <- refy_lkup$reporting_year |>
       unique()
+    print("C")
   } else {
     yr <- refy_lkup[reporting_year %in% year,
-                       .(reporting_year)] |>
+                    .(reporting_year)] |>
       funique()
+    print("D")
   }
-
+  dtemp <-
+    ref_lkup |>
+    fsubset(country_code     %in% cntry &
+              reporting_year %in% yr) |>
+    fselect(country_code,
+            year = reporting_year) |>
+    funique()
+
+  # Split years by country
+  full_list <- dtemp[, .(year = list(year)), by = country_code][
+    , .(country_code, year = year)
+  ]
+
+  # Convert to desired structure
+  full_list <- list(
+    country_code = full_list$country_code,
+    year = lapply(full_list$year, as.numeric)
+  )
+  #return(full_list)
+  print(as.vector(cntry))
+  print(yr)
   lt <-
-    pipload::load_list_refy(input_list = list(country_code = cntry,
-                                              year         = yr),
+    pipload::load_list_refy(input_list = full_list,
                             path = fs::path(data_dir,
                                             "lineup_data"))
+
+
+
+  print(names(lt))
   lt <- lapply(lt,
                FUN = \(x) {
                   x <- x |>
@@ -106,9 +169,9 @@ fg_pip <- function(country,
                   x
                })
 
-  # rlang::env_poke(env   = globalenv(),
-  #                 nm    = "pipload_list",
-  #                 value = lt)
+  rlang::env_poke(env   = globalenv(),
+                  nm    = "pipload_list",
+                  value = lt)
 
   # ZP Add: do fgt estimations using `res <- lapply(lt, process_dt, povline = povline)`
   #-------------------------
@@ -120,18 +183,18 @@ fg_pip <- function(country,
                    fill = TRUE)
 
   # TO BE REMOVED, ONLY FOR TESTING!!!
-  # rlang::env_poke(env   = globalenv(),
-  #                 nm    = "res_povest",
-  #                 value = res)
+  rlang::env_poke(env   = globalenv(),
+                  nm    = "res_povest",
+                  value = res)
 
   # ZP Add: join to metadata
   #-------------------------
   metadata[,
            file := basename(path)]
   # TO BE REMOVED, ONLY FOR TESTING!!!
-  # rlang::env_poke(env   = globalenv(),
-  #                 nm    = "metadata_check",
-  #                 value = metadata)
+  rlang::env_poke(env   = globalenv(),
+                  nm    = "metadata_check",
+                  value = metadata)
   # try metadata unique code
   tmp_metadata <- metadata
   # Handle multiple distribution types (for aggregated distributions)
@@ -160,9 +223,9 @@ fg_pip <- function(country,
                       file := paste0(country_code,
                                      "_",
                                      reporting_year)]
-  # rlang::env_poke(env   = globalenv(),
-  #                 nm    = "tmp_metadata_unique_check",
-  #                 value = tmp_metadata_unique)
+  rlang::env_poke(env   = globalenv(),
+                  nm    = "tmp_metadata_unique_check",
+                  value = tmp_metadata_unique)
 
   out <- join(res,
               tmp_metadata_unique,
@@ -203,6 +266,13 @@ fg_pip <- function(country,
 
 }
 
+
+# process_dt_fg <- function(dt, povline, mean_and_med = FALSE) {
+#   dt[, compute_fgt_dt(.SD, "welfare", "weight", povline, mean_and_med),
+#      by = .(file, reporting_level)]
+# }
+
+
 #' Remove duplicated rows created during the interpolation process
 #'
 #' @param df data.table: Table of results created in `fg_pip()`
@@ -210,7 +280,6 @@ fg_pip <- function(country,
 #'
 #' @return data.table
 #'
-
 fg_remove_duplicates <- function(df,
                                  cols = c("comparable_spell",
                                           "cpi",

From e92e3b835009a6d450e7f4073b72c5b61ec9ded0 Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Mon, 23 Jun 2025 16:35:23 -0400
Subject: [PATCH 011/203] allow processing to return mean, med, country, year

---
 R/rg_pip.R | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/R/rg_pip.R b/R/rg_pip.R
index 019fde2e..8b555753 100644
--- a/R/rg_pip.R
+++ b/R/rg_pip.R
@@ -132,6 +132,8 @@ compute_fgt_dt <- function(dt, welfare, weight, povlines, mean_and_med = FALSE)
   if (mean_and_med) {
     mn  <- funique(dt$mean)
     med <- funique(dt$median)
+    cy  <- funique(dt$coutnry_code)
+    ry  <- funique(dt$reporting_year)
     out <- data.table(
       povline          = povlines,
       headcount        = res[, 1],
@@ -139,7 +141,9 @@ compute_fgt_dt <- function(dt, welfare, weight, povlines, mean_and_med = FALSE)
       poverty_severity = res[, 3],
       watts            = watts_vec,
       mean             = mn,
-      median           = med)
+      median           = med,
+      country_code     = cy,
+      reporting_year   = ry)
   } else {
     out <- data.table(
       povline          = povlines,

From 67b1d8c1b14b5a9f157c3dd25e5999cc585ecc67 Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Tue, 5 Aug 2025 22:14:17 +0200
Subject: [PATCH 012/203] create old funcs

---
 R/fg_pip_old.R | 258 +++++++++++++++++++++++++++++++++++++++++++++
 R/pip_zp_old.R | 281 +++++++++++++++++++++++++++++++++++++++++++++++++
 R/rg_pip_old.R | 199 ++++++++++++++++++++++++++++++++++
 3 files changed, 738 insertions(+)
 create mode 100644 R/fg_pip_old.R
 create mode 100644 R/pip_zp_old.R
 create mode 100644 R/rg_pip_old.R

diff --git a/R/fg_pip_old.R b/R/fg_pip_old.R
new file mode 100644
index 00000000..b5fab88b
--- /dev/null
+++ b/R/fg_pip_old.R
@@ -0,0 +1,258 @@
+#' Compute imputed year stats
+#'
+#' Compute the main PIP poverty and inequality statistics for imputed years.
+#'
+#' @inheritParams pip
+#' @return data.frame
+#' @keywords internal
+fg_pip_old <- function(country,
+                   year,
+                   povline,
+                   popshare,
+                   welfare_type,
+                   reporting_level,
+                   ppp,
+                   lkup) {
+
+  valid_regions       <- lkup$query_controls$region$values
+  interpolation_list  <- lkup$interpolation_list
+  data_dir            <- lkup$data_root
+  ref_lkup            <- lkup$ref_lkup
+
+  cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb")
+  # fg_pip is called from multiple places like pip, pip_grp_logic. We have connection object created
+  # when calling from `pip`. For other functions we create it here.
+  # if (is.null(con)) {
+  #   cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb")
+  #   con <- duckdb::dbConnect(duckdb::duckdb(), dbdir = cache_file_path, read_only = TRUE)
+  # }
+  # Handle interpolation
+  metadata <- subset_lkup(
+    country         = country,
+    year            = year,
+    welfare_type    = welfare_type,
+    reporting_level = reporting_level,
+    lkup            = ref_lkup,
+    valid_regions   = valid_regions,
+    data_dir        = data_dir,
+    povline         = povline,
+    cache_file_path = cache_file_path,
+    fill_gaps = TRUE
+  )
+
+  data_present_in_master <- metadata$data_present_in_master
+  povline  <- metadata$povline
+  metadata <- metadata$lkup
+  # Remove aggregate distribution if popshare is specified
+  # TEMPORARY FIX UNTIL popshare is supported for aggregate distributions
+  metadata <- filter_lkup(metadata = metadata,
+                          popshare = popshare)
+  setDT(metadata)
+
+
+  # Return empty dataframe if no metadata is found
+  if (nrow(metadata) == 0) {
+    return(list(main_data = pipapi::empty_response_fg, data_in_cache = data_present_in_master))
+  }
+
+  unique_survey_files <- unique(metadata$data_interpolation_id)
+
+  # Interpolation list
+  interpolation_list <- interpolation_list[names(interpolation_list) %in% unique_survey_files]
+
+  # Unique set of survey data to be read
+  out <- vector(mode = "list", length = length(unique_survey_files))
+
+  #NEW: iterate over survey files
+  for (svy_id in seq_along(unique_survey_files)) {
+    # Extract country-years for which stats will be computed from the same files
+    # tmp_metadata <- interpolation_list[[unique_survey_files[svy_id]]]$tmp_metadata
+    iteration           <- interpolation_list[[unique_survey_files[svy_id]]]
+    svy_data <- get_svy_data(svy_id          = iteration$cache_ids,
+                             reporting_level = iteration$reporting_level,
+                             path            = iteration$paths)
+
+    # Extract unique combinations of country-year
+    ctry_years <- subset_ctry_years(country       = country,
+                                    year          = year,
+                                    lkup          = iteration$ctry_years,
+                                    valid_regions = valid_regions,
+                                    data_dir      = data_dir)
+
+    # Join because some data might be coming from cache so it might be absent in
+    # metadata
+    ctry_years <- collapse::join(ctry_years, metadata |>
+                                   collapse::fselect(intersect(names(ctry_years),
+                                                               names(metadata))),
+                                 verbose = 0,
+                                 how = "inner",
+                                 overid = 2)
+
+    results_subset <- vector(mode = "list", length = nrow(ctry_years))
+
+    for (ctry_year_id in seq_along(ctry_years$interpolation_id)) {
+      # Extract records to be used for a single country-year estimation
+      interp_id    <- ctry_years[["interpolation_id"]][ctry_year_id]
+      tmp_metadata <- metadata[metadata$interpolation_id == interp_id, ]
+
+      report_year <- ctry_years[["reporting_year"]][ctry_year_id]
+
+      # Compute estimated statistics using the fill_gap method
+      tmp_stats <- wbpip:::prod_fg_compute_pip_stats(
+        request_year           = report_year,
+        data                   = svy_data,
+        predicted_request_mean = tmp_metadata[["predicted_mean_ppp"]],
+        svy_mean_lcu           = tmp_metadata[["survey_mean_lcu"]],
+        svy_median_lcu         = tmp_metadata$survey_median_lcu,
+        svy_median_ppp         = tmp_metadata$survey_median_ppp,
+        survey_year            = tmp_metadata[["survey_year"]],
+        default_ppp            = tmp_metadata[["ppp"]],
+        ppp                    = ppp,
+        distribution_type      = tmp_metadata[["distribution_type"]],
+        poverty_line           = povline,
+        popshare               = popshare
+      )
+
+      # Handle multiple distribution types (for aggregated distributions)
+      if (length(unique(tmp_metadata$distribution_type)) > 1) {
+        tmp_metadata[, distribution_type := "mixed"]
+      }
+      #
+      # tmp_metadata <- unique(tmp_metadata)
+      # Add stats columns to data frame
+
+      # Convert Statas into Data.table
+      ts_DT <- as.data.table(tmp_stats)
+      # Add reporting year to merge
+      ts_DT[, reporting_year := report_year]
+
+      # convert survey_comparability to NA
+      # NOTE: This should not be necessary. for the new lineup distribution
+      # metadata should come without this variable.
+      tmp_metadata[, survey_comparability := NA]
+
+      # get all vars
+      meta_vars <- setdiff(names(tmp_metadata), "reporting_year")
+      # transform to NA when necessary
+      tmp_metadata[, (meta_vars) := lapply(.SD, \(x) {
+        if (uniqueN(x) == 1) {
+          x
+        } else {
+          NA
+        }}),
+        by = reporting_year, .SDcols = meta_vars]
+
+      # Remove duplicate rows by reporting_year (keep only one row per
+      # reporting_year)
+      tmp_metadata_unique <- unique(tmp_metadata, by = "reporting_year")
+
+      # Now join as usual
+
+      ts_md <- join(ts_DT,
+                    tmp_metadata_unique,
+                    on = "reporting_year",
+                    how = "left",
+                    verbose = 0,
+                    overid = 2)
+
+      results_subset[[ctry_year_id]] <- ts_md
+    }
+    out[[svy_id]] <- results_subset
+  }
+  out <- unlist(out, recursive = FALSE)
+  out <- data.table::rbindlist(out)
+
+  # Remove median
+  # out[, median := NULL]
+
+  # Ensure that out does not have duplicates
+  out <- fg_remove_duplicates_old(out)
+
+  # Fix issue with rounding of poverty lines
+  out[,
+      poverty_line := round(poverty_line, digits = 3) ]
+
+  # Formatting. MUST be done in data.table tom modify by reference
+  out[, path := as.character(path)]
+
+  if ("max_year" %in% names(out)) {
+    out[, max_year := NULL]
+  }
+
+  return(list(main_data = out, data_in_cache = data_present_in_master))
+}
+
+#' OLD: Remove duplicated rows created during the interpolation process
+#'
+#' @param df data.table: Table of results created in `fg_pip()`
+#' @param cols character: Columns with potential duplicate values
+#'
+#' @return data.table
+#'
+fg_remove_duplicates_old <- function(df,
+                                 cols = c("comparable_spell",
+                                          "cpi",
+                                          "display_cp",
+                                          "gd_type",
+                                          # "interpolation_id",
+                                          "path",
+                                          "predicted_mean_ppp",
+                                          "survey_acronym",
+                                          "survey_comparability",
+                                          "survey_coverage",
+                                          "survey_id",
+                                          "survey_mean_lcu",
+                                          "survey_mean_ppp",
+                                          "survey_median_lcu",
+                                          "survey_median_ppp",
+                                          "survey_time",
+                                          "survey_year",
+                                          "surveyid_year")) {
+  # Modify cache_id
+  # * Ensures that cache_id is unique for both extrapolated and interpolated surveys
+  # * Ensures that cache_id can be kept as an output of fg_pip() while still removing duplicated rows
+  df$cache_id <- fg_standardize_cache_id_old(cache_id = df$cache_id,
+                                         interpolation_id = df$data_interpolation_id,
+                                         reporting_level = df$reporting_level)
+  # Set collapse vars to NA (by type)
+  df <- fg_assign_nas_values_to_dup_cols_old(df = df,
+                                         cols = cols)
+
+  # Ensure that out does not have duplicates
+  df <- unique(df)
+
+  return(df)
+}
+
+#' OLD: Standardize cache_id format to avoid duplication of rows
+#'
+#' @param cache_id character
+#' @param interpolation_id character
+#' @param reporting_level character
+#'
+#' @return character
+fg_standardize_cache_id_old <- function(cache_id,
+                                    interpolation_id,
+                                    reporting_level) {
+
+  out <- ifelse(grepl("|", interpolation_id, fixed = TRUE),
+                gsub(paste0("_",
+                            unique(reporting_level),
+                            collapse = '|'),
+                     '',
+                     interpolation_id),
+                cache_id)
+  return(out)
+}
+
+#' OLD: Coerce variable causing potential duplicates to NAs
+#'
+#' @inheritParams fg_remove_duplicates_old
+#'
+#' @return data.table
+fg_assign_nas_values_to_dup_cols_old <- function(df,
+                                             cols) {
+  #Classes are maintained by default.
+  df[, (cols) := NA]
+  return(df)
+}
diff --git a/R/pip_zp_old.R b/R/pip_zp_old.R
new file mode 100644
index 00000000..2278ccce
--- /dev/null
+++ b/R/pip_zp_old.R
@@ -0,0 +1,281 @@
+#' Compute PIP statistics
+#'
+#' Compute the main PIP poverty and inequality statistics.
+#'
+#' @param country character: Country ISO 3 codes
+#' @param year integer: Reporting year
+#' @param povline numeric: Poverty line
+#' @param popshare numeric: Proportion of the population living below the
+#'   poverty line
+#' @param fill_gaps logical: If set to TRUE, will interpolate / extrapolate
+#'   values for missing years
+#' @param group_by character: Will return aggregated values for predefined
+#'   sub-groups
+#' @param welfare_type character: Welfare type
+#' @param reporting_level character: Geographical reporting level
+#' @param ppp numeric: Custom Purchase Power Parity value
+#' @param lkup list: A list of lkup tables
+#' @param censor logical: Triggers censoring of country/year statistics
+#' @param lkup_hash character: hash of pip
+#' @param additional_ind logical: If TRUE add new set of indicators. Default if
+#'   FALSE
+#'
+#' @return data.table
+#' @examples
+#' \dontrun{
+#' # Create lkups
+#' lkups <- create_lkups("<data-folder>")
+#'
+#' # A single country and year
+#' pip(country = "AGO",
+#'     year = 2000,
+#'     povline = 1.9,
+#'     lkup = lkups)
+#'
+#' # All years for a single country
+#' pip(country = "AGO",
+#'     year = "all",
+#'     povline = 1.9,
+#'     lkup = lkups)
+#'
+#' # Fill gaps
+#' pip(country = "AGO",
+#'     year = "all",
+#'     povline = 1.9,
+#'     fill_gaps = TRUE,
+#'     lkup = lkups)
+#'
+#' # Group by regions
+#' pip(country = "all",
+#'     year = "all",
+#'     povline = 1.9,
+#'     group_by = "wb",
+#'     lkup = lkups)
+#' }
+#' @export
+pip_zp_old <- function(country         = "ALL",
+                year            = "ALL",
+                povline         = 1.9,
+                popshare        = NULL,
+                fill_gaps       = FALSE,
+                group_by        = c("none", "wb"),
+                welfare_type    = c("all", "consumption", "income"),
+                reporting_level = c("all", "national", "rural", "urban"),
+                ppp             = NULL,
+                lkup,
+                censor          = TRUE,
+                lkup_hash       = lkup$cache_data_id$hash_pip,
+                additional_ind  = FALSE) {
+
+
+  # set up -------------
+  welfare_type    <- match.arg(welfare_type)
+  reporting_level <- match.arg(reporting_level)
+  group_by        <- match.arg(group_by)
+  povline         <- round(povline, digits = 3)
+
+
+
+  # TEMPORARY UNTIL SELECTION MECHANISM IS BEING IMPROVED
+  country <- toupper(country)
+  if (is.character(year)) {
+    year <- toupper(year)
+  }
+
+  # If svy_lkup is not part of lkup throw an error.
+  if (!all(c('svy_lkup') %in% names(lkup)))
+    stop("You are probably passing more than one dataset as lkup argument.
+  Try passing a single one by subsetting it lkup <- lkups$versions_paths$dataset_name_PROD")
+
+
+  # **** TO BE REMOVED **** REMOVAL STARTS HERE
+  # Once `pip-grp` has been integrated in ingestion pipeline
+  # Forces fill_gaps to TRUE when using group_by option
+  if (group_by != "none") {
+    fill_gaps <- TRUE
+    message("Info: argument group_by in pip() is deprecated; please use pip_grp() instead.")
+  }
+  # **** TO BE REMOVED **** REMOVAL ENDS HERE
+
+  # Countries vector ------------
+  lcv <- # List with countries vectors
+    create_countries_vctr(
+      country         =  country,
+      year            =  year,
+      valid_years     =  lkup$valid_years,
+      aux_files       =  lkup$aux_files
+    )
+  # lcv$est_ctrs has all the country_code that we are interested in
+
+  cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb")
+  if (!file.exists(cache_file_path)) {
+    # Create an empty duckdb file
+    create_duckdb_file(cache_file_path)
+  }
+  # mains estimates ---------------
+  if (fill_gaps) {
+    ## lineup years-----------------
+    out <- fg_pip_old(
+      country            = lcv$est_ctrs,
+      year               = year,
+      povline            = povline,
+      popshare           = popshare,
+      welfare_type       = welfare_type,
+      reporting_level    = reporting_level,
+      ppp                = ppp,
+      lkup               = lkup
+    )
+  } else {
+    ## survey years ------------------
+    out <- rg_pip_old(
+      country         = lcv$est_ctrs,
+      year            = year,
+      povline         = povline,
+      popshare        = popshare,
+      welfare_type    = welfare_type,
+      reporting_level = reporting_level,
+      ppp             = ppp,
+      lkup            = lkup
+    )
+  }
+
+  cached_data <- out$data_in_cache
+  main_data <- out$main_data
+
+  if (nrow(main_data) > 0) {
+    out <- main_data |>
+      rowbind(cached_data)
+
+    update_master_file(main_data, cache_file_path, fill_gaps)
+
+  } else {
+    out <- cached_data
+  }
+  if (!data.table::is.data.table(out)) {
+    setDT(out)
+  }
+  # Early return for empty table---------------
+  if (nrow(out) == 0) return(pipapi::empty_response)
+
+  # aggregate distributions ------------------
+  if (reporting_level %in% c("national", "all")) {
+    out <- add_agg_stats(
+      df = out,
+      return_cols = lkup$return_cols$ag_average_poverty_stats
+    )
+    if (reporting_level == "national") {
+      out <- out[reporting_level == "national"]
+    }
+  }
+
+
+
+  add_vars_out_of_pipeline(out, fill_gaps = fill_gaps, lkup = lkup)
+
+  # **** TO BE REMOVED **** REMOVAL STARTS HERE
+  # Once `pip-grp` has been integrated in ingestion pipeline
+  # Handles grouped aggregations
+  if (group_by != "none") {
+    # Handle potential (insignificant) difference in poverty_line values that
+    # may mess-up the grouping
+    out$poverty_line <- povline
+
+    out <- pip_aggregate_by(
+      df          = out,
+      group_lkup  = lkup[["pop_region"]],
+      return_cols = lkup$return_cols$pip_grp
+    )
+    # Censor regional values
+    if (censor) {
+      out <- censor_rows(out, lkup[["censored"]], type = "regions")
+    }
+
+    out <- out[, c("region_name",
+                   "region_code",
+                   "reporting_year",
+                   "reporting_pop",
+                   "poverty_line",
+                   "headcount",
+                   "poverty_gap",
+                   "poverty_severity",
+                   "watts",
+                   "mean",
+                   "pop_in_poverty")]
+
+    return(out)
+  }
+  # **** TO BE REMOVED **** REMOVAL ENDS HERE
+
+
+  # pre-computed distributional stats ---------------
+  crr_names  <- names(out)    # current variables
+  names2keep <- lkup$return_cols$pip$cols # all variables
+
+  out <- add_dist_stats(
+    df = out,
+    dist_stats = lkup[["dist_stats"]]
+  )
+
+  # Add aggregate medians ----------------
+  out <- add_agg_medians(
+    df        = out,
+    fill_gaps = fill_gaps,
+    data_dir  = lkup$data_root
+  )
+
+  # format ----------------
+
+
+  if (fill_gaps) {
+
+    ## Inequality indicators to NA for lineup years ----
+    dist_vars  <- names2keep[!(names2keep %in% crr_names)]
+    out[,
+        (dist_vars) := NA_real_]
+
+    ## estimate_var -----
+    out <- estimate_type_ctr_lnp(out, lkup)
+
+  } else {
+    out[, estimate_type := NA_character_]
+  }
+  ## Handle survey coverage ------------
+  if (reporting_level != "all") {
+    keep <- out$reporting_level == reporting_level
+    out <- out[keep, ]
+  }
+
+  # Censor country values
+  if (censor) {
+    out <- censor_rows(out, lkup[["censored"]], type = "countries")
+  }
+
+
+  # Select columns
+  if (additional_ind) {
+    get_additional_indicators(out)
+    added_names <- attr(out, "new_indicators_names")
+    names2keep  <- c(names2keep, added_names)
+
+  }
+  # Keep relevant variables
+  out  <- out[, .SD, .SDcols = names2keep]
+
+
+  # make sure we always report the same precision in all numeric variables
+  doub_vars <-
+    names(out)[unlist(lapply(out, is.double))] |>
+    data.table::copy()
+
+  out[, (doub_vars) := lapply(.SD, round, digits = 12),
+      .SDcols = doub_vars]
+
+  # Order rows by country code and reporting year
+  data.table::setorder(out, country_code, reporting_year, reporting_level, welfare_type)
+  #}
+
+  # Make sure no duplicate remains
+  out <- out |> collapse::funique()
+  # return -------------
+  return(out)
+}
diff --git a/R/rg_pip_old.R b/R/rg_pip_old.R
new file mode 100644
index 00000000..e5f71be3
--- /dev/null
+++ b/R/rg_pip_old.R
@@ -0,0 +1,199 @@
+#' OLD: Compute survey year stats
+#'
+#' Compute the main PIP poverty and inequality statistics for survey years.
+#'
+#' @inheritParams pip
+#' @return data.frame
+#' @keywords internal
+rg_pip_old <- function(country,
+                   year,
+                   povline,
+                   popshare,
+                   welfare_type,
+                   reporting_level,
+                   ppp,
+                   lkup) {
+  # get values from lkup
+  valid_regions <- lkup$query_controls$region$values
+  svy_lkup      <- lkup$svy_lkup
+  data_dir      <- lkup$data_root
+
+  cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb")
+
+  metadata <- subset_lkup(
+    country         = country,
+    year            = year,
+    welfare_type    = welfare_type,
+    reporting_level = reporting_level,
+    lkup            = svy_lkup,
+    valid_regions   = valid_regions,
+    data_dir        = data_dir,
+    povline         = povline,
+    cache_file_path = cache_file_path,
+    fill_gaps       = FALSE
+  )
+
+  data_present_in_master <- metadata$data_present_in_master
+  povline  <- metadata$povline
+  metadata  <- metadata$lkup
+
+
+  # Remove aggregate distribution if popshare is specified
+  # TEMPORARY FIX UNTIL popshare is supported for aggregate distributions
+  metadata <- filter_lkup(metadata = metadata,
+                          popshare = popshare)
+
+  # return empty dataframe if no metadata is found
+  if (nrow(metadata) == 0) {
+    return(list(main_data = pipapi::empty_response,
+                data_in_cache = data_present_in_master))
+  }
+
+  # load data
+  lt <- load_data_list_old(metadata)
+
+  # parallelization
+  # res <- get_pov_estimates(lt, povline = povline)
+
+  # Regular lapply
+  res <- lapply(lt, process_dt_old, povline = povline)
+
+  res <- rbindlist(res, fill = TRUE)
+
+
+  # clean data
+  metadata[, file := basename(path)]
+
+  out <- join(res,
+              metadata,
+              on = c("file", "reporting_level"),
+              how = "full",
+              validate = "m:1",
+              verbose = 0)
+
+  out[, `:=`(
+    mean = survey_mean_ppp,
+    median = survey_median_ppp,
+    file = NULL
+  )]
+
+  setnames(out, "povline", "poverty_line")
+
+
+  return(list(main_data = out, data_in_cache = data_present_in_master))
+}
+
+
+
+# OLD: Efficient FGT calculation for a data.table and vector of poverty lines
+#' Title
+#'
+#' @param dt data frame with `welfare` and `weight` columns
+#' @param welfare character: welfare variable name
+#' @param weight character: weight variable name
+#' @param povlines double: vector with poveryt lines
+#'
+#' @return data.table with estimates poverty estimates
+#' @keywords internal
+compute_fgt_dt_old <- function(dt, welfare, weight, povlines) {
+  w <- dt[[welfare]]
+  wt <- dt[[weight]]
+  n <- length(w)
+  m <- length(povlines)
+
+  # Pre-allocate result matrix
+  res <- matrix(NA_real_, nrow = m, ncol = 3)
+  colnames(res) <- c("FGT0", "FGT1", "FGT2")
+  watts_vec <- numeric(m)
+
+  # Precompute log(w) for efficiency
+  logw <- rep(NA_real_, n)
+  pos <- w > 0
+  logw[pos] <- log(w[pos])
+
+  for (i in seq_along(povlines)) {
+    pov <- povlines[i]
+    poor <- w < pov
+    rel_dist <- 1 - (w / pov)
+    rel_dist[!poor] <- 0
+    res[i, 1] <- fmean(poor, w = wt) # FGT0
+    res[i, 2] <- fmean(rel_dist, w = wt) # FGT1
+    res[i, 3] <- fmean(rel_dist^2, w = wt) # FGT2
+
+    # Optimized Watts index calculation
+    keep <- poor & pos
+    if (any(keep, na.rm = TRUE)) {
+      watts_vec[i] <- (fsum((log(pov) - logw[keep]) * wt[keep])) / fsum(wt)
+    } else {
+      watts_vec[i] <- 0
+    }
+  }
+  data.table(
+    povline = povlines,
+    headcount = res[, 1],
+    poverty_gap = res[, 2],
+    poverty_severity = res[, 3],
+    watts = watts_vec
+  )
+}
+
+
+process_dt_old <- function(dt, povline) {
+  dt[, compute_fgt_dt_old(.SD, "welfare", "weight", povline),
+     by = .(file, reporting_level)]
+}
+
+#' OLD: load survey year files and store them in a list
+#'
+#' @param metadata data frame from `subset_lkup()`
+#'
+#' @return list with survey years data
+#' @keywords internal
+load_data_list_old <- \(metadata) {
+
+  # unique values
+  mdout      <- metadata[, lapply(.SD, list), by = path]
+  upaths     <- mdout$path
+  urep_level <- mdout$reporting_level
+  uppp       <- mdout$ppp
+  ucpi       <- mdout$cpi
+
+  seq_along(upaths) |>
+    lapply(\(f) {
+      path      <- upaths[f]
+      rep_level <- urep_level[f][[1]]
+      ppp       <- uppp[f][[1]]
+      cpi       <- ucpi[f][[1]]
+
+      # Build a data.table to merge cpi and ppp
+      fdt <- data.table(reporting_level = as.character(rep_level),
+                        ppp             = ppp,
+                        cpi             = cpi)
+
+      # load data and format
+      dt <-  fst::read_fst(path, as.data.table = TRUE)
+
+      if (length(rep_level) == 1) {
+        if (rep_level == "national") dt[, area := "national"]
+      }
+      setnames(dt, "area", "reporting_level")
+      dt[,
+         `:=`(
+           file = basename(path),
+           reporting_level = as.character(reporting_level)
+         )
+      ]
+
+      dt <- join(dt, fdt,
+                 on = "reporting_level",
+                 validate = "m:1",
+                 how = "left",
+                 verbose = 0)
+
+      dt[, welfare := welfare/(cpi * ppp)
+      ][,
+        c("cpi", "ppp") := NULL]
+
+    })
+
+}

From dd4a8b7905412332308753aa723abde27b71b071 Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Wed, 6 Aug 2025 09:37:21 -0400
Subject: [PATCH 013/203] master_file tryCatch for duckdb con error

---
 R/duckdb_func.R | 29 ++++++++++++++++++++++++-----
 1 file changed, 24 insertions(+), 5 deletions(-)

diff --git a/R/duckdb_func.R b/R/duckdb_func.R
index 8dc4dc13..d108fda0 100644
--- a/R/duckdb_func.R
+++ b/R/duckdb_func.R
@@ -22,8 +22,20 @@ return_if_exists <- function(slkup,
                 lkup = slkup,
                 povline = povline))
   }
-  master_file <- load_inter_cache(cache_file_path = cache_file_path,
-                                  fill_gaps = fill_gaps)
+
+  # ZP new temp code to avoid error from load_inter_cache due to dbConnect
+  master_file <- tryCatch(
+    load_inter_cache(cache_file_path = cache_file_path,
+                     fill_gaps       = fill_gaps),
+    error = function(e) {
+      cli::cli_warn("Failed to load intermediate cache: {e$message}")
+      master_file <- slkup[0]  # zero-row data.table with same columns as lkup
+    }
+  )
+
+  # ZP old code:
+  # master_file <- load_inter_cache(cache_file_path = cache_file_path,
+  #                                 fill_gaps = fill_gaps)
 
   if (fnrow(master_file) == 0) {
     return(list(data_present_in_master = NULL,
@@ -37,6 +49,9 @@ return_if_exists <- function(slkup,
     # convert survey_comparability to NA
     # NOTE: This should not be necessary. for the new lineup distribution
     # metadata should come without this variable.
+
+    # ZP comment: if using refy_lkup, this should be removed because
+    #             it does not include survey_comparability
     slkup[, survey_comparability := NA]
 
 
@@ -47,9 +62,13 @@ return_if_exists <- function(slkup,
 
 
   # This is probably unnecesary
+  # ZP comment: in my quick checks this has no impact, meaning
+  #             slkup is already unique
+  # ZP Question: is this to get rid of duplicates from df_refy???
   lkup_kvars <- slkup |>
     copy() |>
     funique() # this is not big.
+
   # get all vars
   slkup_vars <- setdiff(names(slkup), key_vars)
   # transform to NA when necessary
@@ -75,15 +94,15 @@ return_if_exists <- function(slkup,
   # lkup_kvars_pov <- lkup_kvars[, .(poverty_line = povline),
   #                              by = eval(names(lkup_kvars))]
   lkup_kvars_pov <- lkup_kvars[rep(seq_len(nrow(lkup_kvars)),
-                                   each = length(povline))]
+                                   each = length(povline))] # ZP: add povline
   lkup_kvars_pov[, poverty_line := rep(povline, times = nrow(lkup_kvars))]
 
 
   # Find which (key_vars, poverty_line) are present in master_file
   lk_not_ms <- join(x = lkup_kvars_pov,
-                    y = master_file,
+                    y = master_file, # ZP: remember, master_file is full cache file
                     on = key_vars_pl,
-                    how = "anti",
+                    how = "anti", # rows in lkup not in master_file to know what new to do
                     # validate = "1:1",
                     overid = 2,
                     verbose = 0,

From 633fc8bc1fcef66f5f9484295e4f64af3abef246 Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Wed, 6 Aug 2025 10:25:43 -0400
Subject: [PATCH 014/203] make the NA of survey_comparability if fg type double
 for consistency

---
 R/duckdb_func.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/duckdb_func.R b/R/duckdb_func.R
index d108fda0..135fb858 100644
--- a/R/duckdb_func.R
+++ b/R/duckdb_func.R
@@ -52,7 +52,7 @@ return_if_exists <- function(slkup,
 
     # ZP comment: if using refy_lkup, this should be removed because
     #             it does not include survey_comparability
-    slkup[, survey_comparability := NA]
+    slkup[, survey_comparability := NA_real_]
 
 
   } else {

From a0357c112164d7ab7400b0bf9215e4c9d58247f9 Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Wed, 6 Aug 2025 10:26:02 -0400
Subject: [PATCH 015/203] add interpolation id for df_refy

---
 R/create_lkups.R | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/R/create_lkups.R b/R/create_lkups.R
index a1aa7e6c..8e7f797a 100644
--- a/R/create_lkups.R
+++ b/R/create_lkups.R
@@ -195,6 +195,11 @@ create_lkups <- function(data_dir, versions) {
                  as.character()
                }
              ]
+  refy_lkup[,
+            interpolation_id := paste(country_code,
+                                      reporting_year,
+                                      reporting_level,
+                                      sep = "_")]
 
   if ("region_code" %in% names(refy_lkup)) {
     refy_lkup[,

From 3fb45a263014ab697e4c775716228e562639fdd7 Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Wed, 6 Aug 2025 11:22:14 -0400
Subject: [PATCH 016/203] rm _refy from lineup columns

---
 R/fg_pip.R | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/R/fg_pip.R b/R/fg_pip.R
index d4038d3d..74dcebe2 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -17,7 +17,7 @@ fg_pip <- function(country,
   valid_regions       <- lkup$query_controls$region$values
   interpolation_list  <- lkup$interpolation_list
   data_dir            <- lkup$data_root
-  ref_lkup            <- lkup$ref_lkup # the normal refy table, some country-years have two rows (interpolation)
+  ref_lkup            <- lkup$ref_lkup  # the normal refy table, some country-years have two rows (interpolation)
   refy_lkup           <- lkup$refy_lkup # cleaned refy table, unique by country-years but some columns removed in order to do that
 
   cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb")
@@ -27,19 +27,21 @@ fg_pip <- function(country,
   #   cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb")
   #   con <- duckdb::dbConnect(duckdb::duckdb(), dbdir = cache_file_path, read_only = TRUE)
   # }
+
   # Handle interpolation
   metadata <- subset_lkup(
     country         = country,
     year            = year,
     welfare_type    = welfare_type,
     reporting_level = reporting_level,
-    lkup            = ref_lkup, # only place this is used, for 'interpolation_id'
+    lkup            = refy_lkup, # only place this is used, for 'interpolation_id'
     valid_regions   = valid_regions,
     data_dir        = data_dir,
     povline         = povline,
     cache_file_path = cache_file_path,
     fill_gaps       = TRUE)
 
+
   data_present_in_master <- metadata$data_present_in_master
   povline  <- metadata$povline
   metadata <- metadata$lkup
@@ -51,7 +53,7 @@ fg_pip <- function(country,
 
   # Return empty dataframe if no metadata is found
   if (nrow(metadata) == 0) {
-    print("no metadata")
+    print("ZP: no metadata - i.e. nothing additional to estimate")
     return(list(main_data     = pipapi::empty_response_fg,
                 data_in_cache = data_present_in_master))
   }
@@ -158,14 +160,6 @@ fg_pip <- function(country,
                                          "_",
                                          reporting_year))
 
-                  if ("welfare_refy" %in% names(x)) {
-                    setnames(x,
-                             old = c("welfare_refy",
-                                     "weight_refy"),
-                             new = c("welfare",
-                                     "weight"))
-                  }
-
                   x
                })
 

From 85ba9247a5ab482f81d2083de20a5b4f4b1b1d7a Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Thu, 7 Aug 2025 05:14:11 -0400
Subject: [PATCH 017/203] standardize format of country and year selection as
 vector

---
 R/fg_pip.R | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/R/fg_pip.R b/R/fg_pip.R
index 74dcebe2..16ce4c35 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -98,12 +98,11 @@ fg_pip <- function(country,
   # Extract unique combinations of country-year
   if (any(c("ALL", "WLD") %in% country)) {
     cntry <- refy_lkup$country_code |>
-      unique()
+      funique()
     print("A")
-    #cntry[!cntry %in% c("SSD", "SVK", "TLS", "VEN", "XKX")] # to be removed
   } else {
     cntry <- refy_lkup[country_code %in% country,
-                       .(country_code)] |>
+                       ]$country_code |>
       funique()
     print("B")
   }
@@ -113,7 +112,7 @@ fg_pip <- function(country,
     print("C")
   } else {
     yr <- refy_lkup[reporting_year %in% year,
-                    .(reporting_year)] |>
+                    ]$reporting_year |>
       funique()
     print("D")
   }

From 36945016d700f00912afbc72d1f60355a7f234c6 Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Thu, 7 Aug 2025 05:19:07 -0400
Subject: [PATCH 018/203] replace pipload with pipdata

---
 R/fg_pip.R | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/R/fg_pip.R b/R/fg_pip.R
index 16ce4c35..ebe2facc 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -87,7 +87,7 @@ fg_pip <- function(country,
   # print(as.vector(cntry))
   # print(yr)
   # lt <-
-  #   pipload::load_list_refy(input_list = list(country_code = cntry,
+  #   pipdata::load_list_refy(input_list = list(country_code = cntry,
   #                                             year         = yr),
   #                           path = fs::path(data_dir,
   #                                           "lineup_data"))
@@ -138,7 +138,7 @@ fg_pip <- function(country,
   print(as.vector(cntry))
   print(yr)
   lt <-
-    pipload::load_list_refy(input_list = full_list,
+    pipdata::load_list_refy(input_list = full_list,
                             path = fs::path(data_dir,
                                             "lineup_data"))
 
@@ -148,12 +148,12 @@ fg_pip <- function(country,
   lt <- lapply(lt,
                FUN = \(x) {
                   x <- x |>
-                   pipload::attr_to_column("reporting_level_rows") |> # only rep level????
-                   pipload::attr_to_column("country_code") |>
-                   pipload::attr_to_column("reporting_year") |>
-                   pipload::attr_to_column("mean",
+                   pipdata::attr_to_column("reporting_level_rows") |> # only rep level????
+                   pipdata::attr_to_column("country_code") |>
+                   pipdata::attr_to_column("reporting_year") |>
+                   pipdata::attr_to_column("mean",
                                            dist_stats = TRUE) |>
-                   pipload::attr_to_column("median",
+                   pipdata::attr_to_column("median",
                                            dist_stats = TRUE) |>
                    fmutate(file = paste0(country_code,
                                          "_",
@@ -163,7 +163,7 @@ fg_pip <- function(country,
                })
 
   rlang::env_poke(env   = globalenv(),
-                  nm    = "pipload_list",
+                  nm    = "pipdata_list",
                   value = lt)
 
   # ZP Add: do fgt estimations using `res <- lapply(lt, process_dt, povline = povline)`

From a7da9071a96f1b48c77755d81f4a84ee56480ae9 Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Thu, 7 Aug 2025 09:05:19 -0400
Subject: [PATCH 019/203] add valid lineup years to lkup

---
 R/create_lkups.R | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/R/create_lkups.R b/R/create_lkups.R
index 8e7f797a..02ed2b78 100644
--- a/R/create_lkups.R
+++ b/R/create_lkups.R
@@ -213,8 +213,19 @@ create_lkups <- function(data_dir, versions) {
                           reportvar  = FALSE,
                           match_type = "m:1")
 
+
+  # ZP ADD - CREATE OBJECT: refy_lkup
   #___________________________________________________________________________
+  lineup_years_path <-
+    fs::path(data_dir,
+             "estimations/lineup_years.fst")
+
+  # NOTE: THIS `prod_refy_estimation.fst` is the refy table (not really)
+  lineup_years <- fst::read_fst(lineup_years_path) |>
+    as.list()
+
 
+  #___________________________________________________________________________
 
 
   # CREATE OBJECT: interpolation_list ----
@@ -455,6 +466,8 @@ create_lkups <- function(data_dir, versions) {
 
   # CREATE OBJECT: valid_years ----
   valid_years <- valid_years(data_dir)
+  valid_years <- c(valid_years,
+                   lineup_years) # add lineup years
 
   # CREATE OBJECT: query_controls ----
   # Create list of query controls

From 5466131179c97c2fdd2deeef4dbfa9aede5ab9a1 Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Thu, 7 Aug 2025 10:39:23 -0400
Subject: [PATCH 020/203] only fg lineup years that are valid

---
 R/fg_pip.R | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/R/fg_pip.R b/R/fg_pip.R
index ebe2facc..c90705f1 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -120,20 +120,23 @@ fg_pip <- function(country,
     ref_lkup |>
     fsubset(country_code     %in% cntry &
               reporting_year %in% yr) |>
+    fsubset(reporting_year %in% lkup$valid_years$lineup_years) |>
     fselect(country_code,
             year = reporting_year) |>
     funique()
 
   # Split years by country
-  full_list <- dtemp[, .(year = list(year)), by = country_code][
+  full_list <- dtemp[,
+                     .(year = list(year)),
+                     by = country_code][
     , .(country_code, year = year)
   ]
 
   # Convert to desired structure
   full_list <- list(
     country_code = full_list$country_code,
-    year = lapply(full_list$year, as.numeric)
-  )
+    year         = lapply(full_list$year,
+                          as.numeric))
   #return(full_list)
   print(as.vector(cntry))
   print(yr)

From b29cc6b7ab2ae9df657fdbaee3c70e729030677d Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Mon, 11 Aug 2025 14:52:05 -0400
Subject: [PATCH 021/203] load dist stats in lkups

---
 R/create_lkups.R | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/R/create_lkups.R b/R/create_lkups.R
index 02ed2b78..bf6e69e6 100644
--- a/R/create_lkups.R
+++ b/R/create_lkups.R
@@ -180,7 +180,8 @@ create_lkups <- function(data_dir, versions) {
   refy_lkup_path <- fs::path(data_dir,
                              "estimations/prod_refy_estimation.fst")
 
-  # NOTE: THIS `prod_refy_estimation.fst` is the refy table (not really)
+  # NOTE: THIS `prod_refy_estimation.fst` is the refy table but
+  #             unique at the country-year level
   refy_lkup      <- fst::read_fst(refy_lkup_path,
                                   as.data.table = TRUE)
 
@@ -214,16 +215,30 @@ create_lkups <- function(data_dir, versions) {
                           match_type = "m:1")
 
 
-  # ZP ADD - CREATE OBJECT: refy_lkup
+  # ZP ADD - CREATE OBJECT: lineup years
   #___________________________________________________________________________
   lineup_years_path <-
     fs::path(data_dir,
              "estimations/lineup_years.fst")
 
-  # NOTE: THIS `prod_refy_estimation.fst` is the refy table (not really)
   lineup_years <- fst::read_fst(lineup_years_path) |>
     as.list()
 
+  # ZP ADD - CREATE OBJECT: lineup dist stats
+  #___________________________________________________________________________
+  lineup_dist_stats <-
+    fs::path(data_dir,
+             "estimations/lineup_dist_stats.fst")
+
+  lineup_dist_stats <- fst::read_fst(lineup_dist_stats,
+                                     as.data.table = TRUE) |>
+    fmutate(file = paste(country_code,
+                         reporting_year,
+                         sep = "_"))
+  gv(lineup_dist_stats,
+     c("min",
+       "max")) <- NULL
+
 
   #___________________________________________________________________________
 
@@ -588,7 +603,8 @@ create_lkups <- function(data_dir, versions) {
     aux_tables         = aux_tables,
     interpolation_list = interpolation_list,
     valid_years        = valid_years,
-    cache_data_id      = cache_data_id
+    cache_data_id      = cache_data_id,
+    lineup_dist_stats  = lineup_dist_stats
   )
 
   return(lkup)

From b1484a0671a9964989fc9527c69885846474147b Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Mon, 11 Aug 2025 14:52:24 -0400
Subject: [PATCH 022/203] use dist stats in fg

---
 R/fg_pip.R | 41 +++++++++++++++++++++++------------------
 1 file changed, 23 insertions(+), 18 deletions(-)

diff --git a/R/fg_pip.R b/R/fg_pip.R
index c90705f1..792b56e9 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -145,8 +145,6 @@ fg_pip <- function(country,
                             path = fs::path(data_dir,
                                             "lineup_data"))
 
-
-
   print(names(lt))
   lt <- lapply(lt,
                FUN = \(x) {
@@ -154,10 +152,6 @@ fg_pip <- function(country,
                    pipdata::attr_to_column("reporting_level_rows") |> # only rep level????
                    pipdata::attr_to_column("country_code") |>
                    pipdata::attr_to_column("reporting_year") |>
-                   pipdata::attr_to_column("mean",
-                                           dist_stats = TRUE) |>
-                   pipdata::attr_to_column("median",
-                                           dist_stats = TRUE) |>
                    fmutate(file = paste0(country_code,
                                          "_",
                                          reporting_year))
@@ -183,6 +177,20 @@ fg_pip <- function(country,
                   nm    = "res_povest",
                   value = res)
 
+  # ZP Add: join to dist_stats
+  #-------------------------
+  ly_dist <- lkup$lineup_dist_stats
+  res <- res |>
+    joyn::left_join(y            = ly_dist,
+                    by           = c("file",
+                                     "reporting_level",
+                                     "reporting_year"),
+                    relationship = "one-to-one",
+                    reportvar    = FALSE)
+  rlang::env_poke(env   = globalenv(),
+                  nm    = "res_dist",
+                  value = res)
+
   # ZP Add: join to metadata
   #-------------------------
   metadata[,
@@ -232,19 +240,21 @@ fg_pip <- function(country,
               verbose  = 0)
 
   out[, `:=`(
-    #mean   = survey_mean_ppp,
-    #median = survey_median_ppp,
     file   = NULL
   )]
 
-
   setnames(out,
            "povline",
            "poverty_line")
 
-
+  rlang::env_poke(env   = globalenv(),
+                  nm    = "out1",
+                  value = out)
   # Ensure that out does not have duplicates
   out <- fg_remove_duplicates(out)
+  rlang::env_poke(env   = globalenv(),
+                  nm    = "out2",
+                  value = out)
 
   # Fix issue with rounding of poverty lines
   out[,
@@ -263,12 +273,6 @@ fg_pip <- function(country,
 }
 
 
-# process_dt_fg <- function(dt, povline, mean_and_med = FALSE) {
-#   dt[, compute_fgt_dt(.SD, "welfare", "weight", povline, mean_and_med),
-#      by = .(file, reporting_level)]
-# }
-
-
 #' Remove duplicated rows created during the interpolation process
 #'
 #' @param df data.table: Table of results created in `fg_pip()`
@@ -295,6 +299,9 @@ fg_remove_duplicates <- function(df,
                                           "survey_time",
                                           "survey_year",
                                           "surveyid_year")) {
+  # not all cols need to be changes
+  cols <- setdiff(cols,
+                  colnames(df))
   # Modify cache_id
   # * Ensures that cache_id is unique for both extrapolated and interpolated surveys
   # * Ensures that cache_id can be kept as an output of fg_pip() while still removing duplicated rows
@@ -318,7 +325,6 @@ fg_remove_duplicates <- function(df,
 #' @param reporting_level character
 #'
 #' @return character
-
 fg_standardize_cache_id <- function(cache_id,
                                     interpolation_id,
                                     reporting_level) {
@@ -338,7 +344,6 @@ fg_standardize_cache_id <- function(cache_id,
 #' @inheritParams fg_remove_duplicates
 #'
 #' @return data.table
-
 fg_assign_nas_values_to_dup_cols <- function(df,
                                              cols) {
   #Classes are maintained by default.

From f0e0fffa8fb9ff8521b6ad5b2a9095750f5a0349 Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Mon, 11 Aug 2025 14:53:45 -0400
Subject: [PATCH 023/203] don't show joyn message

---
 R/fg_pip.R | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/R/fg_pip.R b/R/fg_pip.R
index 792b56e9..9f1376c8 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -186,7 +186,8 @@ fg_pip <- function(country,
                                      "reporting_level",
                                      "reporting_year"),
                     relationship = "one-to-one",
-                    reportvar    = FALSE)
+                    reportvar    = FALSE,
+                    verbose      = FALSE)
   rlang::env_poke(env   = globalenv(),
                   nm    = "res_dist",
                   value = res)

From b081b44d2fa283ec110ccb5faf5f1aaa3e16e326 Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Mon, 11 Aug 2025 16:05:13 -0400
Subject: [PATCH 024/203] change full join to left join - this removes nowcast
 years - i.e. years not included as lineup years

---
 R/fg_pip.R | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/R/fg_pip.R b/R/fg_pip.R
index 9f1376c8..e54cd5d8 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -234,11 +234,14 @@ fg_pip <- function(country,
 
   out <- join(res,
               tmp_metadata_unique,
-              on       = c("file",
-                           "reporting_level"),
-              how      = "full",
-              validate = "m:1",
-              verbose  = 0)
+              on            = c("file",
+                                "reporting_level"),
+              how           = "left", # ZP: change from full to left,
+                                      #  this rm nowcast years - i.e. years not included
+                                      #  as lineup years
+              validate      = "m:1",
+              drop.dup.cols = TRUE,
+              verbose       = 0)
 
   out[, `:=`(
     file   = NULL
@@ -306,9 +309,9 @@ fg_remove_duplicates <- function(df,
   # Modify cache_id
   # * Ensures that cache_id is unique for both extrapolated and interpolated surveys
   # * Ensures that cache_id can be kept as an output of fg_pip() while still removing duplicated rows
-  df$cache_id <- fg_standardize_cache_id(cache_id = df$cache_id,
-                                         interpolation_id = df$data_interpolation_id,
-                                         reporting_level = df$reporting_level)
+  # df$cache_id <- fg_standardize_cache_id(cache_id = df$cache_id,
+  #                                        interpolation_id = df$data_interpolation_id,
+  #                                        reporting_level = df$reporting_level)
   # Set collapse vars to NA (by type)
   df <- fg_assign_nas_values_to_dup_cols(df = df,
                                          cols = cols)

From d0bd7bf7756ce60e00c3f5a286185bf10022aa07 Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Tue, 12 Aug 2025 00:35:41 -0400
Subject: [PATCH 025/203] fix cache different columns

---
 R/duckdb_func.R | 14 ++++----
 R/fg_pip.R      | 76 ++++++++++++-------------------------------
 R/pip.R         | 36 +++++++++++++++------
 R/utils.R       | 85 +++++++++++++++++++++++++++++++------------------
 4 files changed, 110 insertions(+), 101 deletions(-)

diff --git a/R/duckdb_func.R b/R/duckdb_func.R
index 135fb858..fc91b3f1 100644
--- a/R/duckdb_func.R
+++ b/R/duckdb_func.R
@@ -12,15 +12,15 @@ return_if_exists <- function(slkup,
 
   if (fnrow(slkup) == 0 ) {
     return(list(data_present_in_master = NULL,
-                lkup = slkup,
-                povline = povline))
+                lkup                   = slkup,
+                povline                = povline))
   }
 
 
   if (getOption("pipapi.query_live_data")) {
     return(list(data_present_in_master = NULL,
-                lkup = slkup,
-                povline = povline))
+                lkup                   = slkup,
+                povline                = povline))
   }
 
   # ZP new temp code to avoid error from load_inter_cache due to dbConnect
@@ -32,6 +32,8 @@ return_if_exists <- function(slkup,
       master_file <- slkup[0]  # zero-row data.table with same columns as lkup
     }
   )
+  # temp ZP just to bypass cache
+  #master_file <- slkup[0]
 
   # ZP old code:
   # master_file <- load_inter_cache(cache_file_path = cache_file_path,
@@ -39,8 +41,8 @@ return_if_exists <- function(slkup,
 
   if (fnrow(master_file) == 0) {
     return(list(data_present_in_master = NULL,
-                lkup = slkup,
-                povline = povline))
+                lkup                   = slkup,
+                povline                = povline))
   }
 
 
diff --git a/R/fg_pip.R b/R/fg_pip.R
index e54cd5d8..1f8c5404 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -59,62 +59,24 @@ fg_pip <- function(country,
   }
 
 
-  # ZP Add: load refy data
-  #-------------------------
-  # Extract unique combinations of country-year
-  # if (any(c("ALL", "WLD") %in% country)) {
-  #   cntry <- refy_lkup$country_code |>
-  #     unique()
-  #   print("A")
-  #   #cntry[!cntry %in% c("SSD", "SVK", "TLS", "VEN", "XKX")] # to be removed
-  # } else {
-  #   cntry <- refy_lkup[country_code %in% country,
-  #                      .(country_code)] |>
-  #     funique()
-  #   print("B")
-  # }
-  # if (any(c("ALL") %in% year)) {
-  #   yr <- refy_lkup$reporting_year |>
-  #     unique()
-  #   print("C")
-  # } else {
-  #   yr <- refy_lkup[reporting_year %in% year,
-  #                      .(reporting_year)] |>
-  #     funique()
-  #   print("D")
-  # }
-  #
-  # print(as.vector(cntry))
-  # print(yr)
-  # lt <-
-  #   pipdata::load_list_refy(input_list = list(country_code = cntry,
-  #                                             year         = yr),
-  #                           path = fs::path(data_dir,
-  #                                           "lineup_data"))
-
-
   #'     # ZP Add: load refy data
   #-------------------------
   # Extract unique combinations of country-year
   if (any(c("ALL", "WLD") %in% country)) {
     cntry <- refy_lkup$country_code |>
       funique()
-    print("A")
   } else {
     cntry <- refy_lkup[country_code %in% country,
                        ]$country_code |>
       funique()
-    print("B")
   }
   if (any(c("ALL") %in% year)) {
     yr <- refy_lkup$reporting_year |>
       unique()
-    print("C")
   } else {
     yr <- refy_lkup[reporting_year %in% year,
                     ]$reporting_year |>
       funique()
-    print("D")
   }
   dtemp <-
     ref_lkup |>
@@ -137,21 +99,19 @@ fg_pip <- function(country,
     country_code = full_list$country_code,
     year         = lapply(full_list$year,
                           as.numeric))
-  #return(full_list)
-  print(as.vector(cntry))
-  print(yr)
   lt <-
     pipdata::load_list_refy(input_list = full_list,
                             path = fs::path(data_dir,
                                             "lineup_data"))
 
-  print(names(lt))
   lt <- lapply(lt,
                FUN = \(x) {
                   x <- x |>
                    pipdata::attr_to_column("reporting_level_rows") |> # only rep level????
                    pipdata::attr_to_column("country_code") |>
                    pipdata::attr_to_column("reporting_year") |>
+                    pipdata::attr_to_column("mean", dist_stats = TRUE) |>
+                    pipdata::attr_to_column("median", dist_stats = TRUE) |>
                    fmutate(file = paste0(country_code,
                                          "_",
                                          reporting_year))
@@ -179,18 +139,18 @@ fg_pip <- function(country,
 
   # ZP Add: join to dist_stats
   #-------------------------
-  ly_dist <- lkup$lineup_dist_stats
-  res <- res |>
-    joyn::left_join(y            = ly_dist,
-                    by           = c("file",
-                                     "reporting_level",
-                                     "reporting_year"),
-                    relationship = "one-to-one",
-                    reportvar    = FALSE,
-                    verbose      = FALSE)
-  rlang::env_poke(env   = globalenv(),
-                  nm    = "res_dist",
-                  value = res)
+  # ly_dist <- lkup$lineup_dist_stats
+  # res <- res |>
+  #   joyn::left_join(y            = ly_dist,
+  #                   by           = c("file",
+  #                                    "reporting_level",
+  #                                    "reporting_year"),
+  #                   relationship = "one-to-one",
+  #                   reportvar    = FALSE,
+  #                   verbose      = FALSE)
+  # rlang::env_poke(env   = globalenv(),
+  #                 nm    = "res_dist",
+  #                 value = res)
 
   # ZP Add: join to metadata
   #-------------------------
@@ -231,6 +191,9 @@ fg_pip <- function(country,
   rlang::env_poke(env   = globalenv(),
                   nm    = "tmp_metadata_unique_check",
                   value = tmp_metadata_unique)
+  rlang::env_poke(env   = globalenv(),
+                  nm    = "res_final",
+                  value = res)
 
   out <- join(res,
               tmp_metadata_unique,
@@ -243,6 +206,9 @@ fg_pip <- function(country,
               drop.dup.cols = TRUE,
               verbose       = 0)
 
+  rlang::env_poke(env   = globalenv(),
+                  nm    = "out_check",
+                  value = out)
   out[, `:=`(
     file   = NULL
   )]
@@ -313,7 +279,7 @@ fg_remove_duplicates <- function(df,
   #                                        interpolation_id = df$data_interpolation_id,
   #                                        reporting_level = df$reporting_level)
   # Set collapse vars to NA (by type)
-  df <- fg_assign_nas_values_to_dup_cols(df = df,
+  df <- fg_assign_nas_values_to_dup_cols(df   = df,
                                          cols = cols)
 
   # Ensure that out does not have duplicates
diff --git a/R/pip.R b/R/pip.R
index dd62cd20..dc180832 100644
--- a/R/pip.R
+++ b/R/pip.R
@@ -139,12 +139,27 @@ pip <- function(country         = "ALL",
       )
     }
 
-    cached_data <- out$data_in_cache
-    main_data <- out$main_data
 
+  # Cache new data
+  #---------------------------------------------
+    cached_data <- out$data_in_cache
+  #print(out)
+  cached_data <- qDT(cached_data)
+    main_data   <- qDT(out$main_data)
+    # print(colnames(main_data))
+    # print(colnames(cached_data))
+    #print(setdiff(colnames(main_data), colnames(cached_data)))
     if (nrow(main_data) > 0) {
-      out <- main_data |>
-        rowbind(cached_data)
+      #print(is.data.table(cached_data))
+      if (is.null(out$data_in_cache)) {
+        out <- main_data
+      } else {
+        if (fill_gaps) {
+          cached_data <- fg_remove_duplicates(cached_data)
+        }
+        out <- main_data |>
+          rowbind(cached_data)
+      }
 
         update_master_file(main_data, cache_file_path, fill_gaps)
 
@@ -168,7 +183,8 @@ pip <- function(country         = "ALL",
       }
     }
 
-
+    # Add out of pipeline variablse
+    #---------------------------------------------
 
     add_vars_out_of_pipeline(out, fill_gaps = fill_gaps, lkup = lkup)
 
@@ -212,10 +228,11 @@ pip <- function(country         = "ALL",
     names2keep <- lkup$return_cols$pip$cols # all variables
 
     out <- add_dist_stats(
-      df = out,
-      dist_stats = lkup[["dist_stats"]]
-    )
-
+      df         = out,
+      lkup       = lkup,
+      fill_gaps  = fill_gaps)
+    print("fooooooo")
+    return(out)
     # Add aggregate medians ----------------
     out <- add_agg_medians(
       df        = out,
@@ -239,6 +256,7 @@ pip <- function(country         = "ALL",
     } else {
       out[, estimate_type := NA_character_]
     }
+
     ## Handle survey coverage ------------
     if (reporting_level != "all") {
       keep <- out$reporting_level == reporting_level
diff --git a/R/utils.R b/R/utils.R
index ae4fdc89..9b4bd945 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -24,11 +24,11 @@ subset_lkup <- function(country,
   # STEP 2 - Select countries
   keep <- select_country(lkup, keep, country, valid_regions)
   # STEP 3 - Select years
-  keep <- select_years(lkup = lkup,
-                       keep = keep,
-                       year = year,
-                       country = country,
-                       data_dir =  data_dir,
+  keep <- select_years(lkup          = lkup,
+                       keep          = keep,
+                       year          = year,
+                       country       = country,
+                       data_dir      =  data_dir,
                        valid_regions = valid_regions)
 
   # # step 4. Select MRV
@@ -39,18 +39,18 @@ subset_lkup <- function(country,
     keep <- keep & lkup$welfare_type == welfare_type
   }
   # STEP 5 - Select reporting_level
-  keep <- select_reporting_level(lkup = lkup,
-                                 keep = keep,
+  keep <- select_reporting_level(lkup            = lkup,
+                                 keep            = keep,
                                  reporting_level = reporting_level[1])
 
 
   lkup <- lkup[keep, ]
 
   # Return with grace
-  return_if_exists(slkup = lkup,
-                   povline = povline,
+  return_if_exists(slkup           = lkup,
+                   povline         = povline,
                    cache_file_path = cache_file_path,
-                   fill_gaps = fill_gaps)
+                   fill_gaps       = fill_gaps)
 }
 
 #' select_country
@@ -290,30 +290,53 @@ get_svy_data <- function(svy_id,
 #' @return data.table
 #' @export
 #'
-add_dist_stats <- function(df, dist_stats) {
-  # Keep only relevant columns
-  cols <- c(
-    "cache_id",
-    # "country_code",
-    # "reporting_year",
-    # "welfare_type",
-    "reporting_level",
-    "gini",
-    "polarization",
-    "mld",
-    sprintf("decile%s", 1:10)
-  )
-  dist_stats <- dist_stats[, .SD, .SDcols = cols]
+add_dist_stats <- function(df, lkup, fill_gaps) {
+
+  if (fill_gaps) {
+    dist_stats <- lkup[["lineup_dist_stats"]]
+  } else {
+    dist_stats <- lkup[["dist_stats"]]
+  }
 
-  # merge dist stats with main table
-  # data.table::setnames(dist_stats, "survey_median_ppp", "median")
+  if (fill_gaps) {
 
-  df <- dist_stats[df,
-                   on = .(cache_id, reporting_level), #.(country_code, reporting_year, welfare_type, reporting_level),
-                   allow.cartesian = TRUE
-  ]
+    df <- df |>
+      joyn::joyn(y                = dist_stats,
+                 by               = c("country_code",
+                                      "reporting_level",
+                                      "reporting_year"),
+                 match_type       = "1:1",
+                 keep_common_vars = FALSE,
+                 reportvar        = FALSE,
+                 verbose          = FALSE,
+                 keep             = "left")
 
-  return(df)
+  } else {
+    # Keep only relevant columns
+    cols <- c(
+      "cache_id",
+      # "country_code",
+      # "reporting_year",
+      # "welfare_type",
+      "reporting_level",
+      "gini",
+      "polarization",
+      "mld",
+      sprintf("decile%s", 1:10)
+    )
+    dist_stats <- dist_stats[, .SD, .SDcols = cols]
+
+    # merge dist stats with main table
+    # data.table::setnames(dist_stats, "survey_median_ppp", "median")
+
+    df <- dist_stats[df,
+                     on = .(cache_id, reporting_level), #.(country_code, reporting_year, welfare_type, reporting_level),
+                     allow.cartesian = TRUE
+    ]
+  }
+
+
+  df
 }
 
 #' Collapse rows

From 7cc0295ddf7cd96ddb2126734b9dc816ca12f32b Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Tue, 12 Aug 2025 14:39:15 -0400
Subject: [PATCH 026/203] add to lkup whether to use new lineups

---
 R/create_lkups.R | 75 ++++++++++++++++++++++++++++++++++++------------
 1 file changed, 56 insertions(+), 19 deletions(-)

diff --git a/R/create_lkups.R b/R/create_lkups.R
index bf6e69e6..fb936bac 100644
--- a/R/create_lkups.R
+++ b/R/create_lkups.R
@@ -11,7 +11,6 @@ create_versioned_lkups <-
 
     vintage_pattern <- create_vintage_pattern_call(vintage_pattern)
 
-
     data_dirs <- extract_data_dirs(data_dir = data_dir,
                                    vintage_pattern = vintage_pattern)
 
@@ -175,6 +174,7 @@ create_lkups <- function(data_dir, versions) {
                                             collapse = "|"),
              by = .(interpolation_id)]
 
+
   # ZP ADD - CREATE OBJECT: refy_lkup
   #___________________________________________________________________________
   refy_lkup_path <- fs::path(data_dir,
@@ -240,6 +240,8 @@ create_lkups <- function(data_dir, versions) {
        "max")) <- NULL
 
 
+  use_new_lineup_version <- use_new_lineup_version(versions)
+
   #___________________________________________________________________________
 
 
@@ -588,24 +590,24 @@ create_lkups <- function(data_dir, versions) {
 
   # Create list of lkups
   lkup <- list(
-    svy_lkup           = svy_lkup,
-    ref_lkup           = ref_lkup,
-    refy_lkup          = refy_lkup,
-    dist_stats         = dist_stats,
-    pop_region         = pop_region,
-    cp_lkups           = cp_lkups,
-    pl_lkup            = pl_lkup,
-    censored           = censored,
-    aux_files          = aux_files,
-    return_cols        = return_cols,
-    query_controls     = query_controls,
-    data_root          = data_dir,
-    aux_tables         = aux_tables,
-    interpolation_list = interpolation_list,
-    valid_years        = valid_years,
-    cache_data_id      = cache_data_id,
-    lineup_dist_stats  = lineup_dist_stats
-  )
+    svy_lkup               = svy_lkup,
+    ref_lkup               = ref_lkup,
+    refy_lkup              = refy_lkup,
+    dist_stats             = dist_stats,
+    pop_region             = pop_region,
+    cp_lkups               = cp_lkups,
+    pl_lkup                = pl_lkup,
+    censored               = censored,
+    aux_files              = aux_files,
+    return_cols            = return_cols,
+    query_controls         = query_controls,
+    data_root              = data_dir,
+    aux_tables             = aux_tables,
+    interpolation_list     = interpolation_list,
+    valid_years            = valid_years,
+    cache_data_id          = cache_data_id,
+    lineup_dist_stats      = lineup_dist_stats,
+    use_new_lineup_version = use_new_lineup_version)
 
   return(lkup)
 }
@@ -805,3 +807,38 @@ available_versions <- function(data_dir) {
                   test_regex = vintage_pattern$test_regex)
 
 }
+
+
+#' Should the new lineup approach be used?
+#'
+#' Check if the date in a string is more recent than May 2025
+#'
+#' This function extracts the first 8 characters from an input string,
+#' interprets them as a date in the format \code{YYYYMMDD}, and checks
+#' whether this date is more recent than May 1st, 2025.
+#'
+#' @param x A character vector where each element starts with an
+#'   8-digit date in the format \code{YYYYMMDD}.
+#'
+#' @return A logical vector: \code{TRUE} if the extracted date is
+#'   after May 1st, 2025, otherwise \code{FALSE}.
+#'
+#' @examples
+#' use_new_lineup_version("20250401_2021_01_02_PROD") # FALSE
+#' use_new_lineup_version("20250615_2021_01_02_PROD") # TRUE
+#'
+#' @export
+use_new_lineup_version <- function(x) {
+  # Extract YYYYMMDD
+  date_str <- substr(x, 1, 8)
+
+  # Convert to Date
+  date_val <- as.Date(date_str, format = "%Y%m%d")
+
+  # Threshold date
+  threshold <- as.Date("2025-05-01")
+
+  # Compare
+  date_val > threshold
+}
+

From 9a9546afd6c4bf3c7637d1260071a53cf67da797 Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Tue, 12 Aug 2025 17:59:09 -0400
Subject: [PATCH 027/203] standardise ref and refy in lkup

---
 R/create_lkups.R | 159 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 102 insertions(+), 57 deletions(-)

diff --git a/R/create_lkups.R b/R/create_lkups.R
index fb936bac..896bf42b 100644
--- a/R/create_lkups.R
+++ b/R/create_lkups.R
@@ -16,7 +16,6 @@ create_versioned_lkups <-
 
     versions <- names(data_dirs)
     # versions[1] <- "latest_release"
-
     versions_paths <- lapply(data_dirs, create_lkups, versions = versions)
     names(versions_paths) <- versions
 
@@ -35,6 +34,9 @@ extract_data_dirs <-
   function(data_dir,
            vintage_pattern
            ) {
+
+
+
   # List data directories under data_dir
 
   data_dirs  <- fs::dir_ls(data_dir, type = "directory")
@@ -71,6 +73,11 @@ extract_data_dirs <-
 #' @return list
 create_lkups <- function(data_dir, versions) {
 
+  # Use new lineup approach? -----
+  use_new_lineup_version <- use_new_lineup_version(versions)
+  # ZP temp
+  use_new_lineup_version <- TRUE
+
   # Get survey paths ----
   paths <- list.files(fs::path(data_dir, "survey_data"))
   paths_ids <- tools::file_path_sans_ext(paths)
@@ -175,72 +182,110 @@ create_lkups <- function(data_dir, versions) {
              by = .(interpolation_id)]
 
 
+
+
   # ZP ADD - CREATE OBJECT: refy_lkup
   #___________________________________________________________________________
-  refy_lkup_path <- fs::path(data_dir,
-                             "estimations/prod_refy_estimation.fst")
-
-  # NOTE: THIS `prod_refy_estimation.fst` is the refy table but
-  #             unique at the country-year level
-  refy_lkup      <- fst::read_fst(refy_lkup_path,
-                                  as.data.table = TRUE)
-
-  refy_lkup[ ,
-             path := {
-               fs::path(data_dir,
-                        "lineup_data",
-                        paste0(country_code,
-                               "_",
-                               reporting_year),
-                        ext = "qs") |>
-                 as.character()
-               }
-             ]
-  refy_lkup[,
-            interpolation_id := paste(country_code,
-                                      reporting_year,
-                                      reporting_level,
-                                      sep = "_")]
-
-  if ("region_code" %in% names(refy_lkup)) {
-    refy_lkup[,
-              region_code := NULL]
-  }
+  if (use_new_lineup_version) {
+      refy_lkup_path <- fs::path(data_dir,
+                                 "estimations/prod_refy_estimation.fst")
+
+      # NOTE: THIS `prod_refy_estimation.fst` is the refy table but
+      #             unique at the country-year level
+      refy_lkup      <- fst::read_fst(refy_lkup_path,
+                                      as.data.table = TRUE)
+
+      refy_lkup[ ,
+                 path := {
+                   fs::path(data_dir,
+                            "lineup_data",
+                            paste0(country_code,
+                                   "_",
+                                   reporting_year),
+                            ext = "qs") |>
+                     as.character()
+                 }
+      ]
+      refy_lkup[,
+                interpolation_id := paste(country_code,
+                                          reporting_year,
+                                          reporting_level,
+                                          sep = "_")]
+
+      if ("region_code" %in% names(refy_lkup)) {
+        refy_lkup[,
+                  region_code := NULL]
+      }
 
-  refy_lkup <- joyn::joyn(x          = refy_lkup,
-                          y          = countries,
-                          by         = 'country_code',
-                          keep       = "left",
-                          reportvar  = FALSE,
-                          match_type = "m:1")
 
+      refy_lkup[,
+                data_interpolation_id := paste(cache_id,
+                                               reporting_level,
+                                               sep = "_")
+      ]
+
+      refy_lkup[,
+                data_interpolation_id := paste(unique(data_interpolation_id),
+                                               collapse = "|"),
+                by = .(interpolation_id)]
+
+      refy_lkup <- joyn::joyn(x          = refy_lkup,
+                              y          = countries,
+                              by         = 'country_code',
+                              keep       = "left",
+                              reportvar  = FALSE,
+                              match_type = "m:1")
+
+
+      gv(refy_lkup,
+         c("monotonic",
+           "same_direction",
+           "mult_factor",
+           "nac",
+           "nac_sy",
+           "svy_mean",
+           #"data_interpolation",
+           "relative_distance")) <- NULL
+      gv(ref_lkup,
+         c("monotonic",
+           "same_direction",
+           "nac",
+           "nac_sy",
+           "svy_mean",
+           #"data_interpolation",
+           "relative_distance")) <- NULL
+
+
+      # ZP ADD - CREATE OBJECT: lineup years
+      #___________________________________________________________________________
+      lineup_years_path <-
+        fs::path(data_dir,
+                 "estimations/lineup_years.fst")
+
+      lineup_years <- fst::read_fst(lineup_years_path) |>
+        as.list()
+
+      # ZP ADD - CREATE OBJECT: lineup dist stats
+      #___________________________________________________________________________
+      lineup_dist_stats <-
+        fs::path(data_dir,
+                 "estimations/lineup_dist_stats.fst")
+
+      lineup_dist_stats <- fst::read_fst(lineup_dist_stats,
+                                         as.data.table = TRUE) |>
+        fmutate(file = paste(country_code,
+                             reporting_year,
+                             sep = "_"))
+      gv(lineup_dist_stats,
+         c("min",
+           "max")) <- NULL
 
-  # ZP ADD - CREATE OBJECT: lineup years
-  #___________________________________________________________________________
-  lineup_years_path <-
-    fs::path(data_dir,
-             "estimations/lineup_years.fst")
+  }
 
-  lineup_years <- fst::read_fst(lineup_years_path) |>
-    as.list()
 
-  # ZP ADD - CREATE OBJECT: lineup dist stats
-  #___________________________________________________________________________
-  lineup_dist_stats <-
-    fs::path(data_dir,
-             "estimations/lineup_dist_stats.fst")
 
-  lineup_dist_stats <- fst::read_fst(lineup_dist_stats,
-                                     as.data.table = TRUE) |>
-    fmutate(file = paste(country_code,
-                         reporting_year,
-                         sep = "_"))
-  gv(lineup_dist_stats,
-     c("min",
-       "max")) <- NULL
 
 
-  use_new_lineup_version <- use_new_lineup_version(versions)
 
   #___________________________________________________________________________
 

From cda895ffccdce2f2557e729ae676d4f8b7a5bff9 Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Tue, 12 Aug 2025 19:15:58 -0400
Subject: [PATCH 028/203] rm nowcast years from refy

---
 R/create_lkups.R | 1 +
 1 file changed, 1 insertion(+)

diff --git a/R/create_lkups.R b/R/create_lkups.R
index 896bf42b..fdfa31ed 100644
--- a/R/create_lkups.R
+++ b/R/create_lkups.R
@@ -264,6 +264,7 @@ create_lkups <- function(data_dir, versions) {
 
       lineup_years <- fst::read_fst(lineup_years_path) |>
         as.list()
+      refy_lkup <- refy_lkup[reporting_year %in% lineup_years$lineup_years, ]
 
       # ZP ADD - CREATE OBJECT: lineup dist stats
       #___________________________________________________________________________

From 3aa55355feb2d04847ea611352e3b6feb3c5bcbd Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Wed, 13 Aug 2025 13:44:46 -0400
Subject: [PATCH 029/203] add explanation comments

---
 R/duckdb_func.R | 10 +++++++++-
 R/fg_pip.R      |  2 +-
 R/utils.R       |  1 -
 3 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/R/duckdb_func.R b/R/duckdb_func.R
index fc91b3f1..663056c9 100644
--- a/R/duckdb_func.R
+++ b/R/duckdb_func.R
@@ -10,19 +10,21 @@ return_if_exists <- function(slkup,
                              fill_gaps,
                              verbose = getOption("pipapi.verbose")) {
 
+  # none selected
   if (fnrow(slkup) == 0 ) {
     return(list(data_present_in_master = NULL,
                 lkup                   = slkup,
                 povline                = povline))
   }
 
-
+  # don't use cache
   if (getOption("pipapi.query_live_data")) {
     return(list(data_present_in_master = NULL,
                 lkup                   = slkup,
                 povline                = povline))
   }
 
+  # load cache
   # ZP new temp code to avoid error from load_inter_cache due to dbConnect
   master_file <- tryCatch(
     load_inter_cache(cache_file_path = cache_file_path,
@@ -39,6 +41,7 @@ return_if_exists <- function(slkup,
   # master_file <- load_inter_cache(cache_file_path = cache_file_path,
   #                                 fill_gaps = fill_gaps)
 
+  # if no cached files, return selected lkup
   if (fnrow(master_file) == 0) {
     return(list(data_present_in_master = NULL,
                 lkup                   = slkup,
@@ -121,8 +124,13 @@ return_if_exists <- function(slkup,
                                  verbose = 0,
                                  multiple = TRUE)
 
+  # now we have two dfs: lk_not_ms and data_present_in_master
+  #    which gives the lkup rows not in cache (master_file),
+  #    and the lkup rows in cache (master_file)
+
 
   # If no data is present in master
+  #  i.e. if no common rows between
   if (fnrow(data_present_in_master) == 0) {
     return(list(data_present_in_master = NULL,
                 lkup = slkup,
diff --git a/R/fg_pip.R b/R/fg_pip.R
index 1f8c5404..7c5cbc02 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -51,7 +51,7 @@ fg_pip <- function(country,
                           popshare = popshare)
   setDT(metadata)
 
-  # Return empty dataframe if no metadata is found
+  # Return empty dataframe if no metadata is found (i.e. all in cache)
   if (nrow(metadata) == 0) {
     print("ZP: no metadata - i.e. nothing additional to estimate")
     return(list(main_data     = pipapi::empty_response_fg,
diff --git a/R/utils.R b/R/utils.R
index 9b4bd945..3ff1150d 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -289,7 +289,6 @@ get_svy_data <- function(svy_id,
 #'
 #' @return data.table
 #' @export
-#'
 add_dist_stats <- function(df, lkup, fill_gaps) {
 
   if (fill_gaps) {

From e57badab43867906882c01096dd8dbdbeabcca58 Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Wed, 13 Aug 2025 13:47:17 -0400
Subject: [PATCH 030/203] rm NA dist stats for FG

---
 R/pip.R | 69 +++++++++++++++++++++++++++++++--------------------------
 1 file changed, 37 insertions(+), 32 deletions(-)

diff --git a/R/pip.R b/R/pip.R
index dc180832..b7f313c6 100644
--- a/R/pip.R
+++ b/R/pip.R
@@ -139,31 +139,35 @@ pip <- function(country         = "ALL",
       )
     }
 
-
   # Cache new data
   #---------------------------------------------
-    cached_data <- out$data_in_cache
-  #print(out)
-  cached_data <- qDT(cached_data)
+    cached_data <- qDT(out$data_in_cache)
     main_data   <- qDT(out$main_data)
-    # print(colnames(main_data))
-    # print(colnames(cached_data))
-    #print(setdiff(colnames(main_data), colnames(cached_data)))
+    print("A")
+
     if (nrow(main_data) > 0) {
-      #print(is.data.table(cached_data))
+
+      print("B")
+
       if (is.null(out$data_in_cache)) {
+        print("C")
         out <- main_data
       } else {
+        print("D")
         if (fill_gaps) {
+          print("E")
           cached_data <- fg_remove_duplicates(cached_data)
         }
+        print("F")
         out <- main_data |>
           rowbind(cached_data)
       }
+      print("G")
 
         update_master_file(main_data, cache_file_path, fill_gaps)
 
     } else {
+      print("H")
       out <- cached_data
     }
     if (!data.table::is.data.table(out)) {
@@ -175,9 +179,8 @@ pip <- function(country         = "ALL",
     # aggregate distributions ------------------
     if (reporting_level %in% c("national", "all")) {
       out <- add_agg_stats(
-        df = out,
-        return_cols = lkup$return_cols$ag_average_poverty_stats
-        )
+        df          = out,
+        return_cols = lkup$return_cols$ag_average_poverty_stats)
       if (reporting_level == "national") {
         out <- out[reporting_level == "national"]
       }
@@ -185,8 +188,9 @@ pip <- function(country         = "ALL",
 
     # Add out of pipeline variablse
     #---------------------------------------------
-
-    add_vars_out_of_pipeline(out, fill_gaps = fill_gaps, lkup = lkup)
+    add_vars_out_of_pipeline(out,
+                             fill_gaps = fill_gaps,
+                             lkup      = lkup)
 
     # **** TO BE REMOVED **** REMOVAL STARTS HERE
     # Once `pip-grp` has been integrated in ingestion pipeline
@@ -199,11 +203,13 @@ pip <- function(country         = "ALL",
       out <- pip_aggregate_by(
         df          = out,
         group_lkup  = lkup[["pop_region"]],
-        return_cols = lkup$return_cols$pip_grp
-      )
+        return_cols = lkup$return_cols$pip_grp)
+
       # Censor regional values
       if (censor) {
-        out <- censor_rows(out, lkup[["censored"]], type = "regions")
+        out <- censor_rows(out,
+                           lkup[["censored"]],
+                           type = "regions")
       }
 
       out <- out[, c("region_name",
@@ -231,8 +237,7 @@ pip <- function(country         = "ALL",
       df         = out,
       lkup       = lkup,
       fill_gaps  = fill_gaps)
-    print("fooooooo")
-    return(out)
+
     # Add aggregate medians ----------------
     out <- add_agg_medians(
       df        = out,
@@ -242,20 +247,20 @@ pip <- function(country         = "ALL",
 
     # format ----------------
 
-
-    if (fill_gaps) {
-
-    ## Inequality indicators to NA for lineup years ----
-      dist_vars  <- names2keep[!(names2keep %in% crr_names)]
-      out[,
-          (dist_vars) := NA_real_]
-
-      ## estimate_var -----
-      out <- estimate_type_ctr_lnp(out, lkup)
-
-    } else {
-      out[, estimate_type := NA_character_]
-    }
+# ZP temp NA lineups
+    # if (fill_gaps) {
+    #
+    # ## Inequality indicators to NA for lineup years ----
+    #   dist_vars  <- names2keep[!(names2keep %in% crr_names)]
+    #   out[,
+    #       (dist_vars) := NA_real_]
+    #
+    #   ## estimate_var -----
+    #   out <- estimate_type_ctr_lnp(out, lkup)
+    #
+    # } else {
+       out[, estimate_type := NA_character_]
+    # }
 
     ## Handle survey coverage ------------
     if (reporting_level != "all") {

From 842d57f8254c99707d6d6280b788d0c3fb43417a Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Wed, 13 Aug 2025 13:47:48 -0400
Subject: [PATCH 031/203] fg_pip_old in pip_old

---
 R/pip_old.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/pip_old.R b/R/pip_old.R
index a6ed2c2b..52eaf0fb 100644
--- a/R/pip_old.R
+++ b/R/pip_old.R
@@ -113,7 +113,7 @@ pip_old <- function(country         = "ALL",
   # mains estimates ---------------
   if (fill_gaps) {
     ## lineup years-----------------
-    out <- fg_pip(
+    out <- fg_pip_old(
       country            = lcv$est_ctrs,
       year               = year,
       povline            = povline,

From 1d0d4459ec6f25dc350ef08d52d741cc64ac2bca Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Wed, 13 Aug 2025 14:01:17 -0400
Subject: [PATCH 032/203] dist_stats_old fn for old pip

---
 R/pip_old.R    |  2 +-
 R/pip_zp_old.R |  2 +-
 R/utils.R      | 37 +++++++++++++++++++++++++++++++++++++
 3 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/R/pip_old.R b/R/pip_old.R
index 52eaf0fb..096b90ed 100644
--- a/R/pip_old.R
+++ b/R/pip_old.R
@@ -213,7 +213,7 @@ pip_old <- function(country         = "ALL",
   crr_names  <- names(out)    # current variables
   names2keep <- lkup$return_cols$pip$cols # all variables
 
-  out <- add_dist_stats(
+  out <- add_dist_stats_old(
     df = out,
     dist_stats = lkup[["dist_stats"]]
   )
diff --git a/R/pip_zp_old.R b/R/pip_zp_old.R
index 2278ccce..ee22541b 100644
--- a/R/pip_zp_old.R
+++ b/R/pip_zp_old.R
@@ -211,7 +211,7 @@ pip_zp_old <- function(country         = "ALL",
   crr_names  <- names(out)    # current variables
   names2keep <- lkup$return_cols$pip$cols # all variables
 
-  out <- add_dist_stats(
+  out <- add_dist_stats_old(
     df = out,
     dist_stats = lkup[["dist_stats"]]
   )
diff --git a/R/utils.R b/R/utils.R
index 3ff1150d..2be6a305 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -338,6 +338,43 @@ add_dist_stats <- function(df, lkup, fill_gaps) {
   df
 }
 
+
+
+#' Add pre-computed distributional stats
+#'
+#' @param df data.table: Data frame of poverty statistics
+#' @param dist_stats data.table: Distributional stats lookup
+#'
+#' @return data.table
+#' @export
+#'
+add_dist_stats_old <- function(df, dist_stats) {
+  # Keep only relevant columns
+  cols <- c(
+    "cache_id",
+    # "country_code",
+    # "reporting_year",
+    # "welfare_type",
+    "reporting_level",
+    "gini",
+    "polarization",
+    "mld",
+    sprintf("decile%s", 1:10)
+  )
+  dist_stats <- dist_stats[, .SD, .SDcols = cols]
+
+  # merge dist stats with main table
+  # data.table::setnames(dist_stats, "survey_median_ppp", "median")
+
+  df <- dist_stats[df,
+                   on = .(cache_id, reporting_level), #.(country_code, reporting_year, welfare_type, reporting_level),
+                   allow.cartesian = TRUE
+  ]
+
+  return(df)
+}
+
+
 #' Collapse rows
 #' @return data.table
 #' @noRd

From b6b94427bd82ea754b67e0bfe908ea390153a9b6 Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Thu, 14 Aug 2025 09:39:43 -0400
Subject: [PATCH 033/203] add estimate type to fg

---
 R/pip.R | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/R/pip.R b/R/pip.R
index b7f313c6..44ee4184 100644
--- a/R/pip.R
+++ b/R/pip.R
@@ -247,20 +247,21 @@ pip <- function(country         = "ALL",
 
     # format ----------------
 
-# ZP temp NA lineups
-    # if (fill_gaps) {
-    #
+
+    if (fill_gaps) {
+      # ZP temp NA lineups:
+      #---------------------
     # ## Inequality indicators to NA for lineup years ----
     #   dist_vars  <- names2keep[!(names2keep %in% crr_names)]
     #   out[,
     #       (dist_vars) := NA_real_]
     #
-    #   ## estimate_var -----
-    #   out <- estimate_type_ctr_lnp(out, lkup)
-    #
-    # } else {
-       out[, estimate_type := NA_character_]
-    # }
+      ## estimate_var -----
+      out <- estimate_type_ctr_lnp(out, lkup)
+
+    } else {
+      out[, estimate_type := NA_character_]
+    }
 
     ## Handle survey coverage ------------
     if (reporting_level != "all") {

From fe4dc9530fb6556c8bd302e933ab52f5c91426f1 Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Thu, 14 Aug 2025 09:53:41 -0400
Subject: [PATCH 034/203] fix non-numeric surveyid_year for fg

---
 R/utils.R | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/R/utils.R b/R/utils.R
index 2be6a305..65f2df74 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -1203,8 +1203,8 @@ add_distribution_type <- function(df, lkup, fill_gaps) {
     dt[,
        # distribution type by year
        distribution_type := fcase(use_groupdata == 1, "group",
-                                  use_imputed == 1,   "imputed",
-                                  default = "micro")
+                                  use_imputed   == 1, "imputed",
+                                  default        =    "micro")
     ]
 
     dt <- dt[, # collapse by reporting_year and keep relevant variables
@@ -1213,9 +1213,12 @@ add_distribution_type <- function(df, lkup, fill_gaps) {
 
   }
 
-  df[,
-      surveyid_year := as.numeric(surveyid_year)
-    ][dt,
+  if (!fill_gaps) {
+    df <- df[,
+       surveyid_year := as.numeric(surveyid_year)
+    ]
+  }
+  df[dt,
       on = by_vars,
       distribution_type := i.distribution_type
       ][,
@@ -1404,6 +1407,7 @@ get_caller_names <- function() {
 #' @keywords internal
 #' @return data.table from pip or pip_grp functions.
 add_vars_out_of_pipeline <- function(out, fill_gaps, lkup) {
+
   ## Add SPL and SPR  ---------------
   out <- add_spl(df        = out,
                  fill_gaps = fill_gaps,
@@ -1417,9 +1421,9 @@ add_vars_out_of_pipeline <- function(out, fill_gaps, lkup) {
 
   ## add distribution type -------------
   # based on info in framework data, rather than welfare data
-  out <- add_distribution_type(df = out,
-                        lkup = lkup,
-                        fill_gaps = fill_gaps)
+  out <- add_distribution_type(df        = out,
+                               lkup      = lkup,
+                               fill_gaps = fill_gaps)
 
   invisible(out)
 }

From 81e4816697fd9493583518d78568bc5af419dde8 Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Thu, 14 Aug 2025 10:59:12 -0400
Subject: [PATCH 035/203] add pip wrapper for new and old lineups

---
 R/pip.R                               | 141 ++++++++++++++++++++++----
 R/pip_old.R                           |  32 +++---
 R/{pip_zp_old.R => pip_old_lineups.R} |  12 +--
 3 files changed, 146 insertions(+), 39 deletions(-)
 rename R/{pip_zp_old.R => pip_old_lineups.R} (96%)

diff --git a/R/pip.R b/R/pip.R
index 44ee4184..7fe60b2b 100644
--- a/R/pip.R
+++ b/R/pip.R
@@ -1,24 +1,13 @@
+
+
 #' Compute PIP statistics
 #'
 #' Compute the main PIP poverty and inequality statistics.
 #'
-#' @param country character: Country ISO 3 codes
-#' @param year integer: Reporting year
-#' @param povline numeric: Poverty line
-#' @param popshare numeric: Proportion of the population living below the
-#'   poverty line
-#' @param fill_gaps logical: If set to TRUE, will interpolate / extrapolate
-#'   values for missing years
-#' @param group_by character: Will return aggregated values for predefined
-#'   sub-groups
-#' @param welfare_type character: Welfare type
-#' @param reporting_level character: Geographical reporting level
-#' @param ppp numeric: Custom Purchase Power Parity value
-#' @param lkup list: A list of lkup tables
-#' @param censor logical: Triggers censoring of country/year statistics
-#' @param lkup_hash character: hash of pip
-#' @param additional_ind logical: If TRUE add new set of indicators. Default if
-#'   FALSE
+#' This function is a wrapper around the [pip_new_lineups] and [pip_old_lineups]
+#' functions.
+#'
+#' @inheritParams pip_new_lineups
 #'
 #' @return data.table
 #' @examples
@@ -67,6 +56,124 @@ pip <- function(country         = "ALL",
                 lkup_hash       = lkup$cache_data_id$hash_pip,
                 additional_ind  = FALSE) {
 
+  # Should pip_old or pip_new be used?
+  #-------------------------------------
+  use_new <- lkup$use_new_lineup_version
+
+  # Run correct function
+  #-------------------------------------
+  out <- if (use_new) {
+    pip_new_lineups(country         = country,
+                    year            = year,
+                    povline         = povline,
+                    popshare        = popshare,
+                    fill_gaps       = fill_gaps,
+                    group_by        = group_by,
+                    welfare_type    = welfare_type,
+                    reporting_level = reporting_level,
+                    ppp             = ppp,
+                    lkup            = lkup,
+                    censor          = censor,
+                    lkup_hash       = lkup_hash,
+                    additional_ind  = additional_ind)
+  } else {
+    pip_old_lineups(country         = country,
+                    year            = year,
+                    povline         = povline,
+                    popshare        = popshare,
+                    fill_gaps       = fill_gaps,
+                    group_by        = group_by,
+                    welfare_type    = welfare_type,
+                    reporting_level = reporting_level,
+                    ppp             = ppp,
+                    lkup            = lkup,
+                    censor          = censor,
+                    lkup_hash       = lkup_hash,
+                    additional_ind  = additional_ind)
+  }
+
+  # Return
+  #-------------------------------------
+  out
+
+}
+
+
+
+
+
+
+
+
+#' Compute PIP statistics
+#'
+#' Compute the main PIP poverty and inequality statistics.
+#'
+#' @param country character: Country ISO 3 codes
+#' @param year integer: Reporting year
+#' @param povline numeric: Poverty line
+#' @param popshare numeric: Proportion of the population living below the
+#'   poverty line
+#' @param fill_gaps logical: If set to TRUE, will interpolate / extrapolate
+#'   values for missing years
+#' @param group_by character: Will return aggregated values for predefined
+#'   sub-groups
+#' @param welfare_type character: Welfare type
+#' @param reporting_level character: Geographical reporting level
+#' @param ppp numeric: Custom Purchase Power Parity value
+#' @param lkup list: A list of lkup tables
+#' @param censor logical: Triggers censoring of country/year statistics
+#' @param lkup_hash character: hash of pip
+#' @param additional_ind logical: If TRUE add new set of indicators. Default if
+#'   FALSE
+#'
+#' @return data.table
+#' @examples
+#' \dontrun{
+#' # Create lkups
+#' lkups <- create_lkups("<data-folder>")
+#'
+#' # A single country and year
+#' pip_new_lineups(country = "AGO",
+#'     year = 2000,
+#'     povline = 1.9,
+#'     lkup = lkups)
+#'
+#' # All years for a single country
+#' pip_new_lineups(country = "AGO",
+#'     year = "all",
+#'     povline = 1.9,
+#'     lkup = lkups)
+#'
+#' # Fill gaps
+#' pip_new_lineups(country = "AGO",
+#'     year = "all",
+#'     povline = 1.9,
+#'     fill_gaps = TRUE,
+#'     lkup = lkups)
+#'
+#' # Group by regions
+#' pip_new_lineups(country = "all",
+#'     year = "all",
+#'     povline = 1.9,
+#'     group_by = "wb",
+#'     lkup = lkups)
+#' }
+#' @export
+pip_new_lineups <- function(country         = "ALL",
+                            year            = "ALL",
+                            povline         = 1.9,
+                            popshare        = NULL,
+                            fill_gaps       = FALSE,
+                            group_by        = c("none", "wb"),
+                            welfare_type    = c("all", "consumption", "income"),
+                            reporting_level = c("all", "national", "rural", "urban"),
+                            ppp             = NULL,
+                            lkup,
+                            censor          = TRUE,
+                            lkup_hash       = lkup$cache_data_id$hash_pip,
+                            additional_ind  = FALSE) {
+
 
   # set up -------------
   welfare_type    <- match.arg(welfare_type)
diff --git a/R/pip_old.R b/R/pip_old.R
index 096b90ed..3fc5cee7 100644
--- a/R/pip_old.R
+++ b/R/pip_old.R
@@ -27,26 +27,26 @@
 #' lkups <- create_lkups("<data-folder>")
 #'
 #' # A single country and year
-#' pip(country = "AGO",
+#' pip_old(country = "AGO",
 #'     year = 2000,
 #'     povline = 1.9,
 #'     lkup = lkups)
 #'
 #' # All years for a single country
-#' pip(country = "AGO",
+#' pip_old(country = "AGO",
 #'     year = "all",
 #'     povline = 1.9,
 #'     lkup = lkups)
 #'
 #' # Fill gaps
-#' pip(country = "AGO",
+#' pip_old(country = "AGO",
 #'     year = "all",
 #'     povline = 1.9,
 #'     fill_gaps = TRUE,
 #'     lkup = lkups)
 #'
 #' # Group by regions
-#' pip(country = "all",
+#' pip_old(country = "all",
 #'     year = "all",
 #'     povline = 1.9,
 #'     group_by = "wb",
@@ -55,18 +55,18 @@
 #' @export
 #'
 pip_old <- function(country         = "ALL",
-                year            = "ALL",
-                povline         = 1.9,
-                popshare        = NULL,
-                fill_gaps       = FALSE,
-                group_by        = c("none", "wb"),
-                welfare_type    = c("all", "consumption", "income"),
-                reporting_level = c("all", "national", "rural", "urban"),
-                ppp             = NULL,
-                lkup,
-                censor          = TRUE,
-                lkup_hash       = lkup$cache_data_id$hash_pip,
-                additional_ind  = FALSE) {
+                    year            = "ALL",
+                    povline         = 1.9,
+                    popshare        = NULL,
+                    fill_gaps       = FALSE,
+                    group_by        = c("none", "wb"),
+                    welfare_type    = c("all", "consumption", "income"),
+                    reporting_level = c("all", "national", "rural", "urban"),
+                    ppp             = NULL,
+                    lkup,
+                    censor          = TRUE,
+                    lkup_hash       = lkup$cache_data_id$hash_pip,
+                    additional_ind  = FALSE) {
 
 
   # set up -------------
diff --git a/R/pip_zp_old.R b/R/pip_old_lineups.R
similarity index 96%
rename from R/pip_zp_old.R
rename to R/pip_old_lineups.R
index ee22541b..8994a3ba 100644
--- a/R/pip_zp_old.R
+++ b/R/pip_old_lineups.R
@@ -1,4 +1,4 @@
-#' Compute PIP statistics
+#' Compute PIP statistics - Old lineups function
 #'
 #' Compute the main PIP poverty and inequality statistics.
 #'
@@ -27,33 +27,33 @@
 #' lkups <- create_lkups("<data-folder>")
 #'
 #' # A single country and year
-#' pip(country = "AGO",
+#' pip_old_lineups(country = "AGO",
 #'     year = 2000,
 #'     povline = 1.9,
 #'     lkup = lkups)
 #'
 #' # All years for a single country
-#' pip(country = "AGO",
+#' pip_old_lineups(country = "AGO",
 #'     year = "all",
 #'     povline = 1.9,
 #'     lkup = lkups)
 #'
 #' # Fill gaps
-#' pip(country = "AGO",
+#' pip_old_lineups(country = "AGO",
 #'     year = "all",
 #'     povline = 1.9,
 #'     fill_gaps = TRUE,
 #'     lkup = lkups)
 #'
 #' # Group by regions
-#' pip(country = "all",
+#' pip_old_lineups(country = "all",
 #'     year = "all",
 #'     povline = 1.9,
 #'     group_by = "wb",
 #'     lkup = lkups)
 #' }
 #' @export
-pip_zp_old <- function(country         = "ALL",
+pip_old_lineups <- function(country         = "ALL",
                 year            = "ALL",
                 povline         = 1.9,
                 popshare        = NULL,

From 1081c77886d19c40e75b500671179a2c2c771c30 Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Thu, 14 Aug 2025 11:02:24 -0400
Subject: [PATCH 036/203] document lineups functions

---
 NAMESPACE                                   |  4 +
 man/add_dist_stats.Rd                       |  2 +-
 man/add_dist_stats_old.Rd                   | 19 +++++
 man/compute_fgt_dt.Rd                       |  2 +-
 man/compute_fgt_dt_old.Rd                   | 24 ++++++
 man/fg_assign_nas_values_to_dup_cols_old.Rd | 19 +++++
 man/fg_pip.Rd                               |  6 ++
 man/fg_pip_old.Rd                           | 42 ++++++++++
 man/fg_remove_duplicates_old.Rd             | 25 ++++++
 man/fg_standardize_cache_id_old.Rd          | 21 +++++
 man/load_data_list_old.Rd                   | 18 ++++
 man/pip.Rd                                  |  4 +
 man/pip_new_lineups.Rd                      | 91 +++++++++++++++++++++
 man/pip_old.Rd                              |  8 +-
 man/pip_old_lineups.Rd                      | 91 +++++++++++++++++++++
 man/rg_pip_old.Rd                           | 17 +++-
 man/use_new_lineup_version.Rd               | 29 +++++++
 17 files changed, 415 insertions(+), 7 deletions(-)
 create mode 100644 man/add_dist_stats_old.Rd
 create mode 100644 man/compute_fgt_dt_old.Rd
 create mode 100644 man/fg_assign_nas_values_to_dup_cols_old.Rd
 create mode 100644 man/fg_pip_old.Rd
 create mode 100644 man/fg_remove_duplicates_old.Rd
 create mode 100644 man/fg_standardize_cache_id_old.Rd
 create mode 100644 man/load_data_list_old.Rd
 create mode 100644 man/pip_new_lineups.Rd
 create mode 100644 man/pip_old_lineups.Rd
 create mode 100644 man/use_new_lineup_version.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 207921ed..5f8d1ce5 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,6 +1,7 @@
 # Generated by roxygen2: do not edit by hand
 
 export(add_dist_stats)
+export(add_dist_stats_old)
 export(assign_serializer)
 export(change_grouped_stats_to_csv)
 export(citation_from_version)
@@ -26,7 +27,9 @@ export(load_inter_cache)
 export(pip)
 export(pip_grp)
 export(pip_grp_logic)
+export(pip_new_lineups)
 export(pip_old)
+export(pip_old_lineups)
 export(pipgd_lorenz_curve)
 export(return_correct_version)
 export(return_if_exists)
@@ -44,6 +47,7 @@ export(ui_pc_regional)
 export(ui_svy_meta)
 export(unnest_dt_longer)
 export(update_master_file)
+export(use_new_lineup_version)
 export(valid_years)
 export(validate_input_grouped_stats)
 export(version_dataframe)
diff --git a/man/add_dist_stats.Rd b/man/add_dist_stats.Rd
index 8b9ba3bf..5ccd113c 100644
--- a/man/add_dist_stats.Rd
+++ b/man/add_dist_stats.Rd
@@ -4,7 +4,7 @@
 \alias{add_dist_stats}
 \title{Add pre-computed distributional stats}
 \usage{
-add_dist_stats(df, dist_stats)
+add_dist_stats(df, lkup, fill_gaps)
 }
 \arguments{
 \item{df}{data.table: Data frame of poverty statistics}
diff --git a/man/add_dist_stats_old.Rd b/man/add_dist_stats_old.Rd
new file mode 100644
index 00000000..e59eb531
--- /dev/null
+++ b/man/add_dist_stats_old.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{add_dist_stats_old}
+\alias{add_dist_stats_old}
+\title{Add pre-computed distributional stats}
+\usage{
+add_dist_stats_old(df, dist_stats)
+}
+\arguments{
+\item{df}{data.table: Data frame of poverty statistics}
+
+\item{dist_stats}{data.table: Distributional stats lookup}
+}
+\value{
+data.table
+}
+\description{
+Add pre-computed distributional stats
+}
diff --git a/man/compute_fgt_dt.Rd b/man/compute_fgt_dt.Rd
index 80504ccb..a57b0748 100644
--- a/man/compute_fgt_dt.Rd
+++ b/man/compute_fgt_dt.Rd
@@ -4,7 +4,7 @@
 \alias{compute_fgt_dt}
 \title{Title}
 \usage{
-compute_fgt_dt(dt, welfare, weight, povlines)
+compute_fgt_dt(dt, welfare, weight, povlines, mean_and_med = FALSE)
 }
 \arguments{
 \item{dt}{data frame with \code{welfare} and \code{weight} columns}
diff --git a/man/compute_fgt_dt_old.Rd b/man/compute_fgt_dt_old.Rd
new file mode 100644
index 00000000..82c9babd
--- /dev/null
+++ b/man/compute_fgt_dt_old.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/rg_pip_old.R
+\name{compute_fgt_dt_old}
+\alias{compute_fgt_dt_old}
+\title{Title}
+\usage{
+compute_fgt_dt_old(dt, welfare, weight, povlines)
+}
+\arguments{
+\item{dt}{data frame with \code{welfare} and \code{weight} columns}
+
+\item{welfare}{character: welfare variable name}
+
+\item{weight}{character: weight variable name}
+
+\item{povlines}{double: vector with poveryt lines}
+}
+\value{
+data.table with estimates poverty estimates
+}
+\description{
+Title
+}
+\keyword{internal}
diff --git a/man/fg_assign_nas_values_to_dup_cols_old.Rd b/man/fg_assign_nas_values_to_dup_cols_old.Rd
new file mode 100644
index 00000000..da765eea
--- /dev/null
+++ b/man/fg_assign_nas_values_to_dup_cols_old.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/fg_pip_old.R
+\name{fg_assign_nas_values_to_dup_cols_old}
+\alias{fg_assign_nas_values_to_dup_cols_old}
+\title{OLD: Coerce variable causing potential duplicates to NAs}
+\usage{
+fg_assign_nas_values_to_dup_cols_old(df, cols)
+}
+\arguments{
+\item{df}{data.table: Table of results created in \code{fg_pip()}}
+
+\item{cols}{character: Columns with potential duplicate values}
+}
+\value{
+data.table
+}
+\description{
+OLD: Coerce variable causing potential duplicates to NAs
+}
diff --git a/man/fg_pip.Rd b/man/fg_pip.Rd
index 9012b01b..f83e836f 100644
--- a/man/fg_pip.Rd
+++ b/man/fg_pip.Rd
@@ -39,4 +39,10 @@ data.frame
 \description{
 Compute the main PIP poverty and inequality statistics for imputed years.
 }
+\keyword{#}
+\keyword{Add:}
+\keyword{ZP}
+\keyword{data}
 \keyword{internal}
+\keyword{load}
+\keyword{refy}
diff --git a/man/fg_pip_old.Rd b/man/fg_pip_old.Rd
new file mode 100644
index 00000000..456989ea
--- /dev/null
+++ b/man/fg_pip_old.Rd
@@ -0,0 +1,42 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/fg_pip_old.R
+\name{fg_pip_old}
+\alias{fg_pip_old}
+\title{Compute imputed year stats}
+\usage{
+fg_pip_old(
+  country,
+  year,
+  povline,
+  popshare,
+  welfare_type,
+  reporting_level,
+  ppp,
+  lkup
+)
+}
+\arguments{
+\item{country}{character: Country ISO 3 codes}
+
+\item{year}{integer: Reporting year}
+
+\item{povline}{numeric: Poverty line}
+
+\item{popshare}{numeric: Proportion of the population living below the
+poverty line}
+
+\item{welfare_type}{character: Welfare type}
+
+\item{reporting_level}{character: Geographical reporting level}
+
+\item{ppp}{numeric: Custom Purchase Power Parity value}
+
+\item{lkup}{list: A list of lkup tables}
+}
+\value{
+data.frame
+}
+\description{
+Compute the main PIP poverty and inequality statistics for imputed years.
+}
+\keyword{internal}
diff --git a/man/fg_remove_duplicates_old.Rd b/man/fg_remove_duplicates_old.Rd
new file mode 100644
index 00000000..2190c4f1
--- /dev/null
+++ b/man/fg_remove_duplicates_old.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/fg_pip_old.R
+\name{fg_remove_duplicates_old}
+\alias{fg_remove_duplicates_old}
+\title{OLD: Remove duplicated rows created during the interpolation process}
+\usage{
+fg_remove_duplicates_old(
+  df,
+  cols = c("comparable_spell", "cpi", "display_cp", "gd_type", "path",
+    "predicted_mean_ppp", "survey_acronym", "survey_comparability", "survey_coverage",
+    "survey_id", "survey_mean_lcu", "survey_mean_ppp", "survey_median_lcu",
+    "survey_median_ppp", "survey_time", "survey_year", "surveyid_year")
+)
+}
+\arguments{
+\item{df}{data.table: Table of results created in \code{fg_pip()}}
+
+\item{cols}{character: Columns with potential duplicate values}
+}
+\value{
+data.table
+}
+\description{
+OLD: Remove duplicated rows created during the interpolation process
+}
diff --git a/man/fg_standardize_cache_id_old.Rd b/man/fg_standardize_cache_id_old.Rd
new file mode 100644
index 00000000..fd56a1b4
--- /dev/null
+++ b/man/fg_standardize_cache_id_old.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/fg_pip_old.R
+\name{fg_standardize_cache_id_old}
+\alias{fg_standardize_cache_id_old}
+\title{OLD: Standardize cache_id format to avoid duplication of rows}
+\usage{
+fg_standardize_cache_id_old(cache_id, interpolation_id, reporting_level)
+}
+\arguments{
+\item{cache_id}{character}
+
+\item{interpolation_id}{character}
+
+\item{reporting_level}{character}
+}
+\value{
+character
+}
+\description{
+OLD: Standardize cache_id format to avoid duplication of rows
+}
diff --git a/man/load_data_list_old.Rd b/man/load_data_list_old.Rd
new file mode 100644
index 00000000..b6ea172f
--- /dev/null
+++ b/man/load_data_list_old.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/rg_pip_old.R
+\name{load_data_list_old}
+\alias{load_data_list_old}
+\title{OLD: load survey year files and store them in a list}
+\usage{
+load_data_list_old(metadata)
+}
+\arguments{
+\item{metadata}{data frame from \code{subset_lkup()}}
+}
+\value{
+list with survey years data
+}
+\description{
+OLD: load survey year files and store them in a list
+}
+\keyword{internal}
diff --git a/man/pip.Rd b/man/pip.Rd
index 3e31f031..0161eeef 100644
--- a/man/pip.Rd
+++ b/man/pip.Rd
@@ -57,6 +57,10 @@ data.table
 \description{
 Compute the main PIP poverty and inequality statistics.
 }
+\details{
+This function is a wrapper around the \link{pip_new_lineups} and \link{pip_old_lineups}
+functions.
+}
 \examples{
 \dontrun{
 # Create lkups
diff --git a/man/pip_new_lineups.Rd b/man/pip_new_lineups.Rd
new file mode 100644
index 00000000..eded364f
--- /dev/null
+++ b/man/pip_new_lineups.Rd
@@ -0,0 +1,91 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/pip.R
+\name{pip_new_lineups}
+\alias{pip_new_lineups}
+\title{Compute PIP statistics}
+\usage{
+pip_new_lineups(
+  country = "ALL",
+  year = "ALL",
+  povline = 1.9,
+  popshare = NULL,
+  fill_gaps = FALSE,
+  group_by = c("none", "wb"),
+  welfare_type = c("all", "consumption", "income"),
+  reporting_level = c("all", "national", "rural", "urban"),
+  ppp = NULL,
+  lkup,
+  censor = TRUE,
+  lkup_hash = lkup$cache_data_id$hash_pip,
+  additional_ind = FALSE
+)
+}
+\arguments{
+\item{country}{character: Country ISO 3 codes}
+
+\item{year}{integer: Reporting year}
+
+\item{povline}{numeric: Poverty line}
+
+\item{popshare}{numeric: Proportion of the population living below the
+poverty line}
+
+\item{fill_gaps}{logical: If set to TRUE, will interpolate / extrapolate
+values for missing years}
+
+\item{group_by}{character: Will return aggregated values for predefined
+sub-groups}
+
+\item{welfare_type}{character: Welfare type}
+
+\item{reporting_level}{character: Geographical reporting level}
+
+\item{ppp}{numeric: Custom Purchase Power Parity value}
+
+\item{lkup}{list: A list of lkup tables}
+
+\item{censor}{logical: Triggers censoring of country/year statistics}
+
+\item{lkup_hash}{character: hash of pip}
+
+\item{additional_ind}{logical: If TRUE add new set of indicators. Default if
+FALSE}
+}
+\value{
+data.table
+}
+\description{
+Compute the main PIP poverty and inequality statistics.
+}
+\examples{
+\dontrun{
+# Create lkups
+lkups <- create_lkups("<data-folder>")
+
+# A single country and year
+pip_new_lineups(country = "AGO",
+    year = 2000,
+    povline = 1.9,
+    lkup = lkups)
+
+# All years for a single country
+pip_new_lineups(country = "AGO",
+    year = "all",
+    povline = 1.9,
+    lkup = lkups)
+
+# Fill gaps
+pip_new_lineups(country = "AGO",
+    year = "all",
+    povline = 1.9,
+    fill_gaps = TRUE,
+    lkup = lkups)
+
+# Group by regions
+pip_new_lineups(country = "all",
+    year = "all",
+    povline = 1.9,
+    group_by = "wb",
+    lkup = lkups)
+}
+}
diff --git a/man/pip_old.Rd b/man/pip_old.Rd
index 2c1ff5a2..1a725396 100644
--- a/man/pip_old.Rd
+++ b/man/pip_old.Rd
@@ -63,26 +63,26 @@ Compute the main PIP poverty and inequality statistics.
 lkups <- create_lkups("<data-folder>")
 
 # A single country and year
-pip(country = "AGO",
+pip_old(country = "AGO",
     year = 2000,
     povline = 1.9,
     lkup = lkups)
 
 # All years for a single country
-pip(country = "AGO",
+pip_old(country = "AGO",
     year = "all",
     povline = 1.9,
     lkup = lkups)
 
 # Fill gaps
-pip(country = "AGO",
+pip_old(country = "AGO",
     year = "all",
     povline = 1.9,
     fill_gaps = TRUE,
     lkup = lkups)
 
 # Group by regions
-pip(country = "all",
+pip_old(country = "all",
     year = "all",
     povline = 1.9,
     group_by = "wb",
diff --git a/man/pip_old_lineups.Rd b/man/pip_old_lineups.Rd
new file mode 100644
index 00000000..1a4dbb71
--- /dev/null
+++ b/man/pip_old_lineups.Rd
@@ -0,0 +1,91 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/pip_old_lineups.R
+\name{pip_old_lineups}
+\alias{pip_old_lineups}
+\title{Compute PIP statistics - Old lineups function}
+\usage{
+pip_old_lineups(
+  country = "ALL",
+  year = "ALL",
+  povline = 1.9,
+  popshare = NULL,
+  fill_gaps = FALSE,
+  group_by = c("none", "wb"),
+  welfare_type = c("all", "consumption", "income"),
+  reporting_level = c("all", "national", "rural", "urban"),
+  ppp = NULL,
+  lkup,
+  censor = TRUE,
+  lkup_hash = lkup$cache_data_id$hash_pip,
+  additional_ind = FALSE
+)
+}
+\arguments{
+\item{country}{character: Country ISO 3 codes}
+
+\item{year}{integer: Reporting year}
+
+\item{povline}{numeric: Poverty line}
+
+\item{popshare}{numeric: Proportion of the population living below the
+poverty line}
+
+\item{fill_gaps}{logical: If set to TRUE, will interpolate / extrapolate
+values for missing years}
+
+\item{group_by}{character: Will return aggregated values for predefined
+sub-groups}
+
+\item{welfare_type}{character: Welfare type}
+
+\item{reporting_level}{character: Geographical reporting level}
+
+\item{ppp}{numeric: Custom Purchase Power Parity value}
+
+\item{lkup}{list: A list of lkup tables}
+
+\item{censor}{logical: Triggers censoring of country/year statistics}
+
+\item{lkup_hash}{character: hash of pip}
+
+\item{additional_ind}{logical: If TRUE add new set of indicators. Default if
+FALSE}
+}
+\value{
+data.table
+}
+\description{
+Compute the main PIP poverty and inequality statistics.
+}
+\examples{
+\dontrun{
+# Create lkups
+lkups <- create_lkups("<data-folder>")
+
+# A single country and year
+pip_old_lineups(country = "AGO",
+    year = 2000,
+    povline = 1.9,
+    lkup = lkups)
+
+# All years for a single country
+pip_old_lineups(country = "AGO",
+    year = "all",
+    povline = 1.9,
+    lkup = lkups)
+
+# Fill gaps
+pip_old_lineups(country = "AGO",
+    year = "all",
+    povline = 1.9,
+    fill_gaps = TRUE,
+    lkup = lkups)
+
+# Group by regions
+pip_old_lineups(country = "all",
+    year = "all",
+    povline = 1.9,
+    group_by = "wb",
+    lkup = lkups)
+}
+}
diff --git a/man/rg_pip_old.Rd b/man/rg_pip_old.Rd
index bee24977..d5cea86f 100644
--- a/man/rg_pip_old.Rd
+++ b/man/rg_pip_old.Rd
@@ -1,9 +1,20 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/pip_old.R
+% Please edit documentation in R/pip_old.R, R/rg_pip_old.R
 \name{rg_pip_old}
 \alias{rg_pip_old}
 \title{Compute survey year stats}
 \usage{
+rg_pip_old(
+  country,
+  year,
+  povline,
+  popshare,
+  welfare_type,
+  reporting_level,
+  ppp,
+  lkup
+)
+
 rg_pip_old(
   country,
   year,
@@ -34,9 +45,13 @@ poverty line}
 \item{lkup}{list: A list of lkup tables}
 }
 \value{
+data.frame
+
 data.frame
 }
 \description{
+Compute the main PIP poverty and inequality statistics for survey years.
+
 Compute the main PIP poverty and inequality statistics for survey years.
 }
 \keyword{internal}
diff --git a/man/use_new_lineup_version.Rd b/man/use_new_lineup_version.Rd
new file mode 100644
index 00000000..59360d6c
--- /dev/null
+++ b/man/use_new_lineup_version.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_lkups.R
+\name{use_new_lineup_version}
+\alias{use_new_lineup_version}
+\title{Should the new lineup approach be used?}
+\usage{
+use_new_lineup_version(x)
+}
+\arguments{
+\item{x}{A character vector where each element starts with an
+8-digit date in the format \code{YYYYMMDD}.}
+}
+\value{
+A logical vector: \code{TRUE} if the extracted date is
+after May 1st, 2025, otherwise \code{FALSE}.
+}
+\description{
+Check if the date in a string is more recent than May 2025
+}
+\details{
+This function extracts the first 8 characters from an input string,
+interprets them as a date in the format \code{YYYYMMDD}, and checks
+whether this date is more recent than May 1st, 2025.
+}
+\examples{
+use_new_lineup_version("20250401_2021_01_02_PROD") # FALSE
+use_new_lineup_version("20250615_2021_01_02_PROD") # TRUE
+
+}

From 68a4684c76de62ffb283c7dfb9b069a2210fc77c Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Thu, 14 Aug 2025 12:11:37 -0400
Subject: [PATCH 037/203] document

---
 man/ui_cp_charts.Rd         | 2 +-
 man/ui_cp_poverty_charts.Rd | 2 +-
 man/ui_hp_countries.Rd      | 2 +-
 man/ui_pc_charts.Rd         | 2 +-
 man/ui_pc_regional.Rd       | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/man/ui_cp_charts.Rd b/man/ui_cp_charts.Rd
index 010a71b1..03ea2a00 100644
--- a/man/ui_cp_charts.Rd
+++ b/man/ui_cp_charts.Rd
@@ -7,7 +7,7 @@
 ui_cp_charts(
   country = "AGO",
   povline = 1.9,
-  pop_units = 1e+06,
+  pop_units = 1000000,
   lkup,
   lkup_hash = lkup$cache_data_id$hash_ui_cp
 )
diff --git a/man/ui_cp_poverty_charts.Rd b/man/ui_cp_poverty_charts.Rd
index 3b211f31..dcfbb943 100644
--- a/man/ui_cp_poverty_charts.Rd
+++ b/man/ui_cp_poverty_charts.Rd
@@ -4,7 +4,7 @@
 \alias{ui_cp_poverty_charts}
 \title{CP Poverty Charts}
 \usage{
-ui_cp_poverty_charts(country, povline, pop_units, lkup)
+ui_cp_poverty_charts(country, povline, pop_units = 1000000, lkup)
 }
 \arguments{
 \item{country}{character: Country ISO 3 codes}
diff --git a/man/ui_hp_countries.Rd b/man/ui_hp_countries.Rd
index 2ffcf443..760f458e 100644
--- a/man/ui_hp_countries.Rd
+++ b/man/ui_hp_countries.Rd
@@ -7,7 +7,7 @@
 ui_hp_countries(
   country = c("IDN", "CIV"),
   povline = 1.9,
-  pop_units = 1e+06,
+  pop_units = 1000000,
   lkup
 )
 }
diff --git a/man/ui_pc_charts.Rd b/man/ui_pc_charts.Rd
index 49715b48..438dfca9 100644
--- a/man/ui_pc_charts.Rd
+++ b/man/ui_pc_charts.Rd
@@ -12,7 +12,7 @@ ui_pc_charts(
   group_by = "none",
   welfare_type = c("all", "consumption", "income"),
   reporting_level = c("all", "national", "rural", "urban"),
-  pop_units = 1e+06,
+  pop_units = 1000000,
   lkup
 )
 }
diff --git a/man/ui_pc_regional.Rd b/man/ui_pc_regional.Rd
index 083e418b..3cbde341 100644
--- a/man/ui_pc_regional.Rd
+++ b/man/ui_pc_regional.Rd
@@ -8,7 +8,7 @@ ui_pc_regional(
   country = "ALL",
   year = "ALL",
   povline = 1.9,
-  pop_units = 1e+06,
+  pop_units = 1000000,
   lkup
 )
 }

From b69b1c62a8a9190977ad65423ba01cd864b5fe4b Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Thu, 14 Aug 2025 12:12:09 -0400
Subject: [PATCH 038/203] Increment version number to 1.3.19.9000

---
 DESCRIPTION | 2 +-
 NEWS.md     | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 9f6f1693..ceb175a6 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: pipapi
 Title: API for the Poverty and Inequality Platform
-Version: 1.3.19
+Version: 1.3.19.9000
 Authors@R: 
     c(person(given = "Tony",
             family = "Fujs",
diff --git a/NEWS.md b/NEWS.md
index 55099b39..50441fb5 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,5 @@
+# pipapi (development version)
+
 # pipapi 1.3.19
 * fix issue with comparability
 

From d325cec19e0d53fd38b9f7e2d587f9ac9359d402 Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Thu, 14 Aug 2025 12:17:01 -0400
Subject: [PATCH 039/203] rm setting use_new_lneups to TRUE

---
 R/create_lkups.R | 2 --
 1 file changed, 2 deletions(-)

diff --git a/R/create_lkups.R b/R/create_lkups.R
index fdfa31ed..09da3a0c 100644
--- a/R/create_lkups.R
+++ b/R/create_lkups.R
@@ -75,8 +75,6 @@ create_lkups <- function(data_dir, versions) {
 
   # Use new lineup approach? -----
   use_new_lineup_version <- use_new_lineup_version(versions)
-  # ZP temp
-  use_new_lineup_version <- TRUE
 
   # Get survey paths ----
   paths <- list.files(fs::path(data_dir, "survey_data"))

From f17a5cc1179fb74440127851743f8148a7d77fa8 Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Thu, 14 Aug 2025 15:34:08 -0400
Subject: [PATCH 040/203] rm ref_lkup from new fg

---
 R/fg_pip.R | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/R/fg_pip.R b/R/fg_pip.R
index 7c5cbc02..d1df4c90 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -17,7 +17,6 @@ fg_pip <- function(country,
   valid_regions       <- lkup$query_controls$region$values
   interpolation_list  <- lkup$interpolation_list
   data_dir            <- lkup$data_root
-  ref_lkup            <- lkup$ref_lkup  # the normal refy table, some country-years have two rows (interpolation)
   refy_lkup           <- lkup$refy_lkup # cleaned refy table, unique by country-years but some columns removed in order to do that
 
   cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb")
@@ -79,7 +78,7 @@ fg_pip <- function(country,
       funique()
   }
   dtemp <-
-    ref_lkup |>
+    refy_lkup |>
     fsubset(country_code     %in% cntry &
               reporting_year %in% yr) |>
     fsubset(reporting_year %in% lkup$valid_years$lineup_years) |>

From ed57649d9f216b9c5fa958c1b2e23de6270e140e Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Thu, 14 Aug 2025 15:36:57 -0400
Subject: [PATCH 041/203] fix lkups for old fn

---
 R/create_lkups.R | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/R/create_lkups.R b/R/create_lkups.R
index 09da3a0c..531f7933 100644
--- a/R/create_lkups.R
+++ b/R/create_lkups.R
@@ -73,6 +73,7 @@ extract_data_dirs <-
 #' @return list
 create_lkups <- function(data_dir, versions) {
 
+
   # Use new lineup approach? -----
   use_new_lineup_version <- use_new_lineup_version(versions)
 
@@ -527,8 +528,11 @@ create_lkups <- function(data_dir, versions) {
 
   # CREATE OBJECT: valid_years ----
   valid_years <- valid_years(data_dir)
-  valid_years <- c(valid_years,
-                   lineup_years) # add lineup years
+  if (use_new_lineup_version) {
+    valid_years <- c(valid_years,
+                     lineup_years) # add lineup years
+
+  }
 
   # CREATE OBJECT: query_controls ----
   # Create list of query controls
@@ -636,7 +640,6 @@ create_lkups <- function(data_dir, versions) {
   lkup <- list(
     svy_lkup               = svy_lkup,
     ref_lkup               = ref_lkup,
-    refy_lkup              = refy_lkup,
     dist_stats             = dist_stats,
     pop_region             = pop_region,
     cp_lkups               = cp_lkups,
@@ -650,9 +653,13 @@ create_lkups <- function(data_dir, versions) {
     interpolation_list     = interpolation_list,
     valid_years            = valid_years,
     cache_data_id          = cache_data_id,
-    lineup_dist_stats      = lineup_dist_stats,
     use_new_lineup_version = use_new_lineup_version)
 
+  if (use_new_lineup_version) {
+    lkup$refy_lkup         <- refy_lkup
+    lkup$lineup_dist_stats <- lineup_dist_stats
+  }
+
   return(lkup)
 }
 

From f561be821c825e51065a7b95f2e07bc9384aa73e Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Mon, 18 Aug 2025 15:53:27 -0400
Subject: [PATCH 042/203] full list function, incl rm cache country-year

---
 R/fg_pip.R | 135 ++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 91 insertions(+), 44 deletions(-)

diff --git a/R/fg_pip.R b/R/fg_pip.R
index d1df4c90..a53f2478 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -57,47 +57,11 @@ fg_pip <- function(country,
                 data_in_cache = data_present_in_master))
   }
 
+  full_list <- create_full_list(country                = country,
+                                year                   = year,
+                                refy_lkup              = refy_lkup,
+                                data_present_in_master = data_present_in_master)
 
-  #'     # ZP Add: load refy data
-  #-------------------------
-  # Extract unique combinations of country-year
-  if (any(c("ALL", "WLD") %in% country)) {
-    cntry <- refy_lkup$country_code |>
-      funique()
-  } else {
-    cntry <- refy_lkup[country_code %in% country,
-                       ]$country_code |>
-      funique()
-  }
-  if (any(c("ALL") %in% year)) {
-    yr <- refy_lkup$reporting_year |>
-      unique()
-  } else {
-    yr <- refy_lkup[reporting_year %in% year,
-                    ]$reporting_year |>
-      funique()
-  }
-  dtemp <-
-    refy_lkup |>
-    fsubset(country_code     %in% cntry &
-              reporting_year %in% yr) |>
-    fsubset(reporting_year %in% lkup$valid_years$lineup_years) |>
-    fselect(country_code,
-            year = reporting_year) |>
-    funique()
-
-  # Split years by country
-  full_list <- dtemp[,
-                     .(year = list(year)),
-                     by = country_code][
-    , .(country_code, year = year)
-  ]
-
-  # Convert to desired structure
-  full_list <- list(
-    country_code = full_list$country_code,
-    year         = lapply(full_list$year,
-                          as.numeric))
   lt <-
     pipdata::load_list_refy(input_list = full_list,
                             path = fs::path(data_dir,
@@ -155,10 +119,14 @@ fg_pip <- function(country,
   #-------------------------
   metadata[,
            file := basename(path)]
+  print(metadata)
+
   # TO BE REMOVED, ONLY FOR TESTING!!!
   rlang::env_poke(env   = globalenv(),
-                  nm    = "metadata_check",
+                  nm    = "metadata_check2",
                   value = metadata)
+  metadata <- copy(metadata_check2)
+  #stop("---------------")
   # try metadata unique code
   tmp_metadata <- metadata
   # Handle multiple distribution types (for aggregated distributions)
@@ -178,17 +146,21 @@ fg_pip <- function(country,
     } else {
       NA
     }}),
-    by = reporting_year, .SDcols = meta_vars]
+    by      = c("reporting_year", "country_code", "reporting_level", "welfare_type"),
+    .SDcols = meta_vars]
 
   # Remove duplicate rows by reporting_year (keep only one row per
   # reporting_year)
-  tmp_metadata_unique <- unique(tmp_metadata, by = "reporting_year")
+  tmp_metadata_unique <- funique(tmp_metadata)
   tmp_metadata_unique[,
                       file := paste0(country_code,
                                      "_",
                                      reporting_year)]
+  # rlang::env_poke(env   = globalenv(),
+  #                 nm    = "tmp_metadata_unique_check",
+  #                 value = tmp_metadata_unique)
   rlang::env_poke(env   = globalenv(),
-                  nm    = "tmp_metadata_unique_check",
+                  nm    = "tmp_metadata_unique_check2",
                   value = tmp_metadata_unique)
   rlang::env_poke(env   = globalenv(),
                   nm    = "res_final",
@@ -236,6 +208,13 @@ fg_pip <- function(country,
     out[, max_year := NULL]
   }
 
+  # in_cache <- data_present_in_master |>
+  #   fmutate(file = paste(country_code, reporting_year, sep = "_")) |>
+  #   fselect(file) |>
+  #   reg_elem()
+  # # rm rows in out that are in cache
+  # out <- out[!(file %in% in_cache)]
+
   return(list(main_data     = out,
               data_in_cache = data_present_in_master))
 
@@ -319,3 +298,71 @@ fg_assign_nas_values_to_dup_cols <- function(df,
   df[, (cols) := NA]
   return(df)
 }
+
+
+
+
+
+
+#' Create full list for fg data load, not including country-years in cache
+#'
+#' @param country Country selected in [fg_pip] function
+#' @param year Year/s selected in [fg_pip] function
+#' @param refy_lkup reference year lkup table with full lineups and new method
+#' @param data_present_in_master cache data
+#'
+#' @return list
+create_full_list <- function(country, year, refy_lkup, data_present_in_master) {
+
+  if (!is.null(data_present_in_master)) {
+    data_not_in_cache <-
+      joyn::anti_join(x =  refy_lkup,
+                      y =  data_present_in_master,
+                      by = c("country_code", "reporting_year"))
+  } else {
+    data_not_in_cache <- refy_lkup
+  }
+
+  # Extract unique combinations of country-year
+  if (any(c("ALL", "WLD") %in% country)) {
+    cntry <- data_not_in_cache$country_code |>
+      funique()
+  } else {
+    cntry <- data_not_in_cache[country_code %in% country,
+    ]$country_code |>
+      funique()
+  }
+  if (any(c("ALL") %in% year)) {
+    yr <- data_not_in_cache$reporting_year |>
+      unique()
+  } else {
+    yr <- data_not_in_cache[reporting_year %in% year,
+    ]$reporting_year |>
+      funique()
+  }
+  dtemp <-
+    data_not_in_cache |>
+    fsubset(country_code     %in% cntry &
+              reporting_year %in% yr) |>
+    fsubset(reporting_year %in% lkup$valid_years$lineup_years) |>
+    fselect(country_code,
+            year = reporting_year) |>
+    funique()
+
+  # Split years by country
+  full_list <- dtemp[,
+                     .(year = list(year)),
+                     by = country_code][
+                       , .(country_code, year = year)
+                     ]
+
+  # Convert to desired structure
+  full_list <- list(
+    country_code = full_list$country_code,
+    year         = lapply(full_list$year,
+                          as.numeric))
+
+
+  full_list
+
+}

From 5394ac494801a7f5b64a9b5b18551250c0615306 Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Mon, 18 Aug 2025 16:22:38 -0400
Subject: [PATCH 043/203] comment out env save checks

---
 R/fg_pip.R | 60 ++++++++++++++++++++++++++++--------------------------
 1 file changed, 31 insertions(+), 29 deletions(-)

diff --git a/R/fg_pip.R b/R/fg_pip.R
index a53f2478..58993206 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -82,9 +82,9 @@ fg_pip <- function(country,
                   x
                })
 
-  rlang::env_poke(env   = globalenv(),
-                  nm    = "pipdata_list",
-                  value = lt)
+  # rlang::env_poke(env   = globalenv(),
+  #                 nm    = "pipdata_list",
+  #                 value = lt)
 
   # ZP Add: do fgt estimations using `res <- lapply(lt, process_dt, povline = povline)`
   #-------------------------
@@ -95,10 +95,10 @@ fg_pip <- function(country,
   res <- rbindlist(res,
                    fill = TRUE)
 
-  # TO BE REMOVED, ONLY FOR TESTING!!!
-  rlang::env_poke(env   = globalenv(),
-                  nm    = "res_povest",
-                  value = res)
+  # # TO BE REMOVED, ONLY FOR TESTING!!!
+  # rlang::env_poke(env   = globalenv(),
+  #                 nm    = "res_povest",
+  #                 value = res)
 
   # ZP Add: join to dist_stats
   #-------------------------
@@ -122,10 +122,10 @@ fg_pip <- function(country,
   print(metadata)
 
   # TO BE REMOVED, ONLY FOR TESTING!!!
-  rlang::env_poke(env   = globalenv(),
-                  nm    = "metadata_check2",
-                  value = metadata)
-  metadata <- copy(metadata_check2)
+  # rlang::env_poke(env   = globalenv(),
+  #                 nm    = "metadata_check2",
+  #                 value = metadata)
+  # metadata <- copy(metadata_check2)
   #stop("---------------")
   # try metadata unique code
   tmp_metadata <- metadata
@@ -159,12 +159,12 @@ fg_pip <- function(country,
   # rlang::env_poke(env   = globalenv(),
   #                 nm    = "tmp_metadata_unique_check",
   #                 value = tmp_metadata_unique)
-  rlang::env_poke(env   = globalenv(),
-                  nm    = "tmp_metadata_unique_check2",
-                  value = tmp_metadata_unique)
-  rlang::env_poke(env   = globalenv(),
-                  nm    = "res_final",
-                  value = res)
+  # rlang::env_poke(env   = globalenv(),
+  #                 nm    = "tmp_metadata_unique_check2",
+  #                 value = tmp_metadata_unique)
+  # rlang::env_poke(env   = globalenv(),
+  #                 nm    = "res_final",
+  #                 value = res)
 
   out <- join(res,
               tmp_metadata_unique,
@@ -177,9 +177,9 @@ fg_pip <- function(country,
               drop.dup.cols = TRUE,
               verbose       = 0)
 
-  rlang::env_poke(env   = globalenv(),
-                  nm    = "out_check",
-                  value = out)
+  # rlang::env_poke(env   = globalenv(),
+  #                 nm    = "out_check",
+  #                 value = out)
   out[, `:=`(
     file   = NULL
   )]
@@ -188,14 +188,14 @@ fg_pip <- function(country,
            "povline",
            "poverty_line")
 
-  rlang::env_poke(env   = globalenv(),
-                  nm    = "out1",
-                  value = out)
+  # rlang::env_poke(env   = globalenv(),
+  #                 nm    = "out1",
+  #                 value = out)
   # Ensure that out does not have duplicates
   out <- fg_remove_duplicates(out)
-  rlang::env_poke(env   = globalenv(),
-                  nm    = "out2",
-                  value = out)
+  # rlang::env_poke(env   = globalenv(),
+  #                 nm    = "out2",
+  #                 value = out)
 
   # Fix issue with rounding of poverty lines
   out[,
@@ -316,9 +316,11 @@ create_full_list <- function(country, year, refy_lkup, data_present_in_master) {
 
   if (!is.null(data_present_in_master)) {
     data_not_in_cache <-
-      joyn::anti_join(x =  refy_lkup,
-                      y =  data_present_in_master,
-                      by = c("country_code", "reporting_year"))
+      joyn::anti_join(x         =  refy_lkup,
+                      y         =  data_present_in_master,
+                      by        = c("country_code", "reporting_year"),
+                      reportvar = FALSE,
+                      verbose   = FALSE)
   } else {
     data_not_in_cache <- refy_lkup
   }

From 7a1ae757cd0b0558c0f9e9fb454e13eebd17a853 Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Mon, 18 Aug 2025 16:23:20 -0400
Subject: [PATCH 044/203] Increment version number to 1.3.19.9001

---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index ceb175a6..a758dd4f 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: pipapi
 Title: API for the Poverty and Inequality Platform
-Version: 1.3.19.9000
+Version: 1.3.19.9001
 Authors@R: 
     c(person(given = "Tony",
             family = "Fujs",

From 129aa4142e0a60aca2db309c2fc5a5929a576367 Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Wed, 20 Aug 2025 15:39:12 -0400
Subject: [PATCH 045/203] make dist stats NA for fg pip

---
 R/pip.R | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/R/pip.R b/R/pip.R
index 7fe60b2b..bbd9e04f 100644
--- a/R/pip.R
+++ b/R/pip.R
@@ -359,10 +359,10 @@ pip_new_lineups <- function(country         = "ALL",
       # ZP temp NA lineups:
       #---------------------
     # ## Inequality indicators to NA for lineup years ----
-    #   dist_vars  <- names2keep[!(names2keep %in% crr_names)]
-    #   out[,
-    #       (dist_vars) := NA_real_]
-    #
+      dist_vars  <- names2keep[!(names2keep %in% crr_names)]
+      out[,
+          (dist_vars) := NA_real_]
+
       ## estimate_var -----
       out <- estimate_type_ctr_lnp(out, lkup)
 

From fa602a0589d4050085a407bf7456367b78828158 Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Thu, 21 Aug 2025 23:12:55 -0400
Subject: [PATCH 046/203] new attr to cols functions

---
 R/add_attributes_as_columns.R | 229 ++++++++++++++++++++++++++++++++++
 R/fg_pip.R                    |  55 ++++----
 2 files changed, 255 insertions(+), 29 deletions(-)
 create mode 100644 R/add_attributes_as_columns.R

diff --git a/R/add_attributes_as_columns.R b/R/add_attributes_as_columns.R
new file mode 100644
index 00000000..2617f1d1
--- /dev/null
+++ b/R/add_attributes_as_columns.R
@@ -0,0 +1,229 @@
+#' Add attributes as columns (vectorized, in-place)
+#'
+#' @description
+#' Converts survey attributes on a `data.table`—including
+#' `reporting_level_rows`, `country_code`, `reporting_year`, and `dist_stats`—
+#' into columns using a **loop-free, segment-replication** strategy. Designed
+#' for very large tables and objects loaded via `readRDS()`/`load()`:
+#' uses `setDT()` and `alloc.col()` to ensure in-place assignment.
+#'
+#' @details
+#' The function expects an attribute `reporting_level_rows`, a list with:
+#' - `reporting_level`: character vector of the level label for each segment
+#'   (e.g., `c("rural","urban","rural", ...)`).
+#' - `rows`: integer vector of **cumulative** row-ends (e.g.,
+#'   `c(100000, 200000, 300000, ...)`).
+#'
+#' Segment lengths are computed as `diff(c(0L, rows))`, and `reporting_level`
+#' is replicated with `rep.int(lev, counts)`. Constants `country_code`,
+#' `reporting_year`, and `file` (`paste0(country_code, "_", reporting_year)`)
+#' are added to all rows. If `dist_stats$mean` / `dist_stats$median` are
+#' provided (as named vectors/lists keyed by level), they are mapped by level
+#' name and replicated per segment. If a level is missing from the names,
+#' `NA` values may result for that segment.
+#'
+#' This implementation avoids loops and `findInterval()` edge cases, and
+#' modifies `dt` by reference.
+#'
+#' @param dt A `data.table` carrying the attributes described above.
+#'
+#' @return The same `data.table`, modified by reference, with added columns:
+#'   `reporting_level`, `country_code`, `reporting_year`, `file`, and (if
+#'   present) `mean`, `median`.
+#'
+#' @section Assumptions:
+#' * `length(reporting_level_rows$reporting_level) == length(reporting_level_rows$rows)`.
+#' * `rows` are cumulative and non-decreasing, and their segment lengths sum to `nrow(dt)`.
+#' * If `dist_stats$mean` / `dist_stats$median` have multiple values, their names
+#'   align with the level labels.
+#'
+#' @note For objects loaded from disk (e.g., via `readRDS()`), `alloc.col(dt)`
+#'       ensures there is spare column capacity for by-reference assignment.
+#'
+#' @seealso [add_attributes_as_columns_multi()], [assign_stat()]
+#'
+#' @examples
+#' \dontrun{
+#' library(data.table)
+#' dt <- data.table(weight = 1:6, welfare = runif(6))
+#' attr(dt, "reporting_level_rows") <- list(
+#'   reporting_level = c("rural","urban","rural"),
+#'   rows = c(2L, 4L, 6L)
+#' )
+#' attr(dt, "country_code")   <- "XXY"
+#' attr(dt, "reporting_year") <- 2000L
+#' attr(dt, "dist_stats") <- list(
+#'   mean   = list(rural = 2.5, urban = 5.0),
+#'   median = list(rural = 2.0, urban = 4.5)
+#' )
+#'
+#' add_attributes_as_columns_vectorized(dt)
+#' head(dt)
+#' }
+#'
+#' @import data.table
+#' @export
+add_attributes_as_columns_vectorized <- function(dt) {
+  # Ensure proper internal state & spare column capacity (handles readRDS/load cases)
+  setDT(dt)        # harmless if already a data.table
+  alloc.col(dt)    # pre-allocate room for new columns
+
+  rl   <- attr(dt, "reporting_level_rows")
+  lev  <- rl$reporting_level
+  rows <- as.integer(rl$rows)
+  n    <- nrow(dt)
+
+  counts <- diff(c(0L, rows))
+  if (sum(counts) != n) stop("Sum of 'rows' in attribute does not equal nrow(dt).")
+
+  # reporting_level: vectorized, no loops, no findInterval edge cases
+  dt[, reporting_level := rep.int(lev, counts)]
+
+  # constants
+  cc <- attr(dt, "country_code")
+  ry <- attr(dt, "reporting_year")
+  dt[, `:=`(
+    country_code   = cc,
+    reporting_year = ry,
+    file           = paste0(cc, "_", ry)
+  )]
+
+  # dist_stats per reporting_level (align by names, then replicate by counts)
+  ds <- attr(dt, "dist_stats")
+  if (length(ds)) {
+    if (!is.null(ds$mean)) {
+      m <- unlist(ds$mean, use.names = TRUE)
+      dt[, mean := rep.int(unname(m[match(lev, names(m))]), counts)]
+    }
+    if (!is.null(ds$median)) {
+      md <- unlist(ds$median, use.names = TRUE)
+      dt[, median := rep.int(unname(md[match(lev, names(md))]), counts)]
+    }
+  }
+
+  dt
+}
+
+
+
+#' Add attributes as columns for multi-segment reporting levels
+#'
+#' @description
+#' Converts attributes on a survey `data.table` (e.g., `reporting_level_rows`,
+#' `country_code`, `reporting_year`, and `dist_stats`) into columns, handling
+#' **multiple alternating segments** (e.g., CHN rural/urban/rural/urban) or
+#' single-segment cases (e.g., ZAF).
+#'
+#' @param dt A `data.table` with attributes:
+#'   - `reporting_level_rows`: list with `reporting_level` (character) and
+#'     `rows` (integer cumulative row ends).
+#'   - `country_code` (character).
+#'   - `reporting_year` (integer/numeric).
+#'   - `dist_stats` (list) optionally containing `mean` and/or `median`, each as
+#'     a named list/vector keyed by reporting level, or a single scalar.
+#'
+#' @return The same `data.table`, modified by reference, with new columns:
+#'   `reporting_level`, `country_code`, `reporting_year`, `file`, and
+#'   optionally `mean`, `median`.
+#'
+#' @examples
+#' # chn2000_cols <- add_attributes_as_columns_multi(chn2000)
+#' # zaf2000_cols <- add_attributes_as_columns_multi(zaf2000)
+#' @import data.table
+#' @export
+add_attributes_as_columns_multi <- function(dt) {
+  # Ensure DT internals and spare capacity for new columns
+  setDT(dt)
+  alloc.col(dt)
+
+  # --- Pull + validate segment metadata ---
+  rl <- attr(dt, "reporting_level_rows")
+  if (is.null(rl) || is.null(rl$reporting_level) || is.null(rl$rows)) {
+    stop("Missing 'reporting_level_rows' attribute with $reporting_level and $rows.")
+  }
+  lev  <- as.character(rl$reporting_level)
+  rows <- as.integer(rl$rows)
+  n    <- nrow(dt)
+
+  if (length(lev) != length(rows)) stop("'reporting_level' and 'rows' lengths differ.")
+  if (length(rows) == 0L) stop("'rows' is empty.")
+  if (any(diff(rows) < 0L)) stop("'rows' must be non-decreasing.")
+  if (rows[length(rows)] != n) stop("Last element of 'rows' must equal nrow(dt).")
+
+  counts <- diff(c(0L, rows))
+  if (any(counts <= 0L)) stop("Computed non-positive segment length(s).")
+
+  # --- reporting_level: vectorized per-segment replication ---
+  dt[, reporting_level := rep.int(lev, counts)]
+
+  # --- constants ---
+  cc <- attr(dt, "country_code")
+  ry <- attr(dt, "reporting_year")
+  dt[, `:=`(
+    country_code   = cc,
+    reporting_year = ry,
+    file           = paste0(cc, "_", ry)
+  )]
+
+  # --- distribution stats ---
+  ds <- attr(dt, "dist_stats")
+  if (length(ds)) {
+    assign_stat(dt, lev, counts, ds$mean,   "mean")
+    assign_stat(dt, lev, counts, ds$median, "median")
+  }
+
+  dt
+}
+
+
+
+#' Assign a per-level statistic to a data.table column (by reference)
+#'
+#' @description
+#' Replicates a statistic per reporting-level segment and assigns it to a new
+#' column in `dt`, **in place**. `stat` can be a scalar (broadcast), a named
+#' vector, or a named list (one value per level).
+#'
+#' @param dt A `data.table`. Modified by reference.
+#' @param lev Character vector of reporting-level labels per segment
+#'   (e.g., `c("rural","urban","rural", ...)`).
+#' @param counts Integer vector of segment lengths matching `lev`
+#'   (e.g., `c(100000, 100000, 100000, ...)`).
+#' @param stat A numeric scalar, named vector, or named list with one value per
+#'   level (names must match `lev` values).
+#' @param colname Name of the column to create/overwrite.
+#'
+#' @return Invisibly returns `dt` (modified by reference).
+#' @examples
+#' # assign_stat(dt, lev, counts, list(rural = 2.6, urban = 5.5), "mean")
+#' @import data.table
+#' @export
+assign_stat <- function(dt, lev, counts, stat, colname) {
+  if (is.null(stat)) return(invisible(dt))
+  n <- nrow(dt)
+
+  v <- if (is.list(stat)) unlist(stat, use.names = TRUE) else stat
+
+  # Single scalar: broadcast
+  if (length(v) == 1L && is.null(names(v))) {
+    dt[, (colname) := rep.int(unname(v), n)]
+    return(invisible(dt))
+  }
+
+  # Need names to map values to levels
+  if (is.null(names(v))) {
+    stop("`stat` has length > 1 but no names; cannot map to levels.")
+  }
+
+  map_idx <- match(lev, names(v))
+  if (anyNA(map_idx)) {
+    missing_levels <- unique(lev[is.na(map_idx)])
+    stop(
+      sprintf("`stat` missing value(s) for level(s): %s",
+              paste(missing_levels, collapse = ", "))
+    )
+  }
+
+  dt[, (colname) := rep.int(unname(v[map_idx]), counts)]
+  invisible(dt)
+}
diff --git a/R/fg_pip.R b/R/fg_pip.R
index 58993206..3a075d67 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -52,7 +52,7 @@ fg_pip <- function(country,
 
   # Return empty dataframe if no metadata is found (i.e. all in cache)
   if (nrow(metadata) == 0) {
-    print("ZP: no metadata - i.e. nothing additional to estimate")
+    #print("ZP: no metadata - i.e. nothing additional to estimate")
     return(list(main_data     = pipapi::empty_response_fg,
                 data_in_cache = data_present_in_master))
   }
@@ -69,17 +69,8 @@ fg_pip <- function(country,
 
   lt <- lapply(lt,
                FUN = \(x) {
-                  x <- x |>
-                   pipdata::attr_to_column("reporting_level_rows") |> # only rep level????
-                   pipdata::attr_to_column("country_code") |>
-                   pipdata::attr_to_column("reporting_year") |>
-                    pipdata::attr_to_column("mean", dist_stats = TRUE) |>
-                    pipdata::attr_to_column("median", dist_stats = TRUE) |>
-                   fmutate(file = paste0(country_code,
-                                         "_",
-                                         reporting_year))
-
-                  x
+                  x |>
+                    add_attributes_as_columns_vectorized()
                })
 
   # rlang::env_poke(env   = globalenv(),
@@ -119,7 +110,6 @@ fg_pip <- function(country,
   #-------------------------
   metadata[,
            file := basename(path)]
-  print(metadata)
 
   # TO BE REMOVED, ONLY FOR TESTING!!!
   # rlang::env_poke(env   = globalenv(),
@@ -192,7 +182,8 @@ fg_pip <- function(country,
   #                 nm    = "out1",
   #                 value = out)
   # Ensure that out does not have duplicates
-  out <- fg_remove_duplicates(out)
+  out <- fg_remove_duplicates(out,
+                              use_new_lineup_version = lkup$use_new_lineup_version)
   # rlang::env_poke(env   = globalenv(),
   #                 nm    = "out2",
   #                 value = out)
@@ -246,22 +237,28 @@ fg_remove_duplicates <- function(df,
                                           "survey_median_ppp",
                                           "survey_time",
                                           "survey_year",
-                                          "surveyid_year")) {
-  # not all cols need to be changes
-  cols <- setdiff(cols,
-                  colnames(df))
-  # Modify cache_id
-  # * Ensures that cache_id is unique for both extrapolated and interpolated surveys
-  # * Ensures that cache_id can be kept as an output of fg_pip() while still removing duplicated rows
-  # df$cache_id <- fg_standardize_cache_id(cache_id = df$cache_id,
-  #                                        interpolation_id = df$data_interpolation_id,
-  #                                        reporting_level = df$reporting_level)
-  # Set collapse vars to NA (by type)
-  df <- fg_assign_nas_values_to_dup_cols(df   = df,
-                                         cols = cols)
+                                          "surveyid_year"),
+                                 use_new_lineup_version = FALSE) {
+
+  if (isFALSE(use_new_lineup_version)) {
+    print("here")
+    # not all cols need to be changes
+    cols <- setdiff(cols,
+                    colnames(df))
+    # Modify cache_id
+    # * Ensures that cache_id is unique for both extrapolated and interpolated surveys
+    # * Ensures that cache_id can be kept as an output of fg_pip() while still removing duplicated rows
+    # df$cache_id <- fg_standardize_cache_id(cache_id = df$cache_id,
+    #                                        interpolation_id = df$data_interpolation_id,
+    #                                        reporting_level = df$reporting_level)
+    # Set collapse vars to NA (by type)
+    df <- fg_assign_nas_values_to_dup_cols(df   = df,
+                                           cols = cols)
+
+    # Ensure that out does not have duplicates
+    df <- unique(df)
+  }
 
-  # Ensure that out does not have duplicates
-  df <- unique(df)
 
   return(df)
 }

From 1ca018f76179427aa9cb3c69294cf511565eb08b Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Thu, 21 Aug 2025 23:15:06 -0400
Subject: [PATCH 047/203] rm comments & print checks

---
 R/fg_pip.R | 56 +-----------------------------------------------------
 R/pip.R    | 18 +++++++-----------
 2 files changed, 8 insertions(+), 66 deletions(-)

diff --git a/R/fg_pip.R b/R/fg_pip.R
index 3a075d67..7e568ab5 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -73,10 +73,6 @@ fg_pip <- function(country,
                     add_attributes_as_columns_vectorized()
                })
 
-  # rlang::env_poke(env   = globalenv(),
-  #                 nm    = "pipdata_list",
-  #                 value = lt)
-
   # ZP Add: do fgt estimations using `res <- lapply(lt, process_dt, povline = povline)`
   #-------------------------
   res <- lapply(lt,
@@ -86,37 +82,11 @@ fg_pip <- function(country,
   res <- rbindlist(res,
                    fill = TRUE)
 
-  # # TO BE REMOVED, ONLY FOR TESTING!!!
-  # rlang::env_poke(env   = globalenv(),
-  #                 nm    = "res_povest",
-  #                 value = res)
-
-  # ZP Add: join to dist_stats
-  #-------------------------
-  # ly_dist <- lkup$lineup_dist_stats
-  # res <- res |>
-  #   joyn::left_join(y            = ly_dist,
-  #                   by           = c("file",
-  #                                    "reporting_level",
-  #                                    "reporting_year"),
-  #                   relationship = "one-to-one",
-  #                   reportvar    = FALSE,
-  #                   verbose      = FALSE)
-  # rlang::env_poke(env   = globalenv(),
-  #                 nm    = "res_dist",
-  #                 value = res)
-
   # ZP Add: join to metadata
   #-------------------------
   metadata[,
            file := basename(path)]
 
-  # TO BE REMOVED, ONLY FOR TESTING!!!
-  # rlang::env_poke(env   = globalenv(),
-  #                 nm    = "metadata_check2",
-  #                 value = metadata)
-  # metadata <- copy(metadata_check2)
-  #stop("---------------")
   # try metadata unique code
   tmp_metadata <- metadata
   # Handle multiple distribution types (for aggregated distributions)
@@ -146,15 +116,6 @@ fg_pip <- function(country,
                       file := paste0(country_code,
                                      "_",
                                      reporting_year)]
-  # rlang::env_poke(env   = globalenv(),
-  #                 nm    = "tmp_metadata_unique_check",
-  #                 value = tmp_metadata_unique)
-  # rlang::env_poke(env   = globalenv(),
-  #                 nm    = "tmp_metadata_unique_check2",
-  #                 value = tmp_metadata_unique)
-  # rlang::env_poke(env   = globalenv(),
-  #                 nm    = "res_final",
-  #                 value = res)
 
   out <- join(res,
               tmp_metadata_unique,
@@ -167,9 +128,6 @@ fg_pip <- function(country,
               drop.dup.cols = TRUE,
               verbose       = 0)
 
-  # rlang::env_poke(env   = globalenv(),
-  #                 nm    = "out_check",
-  #                 value = out)
   out[, `:=`(
     file   = NULL
   )]
@@ -178,15 +136,10 @@ fg_pip <- function(country,
            "povline",
            "poverty_line")
 
-  # rlang::env_poke(env   = globalenv(),
-  #                 nm    = "out1",
-  #                 value = out)
   # Ensure that out does not have duplicates
   out <- fg_remove_duplicates(out,
                               use_new_lineup_version = lkup$use_new_lineup_version)
-  # rlang::env_poke(env   = globalenv(),
-  #                 nm    = "out2",
-  #                 value = out)
+
 
   # Fix issue with rounding of poverty lines
   out[,
@@ -199,13 +152,6 @@ fg_pip <- function(country,
     out[, max_year := NULL]
   }
 
-  # in_cache <- data_present_in_master |>
-  #   fmutate(file = paste(country_code, reporting_year, sep = "_")) |>
-  #   fselect(file) |>
-  #   reg_elem()
-  # # rm rows in out that are in cache
-  # out <- out[!(file %in% in_cache)]
-
   return(list(main_data     = out,
               data_in_cache = data_present_in_master))
 
diff --git a/R/pip.R b/R/pip.R
index bbd9e04f..0f24ebb7 100644
--- a/R/pip.R
+++ b/R/pip.R
@@ -250,31 +250,27 @@ pip_new_lineups <- function(country         = "ALL",
   #---------------------------------------------
     cached_data <- qDT(out$data_in_cache)
     main_data   <- qDT(out$main_data)
-    print("A")
 
     if (nrow(main_data) > 0) {
 
-      print("B")
-
       if (is.null(out$data_in_cache)) {
-        print("C")
+
         out <- main_data
       } else {
-        print("D")
+
         if (fill_gaps) {
-          print("E")
-          cached_data <- fg_remove_duplicates(cached_data)
+
+          cached_data <- fg_remove_duplicates(cached_data,
+                                              use_new_lineup_version = lkup$use_new_lineup_version)
         }
-        print("F")
+
         out <- main_data |>
           rowbind(cached_data)
       }
-      print("G")
 
         update_master_file(main_data, cache_file_path, fill_gaps)
 
     } else {
-      print("H")
       out <- cached_data
     }
     if (!data.table::is.data.table(out)) {
@@ -293,7 +289,7 @@ pip_new_lineups <- function(country         = "ALL",
       }
     }
 
-    # Add out of pipeline variablse
+    # Add out of pipeline variable
     #---------------------------------------------
     add_vars_out_of_pipeline(out,
                              fill_gaps = fill_gaps,

From 7da94f68c0951eca3bf8127ddf1e484824529302 Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Thu, 21 Aug 2025 23:17:10 -0400
Subject: [PATCH 048/203] change checks to cli

---
 R/add_attributes_as_columns.R | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/R/add_attributes_as_columns.R b/R/add_attributes_as_columns.R
index 2617f1d1..f1cd461f 100644
--- a/R/add_attributes_as_columns.R
+++ b/R/add_attributes_as_columns.R
@@ -64,6 +64,7 @@
 #' @import data.table
 #' @export
 add_attributes_as_columns_vectorized <- function(dt) {
+
   # Ensure proper internal state & spare column capacity (handles readRDS/load cases)
   setDT(dt)        # harmless if already a data.table
   alloc.col(dt)    # pre-allocate room for new columns
@@ -74,10 +75,11 @@ add_attributes_as_columns_vectorized <- function(dt) {
   n    <- nrow(dt)
 
   counts <- diff(c(0L, rows))
-  if (sum(counts) != n) stop("Sum of 'rows' in attribute does not equal nrow(dt).")
+  if (sum(counts) != n) cli::cli_abort("Sum of 'rows' in attribute does not equal nrow(dt).")
 
   # reporting_level: vectorized, no loops, no findInterval edge cases
-  dt[, reporting_level := rep.int(lev, counts)]
+  dt[,
+     reporting_level := rep.int(lev, counts)]
 
   # constants
   cc <- attr(dt, "country_code")
@@ -93,11 +95,17 @@ add_attributes_as_columns_vectorized <- function(dt) {
   if (length(ds)) {
     if (!is.null(ds$mean)) {
       m <- unlist(ds$mean, use.names = TRUE)
-      dt[, mean := rep.int(unname(m[match(lev, names(m))]), counts)]
+      dt[,
+         mean := rep.int(unname(m[match(lev,
+                                        names(m))]),
+                         counts)]
     }
     if (!is.null(ds$median)) {
       md <- unlist(ds$median, use.names = TRUE)
-      dt[, median := rep.int(unname(md[match(lev, names(md))]), counts)]
+      dt[,
+         median := rep.int(unname(md[match(lev,
+                                           names(md))]),
+                           counts)]
     }
   }
 
@@ -139,19 +147,19 @@ add_attributes_as_columns_multi <- function(dt) {
   # --- Pull + validate segment metadata ---
   rl <- attr(dt, "reporting_level_rows")
   if (is.null(rl) || is.null(rl$reporting_level) || is.null(rl$rows)) {
-    stop("Missing 'reporting_level_rows' attribute with $reporting_level and $rows.")
+    cli::cli_abort("Missing 'reporting_level_rows' attribute with $reporting_level and $rows.")
   }
   lev  <- as.character(rl$reporting_level)
   rows <- as.integer(rl$rows)
   n    <- nrow(dt)
 
-  if (length(lev) != length(rows)) stop("'reporting_level' and 'rows' lengths differ.")
-  if (length(rows) == 0L) stop("'rows' is empty.")
-  if (any(diff(rows) < 0L)) stop("'rows' must be non-decreasing.")
-  if (rows[length(rows)] != n) stop("Last element of 'rows' must equal nrow(dt).")
+  if (length(lev) != length(rows)) cli::cli_abort("'reporting_level' and 'rows' lengths differ.")
+  if (length(rows) == 0L) cli::cli_abort("'rows' is empty.")
+  if (any(diff(rows) < 0L)) cli::cli_abort("'rows' must be non-decreasing.")
+  if (rows[length(rows)] != n) cli::cli_abort("Last element of 'rows' must equal nrow(dt).")
 
   counts <- diff(c(0L, rows))
-  if (any(counts <= 0L)) stop("Computed non-positive segment length(s).")
+  if (any(counts <= 0L)) cli::cli_abort("Computed non-positive segment length(s).")
 
   # --- reporting_level: vectorized per-segment replication ---
   dt[, reporting_level := rep.int(lev, counts)]

From bf68fccb26c400e2aa9e5625fd6c85a73f498afa Mon Sep 17 00:00:00 2001
From: zander-prinsloo <prinsloo.zander@gmail.com>
Date: Thu, 21 Aug 2025 23:17:42 -0400
Subject: [PATCH 049/203] Increment version number to 1.3.19.9002

---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index a758dd4f..fa2ae7c1 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: pipapi
 Title: API for the Poverty and Inequality Platform
-Version: 1.3.19.9001
+Version: 1.3.19.9002
 Authors@R: 
     c(person(given = "Tony",
             family = "Fujs",

From 4a25d5b54f194b74b2c8f8a8c0057246151eead4 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Fri, 22 Aug 2025 11:29:47 -0400
Subject: [PATCH 050/203] documnet and add pipdata functions into utils-pipdata

---
 NAMESPACE                                   |  4 ++
 R/fg_pip.R                                  | 18 +++--
 R/utils-pipdata.R                           | 49 +++++++++++++
 man/add_attributes_as_columns_multi.Rd      | 34 +++++++++
 man/add_attributes_as_columns_vectorized.Rd | 80 +++++++++++++++++++++
 man/assign_stat.Rd                          | 33 +++++++++
 man/create_full_list.Rd                     | 23 ++++++
 man/fg_pip.Rd                               |  6 --
 man/fg_remove_duplicates.Rd                 |  3 +-
 man/load_list_refy.Rd                       | 20 ++++++
 man/transform_input.Rd                      | 18 +++++
 man/ui_cp_charts.Rd                         |  2 +-
 man/ui_cp_poverty_charts.Rd                 |  2 +-
 man/ui_hp_countries.Rd                      |  2 +-
 man/ui_pc_charts.Rd                         |  2 +-
 man/ui_pc_regional.Rd                       |  2 +-
 16 files changed, 276 insertions(+), 22 deletions(-)
 create mode 100644 R/utils-pipdata.R
 create mode 100644 man/add_attributes_as_columns_multi.Rd
 create mode 100644 man/add_attributes_as_columns_vectorized.Rd
 create mode 100644 man/assign_stat.Rd
 create mode 100644 man/create_full_list.Rd
 create mode 100644 man/load_list_refy.Rd
 create mode 100644 man/transform_input.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 5f8d1ce5..2ee13c9d 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,8 +1,11 @@
 # Generated by roxygen2: do not edit by hand
 
+export(add_attributes_as_columns_multi)
+export(add_attributes_as_columns_vectorized)
 export(add_dist_stats)
 export(add_dist_stats_old)
 export(assign_serializer)
+export(assign_stat)
 export(change_grouped_stats_to_csv)
 export(citation_from_version)
 export(create_etag_header)
@@ -53,6 +56,7 @@ export(validate_input_grouped_stats)
 export(version_dataframe)
 export(wld_lineup_year)
 import(collapse, except = fdroplevels)
+import(data.table)
 import(data.table, except = fdroplevels)
 importFrom(glue,glue)
 importFrom(glue,glue_collapse)
diff --git a/R/fg_pip.R b/R/fg_pip.R
index 7e568ab5..e8c3edb2 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -63,14 +63,11 @@ fg_pip <- function(country,
                                 data_present_in_master = data_present_in_master)
 
   lt <-
-    pipdata::load_list_refy(input_list = full_list,
-                            path = fs::path(data_dir,
-                                            "lineup_data"))
-
-  lt <- lapply(lt,
-               FUN = \(x) {
-                  x |>
-                    add_attributes_as_columns_vectorized()
+    load_list_refy(input_list = full_list,
+                   path       = fs::path(data_dir, "lineup_data"))
+
+  lt <- lapply(lt, \(x) {
+                   add_attributes_as_columns_vectorized(x)
                })
 
   # ZP Add: do fgt estimations using `res <- lapply(lt, process_dt, povline = povline)`
@@ -297,8 +294,9 @@ create_full_list <- function(country, year, refy_lkup, data_present_in_master) {
   # Split years by country
   full_list <- dtemp[,
                      .(year = list(year)),
-                     by = country_code][
-                       , .(country_code, year = year)
+                     by = country_code
+                     ][,
+                       .(country_code, year = year)
                      ]
 
   # Convert to desired structure
diff --git a/R/utils-pipdata.R b/R/utils-pipdata.R
new file mode 100644
index 00000000..16b996ce
--- /dev/null
+++ b/R/utils-pipdata.R
@@ -0,0 +1,49 @@
+#' load refy list
+#'
+#' @param input_list list. output from [create_full_list]
+#' @param path character: directory path
+#'
+#' @return character vector
+#' @keywords internal
+load_list_refy <- \(input_list, path){
+  input_list <- transform_input(input_list)
+
+  dl <- lapply(input_list, FUN = function(x) {
+    qs::qread(file = fs::path(path, paste0(x$country_code, "_",
+                                                x$year),
+                              ext = "qs"))
+    })
+
+  names(dl) <- vapply(input_list, \(x) {
+    paste0(x$country_code, x$year)
+    },
+    FUN.VALUE = character(1))
+  dl
+}
+
+
+#' transform input list
+#'
+#' @inheritParams load_list_refy
+#'
+#' @return formated list
+#' @keywords internal
+transform_input <- function(input_list){
+  country_codes <- input_list$country_code
+  years <- input_list$year
+  if (!is.list(years)) {
+    years <- lapply(country_codes, function(x) years)
+  }
+  else {
+    if (length(years) != length(country_codes)) {
+      stop("The length of the 'year' list must match the length of the 'country_code' vector.")
+    }
+  }
+  output_list <- lapply(seq_along(country_codes), function(i) {
+    lapply(years[[i]], function(y) {
+      list(country_code = country_codes[i], year = y)
+    })
+  })
+  output_list <- unlist(output_list, recursive = FALSE)
+  return(output_list)
+}
diff --git a/man/add_attributes_as_columns_multi.Rd b/man/add_attributes_as_columns_multi.Rd
new file mode 100644
index 00000000..7df2c2ed
--- /dev/null
+++ b/man/add_attributes_as_columns_multi.Rd
@@ -0,0 +1,34 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/add_attributes_as_columns.R
+\name{add_attributes_as_columns_multi}
+\alias{add_attributes_as_columns_multi}
+\title{Add attributes as columns for multi-segment reporting levels}
+\usage{
+add_attributes_as_columns_multi(dt)
+}
+\arguments{
+\item{dt}{A \code{data.table} with attributes:
+\itemize{
+\item \code{reporting_level_rows}: list with \code{reporting_level} (character) and
+\code{rows} (integer cumulative row ends).
+\item \code{country_code} (character).
+\item \code{reporting_year} (integer/numeric).
+\item \code{dist_stats} (list) optionally containing \code{mean} and/or \code{median}, each as
+a named list/vector keyed by reporting level, or a single scalar.
+}}
+}
+\value{
+The same \code{data.table}, modified by reference, with new columns:
+\code{reporting_level}, \code{country_code}, \code{reporting_year}, \code{file}, and
+optionally \code{mean}, \code{median}.
+}
+\description{
+Converts attributes on a survey \code{data.table} (e.g., \code{reporting_level_rows},
+\code{country_code}, \code{reporting_year}, and \code{dist_stats}) into columns, handling
+\strong{multiple alternating segments} (e.g., CHN rural/urban/rural/urban) or
+single-segment cases (e.g., ZAF).
+}
+\examples{
+# chn2000_cols <- add_attributes_as_columns_multi(chn2000)
+# zaf2000_cols <- add_attributes_as_columns_multi(zaf2000)
+}
diff --git a/man/add_attributes_as_columns_vectorized.Rd b/man/add_attributes_as_columns_vectorized.Rd
new file mode 100644
index 00000000..66c51a00
--- /dev/null
+++ b/man/add_attributes_as_columns_vectorized.Rd
@@ -0,0 +1,80 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/add_attributes_as_columns.R
+\name{add_attributes_as_columns_vectorized}
+\alias{add_attributes_as_columns_vectorized}
+\title{Add attributes as columns (vectorized, in-place)}
+\usage{
+add_attributes_as_columns_vectorized(dt)
+}
+\arguments{
+\item{dt}{A \code{data.table} carrying the attributes described above.}
+}
+\value{
+The same \code{data.table}, modified by reference, with added columns:
+\code{reporting_level}, \code{country_code}, \code{reporting_year}, \code{file}, and (if
+present) \code{mean}, \code{median}.
+}
+\description{
+Converts survey attributes on a \code{data.table}—including
+\code{reporting_level_rows}, \code{country_code}, \code{reporting_year}, and \code{dist_stats}—
+into columns using a \strong{loop-free, segment-replication} strategy. Designed
+for very large tables and objects loaded via \code{readRDS()}/\code{load()}:
+uses \code{setDT()} and \code{alloc.col()} to ensure in-place assignment.
+}
+\details{
+The function expects an attribute \code{reporting_level_rows}, a list with:
+\itemize{
+\item \code{reporting_level}: character vector of the level label for each segment
+(e.g., \code{c("rural","urban","rural", ...)}).
+\item \code{rows}: integer vector of \strong{cumulative} row-ends (e.g.,
+\code{c(100000, 200000, 300000, ...)}).
+}
+
+Segment lengths are computed as \code{diff(c(0L, rows))}, and \code{reporting_level}
+is replicated with \code{rep.int(lev, counts)}. Constants \code{country_code},
+\code{reporting_year}, and \code{file} (\code{paste0(country_code, "_", reporting_year)})
+are added to all rows. If \code{dist_stats$mean} / \code{dist_stats$median} are
+provided (as named vectors/lists keyed by level), they are mapped by level
+name and replicated per segment. If a level is missing from the names,
+\code{NA} values may result for that segment.
+
+This implementation avoids loops and \code{findInterval()} edge cases, and
+modifies \code{dt} by reference.
+}
+\note{
+For objects loaded from disk (e.g., via \code{readRDS()}), \code{alloc.col(dt)}
+ensures there is spare column capacity for by-reference assignment.
+}
+\section{Assumptions}{
+
+\itemize{
+\item \code{length(reporting_level_rows$reporting_level) == length(reporting_level_rows$rows)}.
+\item \code{rows} are cumulative and non-decreasing, and their segment lengths sum to \code{nrow(dt)}.
+\item If \code{dist_stats$mean} / \code{dist_stats$median} have multiple values, their names
+align with the level labels.
+}
+}
+
+\examples{
+\dontrun{
+library(data.table)
+dt <- data.table(weight = 1:6, welfare = runif(6))
+attr(dt, "reporting_level_rows") <- list(
+  reporting_level = c("rural","urban","rural"),
+  rows = c(2L, 4L, 6L)
+)
+attr(dt, "country_code")   <- "XXY"
+attr(dt, "reporting_year") <- 2000L
+attr(dt, "dist_stats") <- list(
+  mean   = list(rural = 2.5, urban = 5.0),
+  median = list(rural = 2.0, urban = 4.5)
+)
+
+add_attributes_as_columns_vectorized(dt)
+head(dt)
+}
+
+}
+\seealso{
+\code{\link[=add_attributes_as_columns_multi]{add_attributes_as_columns_multi()}}, \code{\link[=assign_stat]{assign_stat()}}
+}
diff --git a/man/assign_stat.Rd b/man/assign_stat.Rd
new file mode 100644
index 00000000..987b08f4
--- /dev/null
+++ b/man/assign_stat.Rd
@@ -0,0 +1,33 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/add_attributes_as_columns.R
+\name{assign_stat}
+\alias{assign_stat}
+\title{Assign a per-level statistic to a data.table column (by reference)}
+\usage{
+assign_stat(dt, lev, counts, stat, colname)
+}
+\arguments{
+\item{dt}{A \code{data.table}. Modified by reference.}
+
+\item{lev}{Character vector of reporting-level labels per segment
+(e.g., \code{c("rural","urban","rural", ...)}).}
+
+\item{counts}{Integer vector of segment lengths matching \code{lev}
+(e.g., \code{c(100000, 100000, 100000, ...)}).}
+
+\item{stat}{A numeric scalar, named vector, or named list with one value per
+level (names must match \code{lev} values).}
+
+\item{colname}{Name of the column to create/overwrite.}
+}
+\value{
+Invisibly returns \code{dt} (modified by reference).
+}
+\description{
+Replicates a statistic per reporting-level segment and assigns it to a new
+column in \code{dt}, \strong{in place}. \code{stat} can be a scalar (broadcast), a named
+vector, or a named list (one value per level).
+}
+\examples{
+# assign_stat(dt, lev, counts, list(rural = 2.6, urban = 5.5), "mean")
+}
diff --git a/man/create_full_list.Rd b/man/create_full_list.Rd
new file mode 100644
index 00000000..249ae8b6
--- /dev/null
+++ b/man/create_full_list.Rd
@@ -0,0 +1,23 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/fg_pip.R
+\name{create_full_list}
+\alias{create_full_list}
+\title{Create full list for fg data load, not including country-years in cache}
+\usage{
+create_full_list(country, year, refy_lkup, data_present_in_master)
+}
+\arguments{
+\item{country}{Country selected in \link{fg_pip} function}
+
+\item{year}{Year/s selected in \link{fg_pip} function}
+
+\item{refy_lkup}{reference year lkup table with full lineups and new method}
+
+\item{data_present_in_master}{cache data}
+}
+\value{
+list
+}
+\description{
+Create full list for fg data load, not including country-years in cache
+}
diff --git a/man/fg_pip.Rd b/man/fg_pip.Rd
index f83e836f..9012b01b 100644
--- a/man/fg_pip.Rd
+++ b/man/fg_pip.Rd
@@ -39,10 +39,4 @@ data.frame
 \description{
 Compute the main PIP poverty and inequality statistics for imputed years.
 }
-\keyword{#}
-\keyword{Add:}
-\keyword{ZP}
-\keyword{data}
 \keyword{internal}
-\keyword{load}
-\keyword{refy}
diff --git a/man/fg_remove_duplicates.Rd b/man/fg_remove_duplicates.Rd
index b9d9c87b..9b6b79dc 100644
--- a/man/fg_remove_duplicates.Rd
+++ b/man/fg_remove_duplicates.Rd
@@ -9,7 +9,8 @@ fg_remove_duplicates(
   cols = c("comparable_spell", "cpi", "display_cp", "gd_type", "path",
     "predicted_mean_ppp", "survey_acronym", "survey_comparability", "survey_coverage",
     "survey_id", "survey_mean_lcu", "survey_mean_ppp", "survey_median_lcu",
-    "survey_median_ppp", "survey_time", "survey_year", "surveyid_year")
+    "survey_median_ppp", "survey_time", "survey_year", "surveyid_year"),
+  use_new_lineup_version = FALSE
 )
 }
 \arguments{
diff --git a/man/load_list_refy.Rd b/man/load_list_refy.Rd
new file mode 100644
index 00000000..59a37d7c
--- /dev/null
+++ b/man/load_list_refy.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils-pipdata.R
+\name{load_list_refy}
+\alias{load_list_refy}
+\title{load refy list}
+\usage{
+load_list_refy(input_list, path)
+}
+\arguments{
+\item{input_list}{list. output from \link{create_full_list}}
+
+\item{path}{character: directory path}
+}
+\value{
+character vector
+}
+\description{
+load refy list
+}
+\keyword{internal}
diff --git a/man/transform_input.Rd b/man/transform_input.Rd
new file mode 100644
index 00000000..e460cf0f
--- /dev/null
+++ b/man/transform_input.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils-pipdata.R
+\name{transform_input}
+\alias{transform_input}
+\title{transform input list}
+\usage{
+transform_input(input_list)
+}
+\arguments{
+\item{input_list}{list. output from \link{create_full_list}}
+}
+\value{
+formated list
+}
+\description{
+transform input list
+}
+\keyword{internal}
diff --git a/man/ui_cp_charts.Rd b/man/ui_cp_charts.Rd
index 03ea2a00..010a71b1 100644
--- a/man/ui_cp_charts.Rd
+++ b/man/ui_cp_charts.Rd
@@ -7,7 +7,7 @@
 ui_cp_charts(
   country = "AGO",
   povline = 1.9,
-  pop_units = 1000000,
+  pop_units = 1e+06,
   lkup,
   lkup_hash = lkup$cache_data_id$hash_ui_cp
 )
diff --git a/man/ui_cp_poverty_charts.Rd b/man/ui_cp_poverty_charts.Rd
index dcfbb943..e2356d1e 100644
--- a/man/ui_cp_poverty_charts.Rd
+++ b/man/ui_cp_poverty_charts.Rd
@@ -4,7 +4,7 @@
 \alias{ui_cp_poverty_charts}
 \title{CP Poverty Charts}
 \usage{
-ui_cp_poverty_charts(country, povline, pop_units = 1000000, lkup)
+ui_cp_poverty_charts(country, povline, pop_units = 1e+06, lkup)
 }
 \arguments{
 \item{country}{character: Country ISO 3 codes}
diff --git a/man/ui_hp_countries.Rd b/man/ui_hp_countries.Rd
index 760f458e..2ffcf443 100644
--- a/man/ui_hp_countries.Rd
+++ b/man/ui_hp_countries.Rd
@@ -7,7 +7,7 @@
 ui_hp_countries(
   country = c("IDN", "CIV"),
   povline = 1.9,
-  pop_units = 1000000,
+  pop_units = 1e+06,
   lkup
 )
 }
diff --git a/man/ui_pc_charts.Rd b/man/ui_pc_charts.Rd
index 438dfca9..49715b48 100644
--- a/man/ui_pc_charts.Rd
+++ b/man/ui_pc_charts.Rd
@@ -12,7 +12,7 @@ ui_pc_charts(
   group_by = "none",
   welfare_type = c("all", "consumption", "income"),
   reporting_level = c("all", "national", "rural", "urban"),
-  pop_units = 1000000,
+  pop_units = 1e+06,
   lkup
 )
 }
diff --git a/man/ui_pc_regional.Rd b/man/ui_pc_regional.Rd
index 3cbde341..083e418b 100644
--- a/man/ui_pc_regional.Rd
+++ b/man/ui_pc_regional.Rd
@@ -8,7 +8,7 @@ ui_pc_regional(
   country = "ALL",
   year = "ALL",
   povline = 1.9,
-  pop_units = 1000000,
+  pop_units = 1e+06,
   lkup
 )
 }

From 3521a1415e8fed38e8ca3da7f935442211d2d2df Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Fri, 22 Aug 2025 12:28:59 -0400
Subject: [PATCH 051/203] send pip_new_lineups to new file

---
 R/pip.R                | 302 -----------------------------------------
 R/pip_new_lineups.R    | 302 +++++++++++++++++++++++++++++++++++++++++
 man/pip_new_lineups.Rd |   2 +-
 3 files changed, 303 insertions(+), 303 deletions(-)
 create mode 100644 R/pip_new_lineups.R

diff --git a/R/pip.R b/R/pip.R
index 0f24ebb7..0689ef26 100644
--- a/R/pip.R
+++ b/R/pip.R
@@ -104,305 +104,3 @@ pip <- function(country         = "ALL",
 
 
-
-#' Compute PIP statistics
-#'
-#' Compute the main PIP poverty and inequality statistics.
-#'
-#' @param country character: Country ISO 3 codes
-#' @param year integer: Reporting year
-#' @param povline numeric: Poverty line
-#' @param popshare numeric: Proportion of the population living below the
-#'   poverty line
-#' @param fill_gaps logical: If set to TRUE, will interpolate / extrapolate
-#'   values for missing years
-#' @param group_by character: Will return aggregated values for predefined
-#'   sub-groups
-#' @param welfare_type character: Welfare type
-#' @param reporting_level character: Geographical reporting level
-#' @param ppp numeric: Custom Purchase Power Parity value
-#' @param lkup list: A list of lkup tables
-#' @param censor logical: Triggers censoring of country/year statistics
-#' @param lkup_hash character: hash of pip
-#' @param additional_ind logical: If TRUE add new set of indicators. Default if
-#'   FALSE
-#'
-#' @return data.table
-#' @examples
-#' \dontrun{
-#' # Create lkups
-#' lkups <- create_lkups("<data-folder>")
-#'
-#' # A single country and year
-#' pip_new_lineups(country = "AGO",
-#'     year = 2000,
-#'     povline = 1.9,
-#'     lkup = lkups)
-#'
-#' # All years for a single country
-#' pip_new_lineups(country = "AGO",
-#'     year = "all",
-#'     povline = 1.9,
-#'     lkup = lkups)
-#'
-#' # Fill gaps
-#' pip_new_lineups(country = "AGO",
-#'     year = "all",
-#'     povline = 1.9,
-#'     fill_gaps = TRUE,
-#'     lkup = lkups)
-#'
-#' # Group by regions
-#' pip_new_lineups(country = "all",
-#'     year = "all",
-#'     povline = 1.9,
-#'     group_by = "wb",
-#'     lkup = lkups)
-#' }
-#' @export
-pip_new_lineups <- function(country         = "ALL",
-                            year            = "ALL",
-                            povline         = 1.9,
-                            popshare        = NULL,
-                            fill_gaps       = FALSE,
-                            group_by        = c("none", "wb"),
-                            welfare_type    = c("all", "consumption", "income"),
-                            reporting_level = c("all", "national", "rural", "urban"),
-                            ppp             = NULL,
-                            lkup,
-                            censor          = TRUE,
-                            lkup_hash       = lkup$cache_data_id$hash_pip,
-                            additional_ind  = FALSE) {
-
-
-  # set up -------------
-  welfare_type    <- match.arg(welfare_type)
-  reporting_level <- match.arg(reporting_level)
-  group_by        <- match.arg(group_by)
-  povline         <- round(povline, digits = 3)
-
-
-
-  # TEMPORARY UNTIL SELECTION MECHANISM IS BEING IMPROVED
-  country <- toupper(country)
-  if (is.character(year)) {
-    year <- toupper(year)
-  }
-
-  # If svy_lkup is not part of lkup throw an error.
-  if (!all(c('svy_lkup') %in% names(lkup)))
-    stop("You are probably passing more than one dataset as lkup argument.
-  Try passing a single one by subsetting it lkup <- lkups$versions_paths$dataset_name_PROD")
-
-
-  # **** TO BE REMOVED **** REMOVAL STARTS HERE
-  # Once `pip-grp` has been integrated in ingestion pipeline
-  # Forces fill_gaps to TRUE when using group_by option
-  if (group_by != "none") {
-    fill_gaps <- TRUE
-    message("Info: argument group_by in pip() is deprecated; please use pip_grp() instead.")
-  }
-  # **** TO BE REMOVED **** REMOVAL ENDS HERE
-
-  # Countries vector ------------
-  lcv <- # List with countries vectors
-    create_countries_vctr(
-      country         =  country,
-      year            =  year,
-      valid_years     =  lkup$valid_years,
-      aux_files       =  lkup$aux_files
-    )
-  # lcv$est_ctrs has all the country_code that we are interested in
-
-  cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb")
-  if (!file.exists(cache_file_path)) {
-    # Create an empty duckdb file
-    create_duckdb_file(cache_file_path)
-  }
-    # mains estimates ---------------
-    if (fill_gaps) {
-      ## lineup years-----------------
-      out <- fg_pip(
-        country            = lcv$est_ctrs,
-        year               = year,
-        povline            = povline,
-        popshare           = popshare,
-        welfare_type       = welfare_type,
-        reporting_level    = reporting_level,
-        ppp                = ppp,
-        lkup               = lkup
-        )
-    } else {
-      ## survey years ------------------
-      out <- rg_pip(
-        country         = lcv$est_ctrs,
-        year            = year,
-        povline         = povline,
-        popshare        = popshare,
-        welfare_type    = welfare_type,
-        reporting_level = reporting_level,
-        ppp             = ppp,
-        lkup            = lkup
-      )
-    }
-
-  # Cache new data
-  #---------------------------------------------
-    cached_data <- qDT(out$data_in_cache)
-    main_data   <- qDT(out$main_data)
-
-    if (nrow(main_data) > 0) {
-
-      if (is.null(out$data_in_cache)) {
-
-        out <- main_data
-      } else {
-
-        if (fill_gaps) {
-
-          cached_data <- fg_remove_duplicates(cached_data,
-                                              use_new_lineup_version = lkup$use_new_lineup_version)
-        }
-
-        out <- main_data |>
-          rowbind(cached_data)
-      }
-
-        update_master_file(main_data, cache_file_path, fill_gaps)
-
-    } else {
-      out <- cached_data
-    }
-    if (!data.table::is.data.table(out)) {
-      setDT(out)
-    }
-    # Early return for empty table---------------
-    if (nrow(out) == 0) return(pipapi::empty_response)
-
-    # aggregate distributions ------------------
-    if (reporting_level %in% c("national", "all")) {
-      out <- add_agg_stats(
-        df          = out,
-        return_cols = lkup$return_cols$ag_average_poverty_stats)
-      if (reporting_level == "national") {
-        out <- out[reporting_level == "national"]
-      }
-    }
-
-    # Add out of pipeline variable
-    #---------------------------------------------
-    add_vars_out_of_pipeline(out,
-                             fill_gaps = fill_gaps,
-                             lkup      = lkup)
-
-    # **** TO BE REMOVED **** REMOVAL STARTS HERE
-    # Once `pip-grp` has been integrated in ingestion pipeline
-    # Handles grouped aggregations
-    if (group_by != "none") {
-      # Handle potential (insignificant) difference in poverty_line values that
-      # may mess-up the grouping
-      out$poverty_line <- povline
-
-      out <- pip_aggregate_by(
-        df          = out,
-        group_lkup  = lkup[["pop_region"]],
-        return_cols = lkup$return_cols$pip_grp)
-
-      # Censor regional values
-      if (censor) {
-        out <- censor_rows(out,
-                           lkup[["censored"]],
-                           type = "regions")
-      }
-
-      out <- out[, c("region_name",
-                     "region_code",
-                     "reporting_year",
-                     "reporting_pop",
-                     "poverty_line",
-                     "headcount",
-                     "poverty_gap",
-                     "poverty_severity",
-                     "watts",
-                     "mean",
-                     "pop_in_poverty")]
-
-      return(out)
-    }
-    # **** TO BE REMOVED **** REMOVAL ENDS HERE
-
-
-    # pre-computed distributional stats ---------------
-    crr_names  <- names(out)    # current variables
-    names2keep <- lkup$return_cols$pip$cols # all variables
-
-    out <- add_dist_stats(
-      df         = out,
-      lkup       = lkup,
-      fill_gaps  = fill_gaps)
-
-    # Add aggregate medians ----------------
-    out <- add_agg_medians(
-      df        = out,
-      fill_gaps = fill_gaps,
-      data_dir  = lkup$data_root
-    )
-
-    # format ----------------
-
-
-    if (fill_gaps) {
-      # ZP temp NA lineups:
-      #---------------------
-    # ## Inequality indicators to NA for lineup years ----
-      dist_vars  <- names2keep[!(names2keep %in% crr_names)]
-      out[,
-          (dist_vars) := NA_real_]
-
-      ## estimate_var -----
-      out <- estimate_type_ctr_lnp(out, lkup)
-
-    } else {
-      out[, estimate_type := NA_character_]
-    }
-
-    ## Handle survey coverage ------------
-    if (reporting_level != "all") {
-      keep <- out$reporting_level == reporting_level
-      out <- out[keep, ]
-    }
-
-    # Censor country values
-    if (censor) {
-      out <- censor_rows(out, lkup[["censored"]], type = "countries")
-    }
-
-
-    # Select columns
-    if (additional_ind) {
-      get_additional_indicators(out)
-      added_names <- attr(out, "new_indicators_names")
-      names2keep  <- c(names2keep, added_names)
-
-    }
-    # Keep relevant variables
-    out  <- out[, .SD, .SDcols = names2keep]
-
-
-    # make sure we always report the same precision in all numeric variables
-    doub_vars <-
-      names(out)[unlist(lapply(out, is.double))] |>
-      data.table::copy()
-
-    out[, (doub_vars) := lapply(.SD, round, digits = 12),
-       .SDcols = doub_vars]
-
-    # Order rows by country code and reporting year
-    data.table::setorder(out, country_code, reporting_year, reporting_level, welfare_type)
-  #}
-
-  # Make sure no duplicate remains
-  out <- out |> collapse::funique()
-  # return -------------
-  return(out)
-}
diff --git a/R/pip_new_lineups.R b/R/pip_new_lineups.R
new file mode 100644
index 00000000..6b0c6139
--- /dev/null
+++ b/R/pip_new_lineups.R
@@ -0,0 +1,302 @@
+
+#' Compute PIP statistics
+#'
+#' Compute the main PIP poverty and inequality statistics.
+#'
+#' @param country character: Country ISO 3 codes
+#' @param year integer: Reporting year
+#' @param povline numeric: Poverty line
+#' @param popshare numeric: Proportion of the population living below the
+#'   poverty line
+#' @param fill_gaps logical: If set to TRUE, will interpolate / extrapolate
+#'   values for missing years
+#' @param group_by character: Will return aggregated values for predefined
+#'   sub-groups
+#' @param welfare_type character: Welfare type
+#' @param reporting_level character: Geographical reporting level
+#' @param ppp numeric: Custom Purchase Power Parity value
+#' @param lkup list: A list of lkup tables
+#' @param censor logical: Triggers censoring of country/year statistics
+#' @param lkup_hash character: hash of pip
+#' @param additional_ind logical: If TRUE add new set of indicators. Default if
+#'   FALSE
+#'
+#' @return data.table
+#' @examples
+#' \dontrun{
+#' # Create lkups
+#' lkups <- create_lkups("<data-folder>")
+#'
+#' # A single country and year
+#' pip_new_lineups(country = "AGO",
+#'     year = 2000,
+#'     povline = 1.9,
+#'     lkup = lkups)
+#'
+#' # All years for a single country
+#' pip_new_lineups(country = "AGO",
+#'     year = "all",
+#'     povline = 1.9,
+#'     lkup = lkups)
+#'
+#' # Fill gaps
+#' pip_new_lineups(country = "AGO",
+#'     year = "all",
+#'     povline = 1.9,
+#'     fill_gaps = TRUE,
+#'     lkup = lkups)
+#'
+#' # Group by regions
+#' pip_new_lineups(country = "all",
+#'     year = "all",
+#'     povline = 1.9,
+#'     group_by = "wb",
+#'     lkup = lkups)
+#' }
+#' @export
+pip_new_lineups <- function(country         = "ALL",
+                            year            = "ALL",
+                            povline         = 1.9,
+                            popshare        = NULL,
+                            fill_gaps       = FALSE,
+                            group_by        = c("none", "wb"),
+                            welfare_type    = c("all", "consumption", "income"),
+                            reporting_level = c("all", "national", "rural", "urban"),
+                            ppp             = NULL,
+                            lkup,
+                            censor          = TRUE,
+                            lkup_hash       = lkup$cache_data_id$hash_pip,
+                            additional_ind  = FALSE) {
+
+
+  # set up -------------
+  welfare_type    <- match.arg(welfare_type)
+  reporting_level <- match.arg(reporting_level)
+  group_by        <- match.arg(group_by)
+  povline         <- round(povline, digits = 3)
+
+
+
+  # TEMPORARY UNTIL SELECTION MECHANISM IS BEING IMPROVED
+  country <- toupper(country)
+  if (is.character(year)) {
+    year <- toupper(year)
+  }
+
+  # If svy_lkup is not part of lkup throw an error.
+  if (!all(c('svy_lkup') %in% names(lkup)))
+    stop("You are probably passing more than one dataset as lkup argument.
+  Try passing a single one by subsetting it lkup <- lkups$versions_paths$dataset_name_PROD")
+
+
+  # **** TO BE REMOVED **** REMOVAL STARTS HERE
+  # Once `pip-grp` has been integrated in ingestion pipeline
+  # Forces fill_gaps to TRUE when using group_by option
+  if (group_by != "none") {
+    fill_gaps <- TRUE
+    message("Info: argument group_by in pip() is deprecated; please use pip_grp() instead.")
+  }
+  # **** TO BE REMOVED **** REMOVAL ENDS HERE
+
+  # Countries vector ------------
+  lcv <- # List with countries vectors
+    create_countries_vctr(
+      country         =  country,
+      year            =  year,
+      valid_years     =  lkup$valid_years,
+      aux_files       =  lkup$aux_files
+    )
+  # lcv$est_ctrs has all the country_code that we are interested in
+
+  cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb")
+  if (!file.exists(cache_file_path)) {
+    # Create an empty duckdb file
+    create_duckdb_file(cache_file_path)
+  }
+  # mains estimates ---------------
+  if (fill_gaps) {
+    ## lineup years-----------------
+    out <- fg_pip(
+      country            = lcv$est_ctrs,
+      year               = year,
+      povline            = povline,
+      popshare           = popshare,
+      welfare_type       = welfare_type,
+      reporting_level    = reporting_level,
+      ppp                = ppp,
+      lkup               = lkup
+    )
+  } else {
+    ## survey years ------------------
+    out <- rg_pip(
+      country         = lcv$est_ctrs,
+      year            = year,
+      povline         = povline,
+      popshare        = popshare,
+      welfare_type    = welfare_type,
+      reporting_level = reporting_level,
+      ppp             = ppp,
+      lkup            = lkup
+    )
+  }
+
+  # Cache new data
+  #---------------------------------------------
+  cached_data <- qDT(out$data_in_cache)
+  main_data   <- qDT(out$main_data)
+
+  if (nrow(main_data) > 0) {
+
+    if (is.null(out$data_in_cache)) {
+
+      out <- main_data
+    } else {
+
+      if (fill_gaps) {
+
+        cached_data <- fg_remove_duplicates(cached_data,
+                                            use_new_lineup_version = lkup$use_new_lineup_version)
+      }
+
+      out <- main_data |>
+        rowbind(cached_data)
+    }
+
+    update_master_file(main_data, cache_file_path, fill_gaps)
+
+  } else {
+    out <- cached_data
+  }
+  if (!is.data.table(out)) {
+    setDT(out)
+  }
+  # Early return for empty table---------------
+  if (nrow(out) == 0) return(pipapi::empty_response)
+
+  # aggregate distributions ------------------
+  if (reporting_level %in% c("national", "all")) {
+    out <- add_agg_stats(
+      df          = out,
+      return_cols = lkup$return_cols$ag_average_poverty_stats)
+    if (reporting_level == "national") {
+      out <- out[reporting_level == "national"]
+    }
+  }
+
+  # Add out of pipeline variable
+  #---------------------------------------------
+  add_vars_out_of_pipeline(out,
+                           fill_gaps = fill_gaps,
+                           lkup      = lkup)
+
+  # **** TO BE REMOVED **** REMOVAL STARTS HERE
+  # Once `pip-grp` has been integrated in ingestion pipeline
+  # Handles grouped aggregations
+  if (group_by != "none") {
+    # Handle potential (insignificant) difference in poverty_line values that
+    # may mess-up the grouping
+    out$poverty_line <- povline
+
+    out <- pip_aggregate_by(
+      df          = out,
+      group_lkup  = lkup[["pop_region"]],
+      return_cols = lkup$return_cols$pip_grp)
+
+    # Censor regional values
+    if (censor) {
+      out <- censor_rows(out,
+                         lkup[["censored"]],
+                         type = "regions")
+    }
+
+    out <- out[, c("region_name",
+                   "region_code",
+                   "reporting_year",
+                   "reporting_pop",
+                   "poverty_line",
+                   "headcount",
+                   "poverty_gap",
+                   "poverty_severity",
+                   "watts",
+                   "mean",
+                   "pop_in_poverty")]
+
+    return(out)
+  }
+  # **** TO BE REMOVED **** REMOVAL ENDS HERE
+
+
+  # pre-computed distributional stats ---------------
+  crr_names  <- names(out)    # current variables
+  names2keep <- lkup$return_cols$pip$cols # all variables
+
+  out <- add_dist_stats(
+    df         = out,
+    lkup       = lkup,
+    fill_gaps  = fill_gaps)
+
+  # Add aggregate medians ----------------
+  out <- add_agg_medians(
+    df        = out,
+    fill_gaps = fill_gaps,
+    data_dir  = lkup$data_root
+  )
+
+  # format ----------------
+
+
+  if (fill_gaps) {
+    # ZP temp NA lineups:
+    #---------------------
+    # ## Inequality indicators to NA for lineup years ----
+    dist_vars  <- names2keep[!(names2keep %in% crr_names)]
+    out[,
+        (dist_vars) := NA_real_]
+
+    ## estimate_var -----
+    out <- estimate_type_ctr_lnp(out, lkup)
+
+  } else {
+    out[, estimate_type := NA_character_]
+  }
+
+  ## Handle survey coverage ------------
+  if (reporting_level != "all") {
+    keep <- out$reporting_level == reporting_level
+    out <- out[keep, ]
+  }
+
+  # Censor country values
+  if (censor) {
+    out <- censor_rows(out, lkup[["censored"]], type = "countries")
+  }
+
+
+  # Select columns
+  if (additional_ind) {
+    get_additional_indicators(out)
+    added_names <- attr(out, "new_indicators_names")
+    names2keep  <- c(names2keep, added_names)
+
+  }
+  # Keep relevant variables
+  out  <- out[, .SD, .SDcols = names2keep]
+
+
+  # make sure we always report the same precision in all numeric variables
+  doub_vars <-
+    names(out)[unlist(lapply(out, is.double))] |>
+    data.table::copy()
+
+  out[, (doub_vars) := lapply(.SD, round, digits = 12),
+      .SDcols = doub_vars]
+
+  # Order rows by country code and reporting year
+  data.table::setorder(out, country_code, reporting_year, reporting_level, welfare_type)
+  #}
+
+  # Make sure no duplicate remains
+  out <- out |> collapse::funique()
+  # return -------------
+  return(out)
+}
diff --git a/man/pip_new_lineups.Rd b/man/pip_new_lineups.Rd
index eded364f..db43427e 100644
--- a/man/pip_new_lineups.Rd
+++ b/man/pip_new_lineups.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/pip.R
+% Please edit documentation in R/pip_new_lineups.R
 \name{pip_new_lineups}
 \alias{pip_new_lineups}
 \title{Compute PIP statistics}

From d6105fecf59393fb07cdaa1e338206de4351e621 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Fri, 22 Aug 2025 13:38:30 -0400
Subject: [PATCH 052/203] organize functions

---
 R/compute_fgt_new.R       | 133 ++++++++++++++++++++++++++++++++++++++
 R/compute_fgt_old.R       | 112 ++++++++++++++++++++++++++++++++
 R/pip_new_lineups.R       |  13 +++-
 R/rg_pip.R                | 133 --------------------------------------
 R/rg_pip_old.R            | 112 --------------------------------
 man/compute_fgt_dt.Rd     |   2 +-
 man/compute_fgt_dt_old.Rd |   2 +-
 man/load_data_list.Rd     |   2 +-
 man/load_data_list_old.Rd |   2 +-
 9 files changed, 261 insertions(+), 250 deletions(-)
 create mode 100644 R/compute_fgt_new.R
 create mode 100644 R/compute_fgt_old.R

diff --git a/R/compute_fgt_new.R b/R/compute_fgt_new.R
new file mode 100644
index 00000000..f4651f21
--- /dev/null
+++ b/R/compute_fgt_new.R
@@ -0,0 +1,133 @@
+# Efficient FGT calculation for a data.table and vector of poverty lines
+#' Title
+#'
+#' @param dt data frame with `welfare` and `weight` columns
+#' @param welfare character: welfare variable name
+#' @param weight character: weight variable name
+#' @param povlines double: vector with poveryt lines
+#'
+#' @return data.table with estimates poverty estimates
+#' @keywords internal
+compute_fgt_dt <- function(dt, welfare, weight, povlines, mean_and_med = FALSE) {
+  w   <- dt[[welfare]]
+  wt  <- dt[[weight]]
+  n   <- length(w)
+  m   <- length(povlines)
+
+  # Pre-allocate result matrix
+  res <- matrix(NA_real_, nrow = m, ncol = 3)
+  colnames(res) <- c("FGT0", "FGT1", "FGT2")
+  watts_vec <- numeric(m)
+
+  # Precompute log(w) for efficiency
+  logw <- rep(NA_real_, n)
+  pos <- w > 0
+  logw[pos] <- log(w[pos])
+
+  for (i in seq_along(povlines)) {
+    pov <- povlines[i]
+    poor <- w < pov
+    rel_dist <- 1 - (w / pov)
+    rel_dist[!poor] <- 0
+    res[i, 1] <- fmean(poor, w = wt) # FGT0
+    res[i, 2] <- fmean(rel_dist, w = wt) # FGT1
+    res[i, 3] <- fmean(rel_dist^2, w = wt) # FGT2
+
+    # Optimized Watts index calculation
+    keep <- poor & pos
+    if (any(keep, na.rm = TRUE)) {
+      watts_vec[i] <- (fsum((log(pov) - logw[keep]) * wt[keep])) / fsum(wt)
+    } else {
+      watts_vec[i] <- 0
+    }
+  }
+
+  if (mean_and_med) {
+    mn  <- funique(dt$mean)
+    med <- funique(dt$median)
+    cy  <- funique(dt$coutnry_code)
+    ry  <- funique(dt$reporting_year)
+    out <- data.table(
+      povline          = povlines,
+      headcount        = res[, 1],
+      poverty_gap      = res[, 2],
+      poverty_severity = res[, 3],
+      watts            = watts_vec,
+      mean             = mn,
+      median           = med,
+      country_code     = cy,
+      reporting_year   = ry)
+  } else {
+    out <- data.table(
+      povline          = povlines,
+      headcount        = res[, 1],
+      poverty_gap      = res[, 2],
+      poverty_severity = res[, 3],
+      watts            = watts_vec)
+  }
+
+  out
+
+}
+
+
+process_dt <- function(dt, povline, mean_and_med = FALSE) {
+  dt[, compute_fgt_dt(.SD, "welfare", "weight", povline, mean_and_med),
+     by = .(file, reporting_level)]
+}
+
+#' load survey year files and store them in a list
+#'
+#' @param metadata data frame from `subset_lkup()`
+#'
+#' @return list with survey years data
+#' @keywords internal
+load_data_list <- \(metadata) {
+
+  # unique values
+  mdout      <- metadata[, lapply(.SD, list), by = path]
+  upaths     <- mdout$path
+  urep_level <- mdout$reporting_level
+  uppp       <- mdout$ppp
+  ucpi       <- mdout$cpi
+
+  seq_along(upaths) |>
+    lapply(\(f) {
+      path      <- upaths[f]
+      rep_level <- urep_level[f][[1]]
+      ppp       <- uppp[f][[1]]
+      cpi       <- ucpi[f][[1]]
+
+      # Build a data.table to merge cpi and ppp
+      fdt <- data.table(reporting_level = as.character(rep_level),
+                        ppp             = ppp,
+                        cpi             = cpi)
+
+      # load data and format
+      dt <-  fst::read_fst(path, as.data.table = TRUE)
+
+      if (length(rep_level) == 1) {
+        if (rep_level == "national") dt[, area := "national"]
+      }
+      setnames(dt, "area", "reporting_level")
+      dt[,
+         `:=`(
+           file = basename(path),
+           reporting_level = as.character(reporting_level)
+         )
+      ]
+
+      dt <- join(dt, fdt,
+                 on = "reporting_level",
+                 validate = "m:1",
+                 how = "left",
+                 verbose = 0)
+
+      dt[, welfare := welfare/(cpi * ppp)
+      ][,
+        c("cpi", "ppp") := NULL]
+
+    })
+
+}
+
diff --git a/R/compute_fgt_old.R b/R/compute_fgt_old.R
new file mode 100644
index 00000000..cf999b08
--- /dev/null
+++ b/R/compute_fgt_old.R
@@ -0,0 +1,112 @@
+# OLD: Efficient FGT calculation for a data.table and vector of poverty lines
+#' Title
+#'
+#' @param dt data frame with `welfare` and `weight` columns
+#' @param welfare character: welfare variable name
+#' @param weight character: weight variable name
+#' @param povlines double: vector with poveryt lines
+#'
+#' @return data.table with estimates poverty estimates
+#' @keywords internal
+compute_fgt_dt_old <- function(dt, welfare, weight, povlines) {
+  w <- dt[[welfare]]
+  wt <- dt[[weight]]
+  n <- length(w)
+  m <- length(povlines)
+
+  # Pre-allocate result matrix
+  res <- matrix(NA_real_, nrow = m, ncol = 3)
+  colnames(res) <- c("FGT0", "FGT1", "FGT2")
+  watts_vec <- numeric(m)
+
+  # Precompute log(w) for efficiency
+  logw <- rep(NA_real_, n)
+  pos <- w > 0
+  logw[pos] <- log(w[pos])
+
+  for (i in seq_along(povlines)) {
+    pov <- povlines[i]
+    poor <- w < pov
+    rel_dist <- 1 - (w / pov)
+    rel_dist[!poor] <- 0
+    res[i, 1] <- fmean(poor, w = wt) # FGT0
+    res[i, 2] <- fmean(rel_dist, w = wt) # FGT1
+    res[i, 3] <- fmean(rel_dist^2, w = wt) # FGT2
+
+    # Optimized Watts index calculation
+    keep <- poor & pos
+    if (any(keep, na.rm = TRUE)) {
+      watts_vec[i] <- (fsum((log(pov) - logw[keep]) * wt[keep])) / fsum(wt)
+    } else {
+      watts_vec[i] <- 0
+    }
+  }
+  data.table(
+    povline = povlines,
+    headcount = res[, 1],
+    poverty_gap = res[, 2],
+    poverty_severity = res[, 3],
+    watts = watts_vec
+  )
+}
+
+
+process_dt_old <- function(dt, povline) {
+  dt[, compute_fgt_dt_old(.SD, "welfare", "weight", povline),
+     by = .(file, reporting_level)]
+}
+
+#' OLD: load survey year files and store them in a list
+#'
+#' @param metadata data frame from `subset_lkup()`
+#'
+#' @return list with survey years data
+#' @keywords internal
+load_data_list_old <- \(metadata) {
+
+  # unique values
+  mdout      <- metadata[, lapply(.SD, list), by = path]
+  upaths     <- mdout$path
+  urep_level <- mdout$reporting_level
+  uppp       <- mdout$ppp
+  ucpi       <- mdout$cpi
+
+  seq_along(upaths) |>
+    lapply(\(f) {
+      path      <- upaths[f]
+      rep_level <- urep_level[f][[1]]
+      ppp       <- uppp[f][[1]]
+      cpi       <- ucpi[f][[1]]
+
+      # Build a data.table to merge cpi and ppp
+      fdt <- data.table(reporting_level = as.character(rep_level),
+                        ppp             = ppp,
+                        cpi             = cpi)
+
+      # load data and format
+      dt <-  fst::read_fst(path, as.data.table = TRUE)
+
+      if (length(rep_level) == 1) {
+        if (rep_level == "national") dt[, area := "national"]
+      }
+      setnames(dt, "area", "reporting_level")
+      dt[,
+         `:=`(
+           file = basename(path),
+           reporting_level = as.character(reporting_level)
+         )
+      ]
+
+      dt <- join(dt, fdt,
+                 on = "reporting_level",
+                 validate = "m:1",
+                 how = "left",
+                 verbose = 0)
+
+      dt[, welfare := welfare/(cpi * ppp)
+      ][,
+        c("cpi", "ppp") := NULL]
+
+    })
+
+}
diff --git a/R/pip_new_lineups.R b/R/pip_new_lineups.R
index 6b0c6139..aea166e5 100644
--- a/R/pip_new_lineups.R
+++ b/R/pip_new_lineups.R
@@ -142,7 +142,18 @@ pip_new_lineups <- function(country         = "ALL",
 
   # Cache new data
   #---------------------------------------------
-  cached_data <- qDT(out$data_in_cache)
+  cached_data <- if (is.null(out$data_in_cache)) {
+    NULL
+  } else if (is.data.frame(out$data_in_cache)) {
+    if (fnrow(out$data_in_cache) == 0) {
+      NULL
+    } else {
+      qDT(out$data_in_cache)
+    }
+  } else {
+    cli::cli_abort("{.code out$data_in_cache} must be NULL or data.frame not {.field {class(out$data_in_cache)}}")
+  }
+
   main_data   <- qDT(out$main_data)
 
   if (nrow(main_data) > 0) {
diff --git a/R/rg_pip.R b/R/rg_pip.R
index 8b555753..769878ae 100644
--- a/R/rg_pip.R
+++ b/R/rg_pip.R
@@ -85,136 +85,3 @@ rg_pip <- function(country,
 
 
-# Efficient FGT calculation for a data.table and vector of poverty lines
-#' Title
-#'
-#' @param dt data frame with `welfare` and `weight` columns
-#' @param welfare character: welfare variable name
-#' @param weight character: weight variable name
-#' @param povlines double: vector with poveryt lines
-#'
-#' @return data.table with estimates poverty estimates
-#' @keywords internal
-compute_fgt_dt <- function(dt, welfare, weight, povlines, mean_and_med = FALSE) {
-  w   <- dt[[welfare]]
-  wt  <- dt[[weight]]
-  n   <- length(w)
-  m   <- length(povlines)
-
-  # Pre-allocate result matrix
-  res <- matrix(NA_real_, nrow = m, ncol = 3)
-  colnames(res) <- c("FGT0", "FGT1", "FGT2")
-  watts_vec <- numeric(m)
-
-  # Precompute log(w) for efficiency
-  logw <- rep(NA_real_, n)
-  pos <- w > 0
-  logw[pos] <- log(w[pos])
-
-  for (i in seq_along(povlines)) {
-    pov <- povlines[i]
-    poor <- w < pov
-    rel_dist <- 1 - (w / pov)
-    rel_dist[!poor] <- 0
-    res[i, 1] <- fmean(poor, w = wt) # FGT0
-    res[i, 2] <- fmean(rel_dist, w = wt) # FGT1
-    res[i, 3] <- fmean(rel_dist^2, w = wt) # FGT2
-
-    # Optimized Watts index calculation
-    keep <- poor & pos
-    if (any(keep, na.rm = TRUE)) {
-      watts_vec[i] <- (fsum((log(pov) - logw[keep]) * wt[keep])) / fsum(wt)
-    } else {
-      watts_vec[i] <- 0
-    }
-  }
-
-  if (mean_and_med) {
-    mn  <- funique(dt$mean)
-    med <- funique(dt$median)
-    cy  <- funique(dt$coutnry_code)
-    ry  <- funique(dt$reporting_year)
-    out <- data.table(
-      povline          = povlines,
-      headcount        = res[, 1],
-      poverty_gap      = res[, 2],
-      poverty_severity = res[, 3],
-      watts            = watts_vec,
-      mean             = mn,
-      median           = med,
-      country_code     = cy,
-      reporting_year   = ry)
-  } else {
-    out <- data.table(
-      povline          = povlines,
-      headcount        = res[, 1],
-      poverty_gap      = res[, 2],
-      poverty_severity = res[, 3],
-      watts            = watts_vec)
-  }
-
-  out
-
-}
-
-
-process_dt <- function(dt, povline, mean_and_med = FALSE) {
-  dt[, compute_fgt_dt(.SD, "welfare", "weight", povline, mean_and_med),
-     by = .(file, reporting_level)]
-}
-
-#' load survey year files and store them in a list
-#'
-#' @param metadata data frame from `subset_lkup()`
-#'
-#' @return list with survey years data
-#' @keywords internal
-load_data_list <- \(metadata) {
-
-  # unique values
-  mdout      <- metadata[, lapply(.SD, list), by = path]
-  upaths     <- mdout$path
-  urep_level <- mdout$reporting_level
-  uppp       <- mdout$ppp
-  ucpi       <- mdout$cpi
-
-  seq_along(upaths) |>
-    lapply(\(f) {
-      path      <- upaths[f]
-      rep_level <- urep_level[f][[1]]
-      ppp       <- uppp[f][[1]]
-      cpi       <- ucpi[f][[1]]
-
-      # Build a data.table to merge cpi and ppp
-      fdt <- data.table(reporting_level = as.character(rep_level),
-                        ppp             = ppp,
-                        cpi             = cpi)
-
-      # load data and format
-      dt <-  fst::read_fst(path, as.data.table = TRUE)
-
-      if (length(rep_level) == 1) {
-        if (rep_level == "national") dt[, area := "national"]
-      }
-      setnames(dt, "area", "reporting_level")
-      dt[,
-         `:=`(
-           file = basename(path),
-           reporting_level = as.character(reporting_level)
-           )
-           ]
-
-      dt <- join(dt, fdt,
-                 on = "reporting_level",
-                 validate = "m:1",
-                 how = "left",
-                 verbose = 0)
-
-      dt[, welfare := welfare/(cpi * ppp)
-         ][,
-           c("cpi", "ppp") := NULL]
-
-    })
-
-}
-
diff --git a/R/rg_pip_old.R b/R/rg_pip_old.R
index e5f71be3..da17b015 100644
--- a/R/rg_pip_old.R
+++ b/R/rg_pip_old.R
@@ -85,115 +85,3 @@ rg_pip_old <- function(country,
 
 
-# OLD: Efficient FGT calculation for a data.table and vector of poverty lines
-#' Title
-#'
-#' @param dt data frame with `welfare` and `weight` columns
-#' @param welfare character: welfare variable name
-#' @param weight character: weight variable name
-#' @param povlines double: vector with poveryt lines
-#'
-#' @return data.table with estimates poverty estimates
-#' @keywords internal
-compute_fgt_dt_old <- function(dt, welfare, weight, povlines) {
-  w <- dt[[welfare]]
-  wt <- dt[[weight]]
-  n <- length(w)
-  m <- length(povlines)
-
-  # Pre-allocate result matrix
-  res <- matrix(NA_real_, nrow = m, ncol = 3)
-  colnames(res) <- c("FGT0", "FGT1", "FGT2")
-  watts_vec <- numeric(m)
-
-  # Precompute log(w) for efficiency
-  logw <- rep(NA_real_, n)
-  pos <- w > 0
-  logw[pos] <- log(w[pos])
-
-  for (i in seq_along(povlines)) {
-    pov <- povlines[i]
-    poor <- w < pov
-    rel_dist <- 1 - (w / pov)
-    rel_dist[!poor] <- 0
-    res[i, 1] <- fmean(poor, w = wt) # FGT0
-    res[i, 2] <- fmean(rel_dist, w = wt) # FGT1
-    res[i, 3] <- fmean(rel_dist^2, w = wt) # FGT2
-
-    # Optimized Watts index calculation
-    keep <- poor & pos
-    if (any(keep, na.rm = TRUE)) {
-      watts_vec[i] <- (fsum((log(pov) - logw[keep]) * wt[keep])) / fsum(wt)
-    } else {
-      watts_vec[i] <- 0
-    }
-  }
-  data.table(
-    povline = povlines,
-    headcount = res[, 1],
-    poverty_gap = res[, 2],
-    poverty_severity = res[, 3],
-    watts = watts_vec
-  )
-}
-
-
-process_dt_old <- function(dt, povline) {
-  dt[, compute_fgt_dt_old(.SD, "welfare", "weight", povline),
-     by = .(file, reporting_level)]
-}
-
-#' OLD: load survey year files and store them in a list
-#'
-#' @param metadata data frame from `subset_lkup()`
-#'
-#' @return list with survey years data
-#' @keywords internal
-load_data_list_old <- \(metadata) {
-
-  # unique values
-  mdout      <- metadata[, lapply(.SD, list), by = path]
-  upaths     <- mdout$path
-  urep_level <- mdout$reporting_level
-  uppp       <- mdout$ppp
-  ucpi       <- mdout$cpi
-
-  seq_along(upaths) |>
-    lapply(\(f) {
-      path      <- upaths[f]
-      rep_level <- urep_level[f][[1]]
-      ppp       <- uppp[f][[1]]
-      cpi       <- ucpi[f][[1]]
-
-      # Build a data.table to merge cpi and ppp
-      fdt <- data.table(reporting_level = as.character(rep_level),
-                        ppp             = ppp,
-                        cpi             = cpi)
-
-      # load data and format
-      dt <-  fst::read_fst(path, as.data.table = TRUE)
-
-      if (length(rep_level) == 1) {
-        if (rep_level == "national") dt[, area := "national"]
-      }
-      setnames(dt, "area", "reporting_level")
-      dt[,
-         `:=`(
-           file = basename(path),
-           reporting_level = as.character(reporting_level)
-         )
-      ]
-
-      dt <- join(dt, fdt,
-                 on = "reporting_level",
-                 validate = "m:1",
-                 how = "left",
-                 verbose = 0)
-
-      dt[, welfare := welfare/(cpi * ppp)
-      ][,
-        c("cpi", "ppp") := NULL]
-
-    })
-
-}
diff --git a/man/compute_fgt_dt.Rd b/man/compute_fgt_dt.Rd
index a57b0748..ac27c0f1 100644
--- a/man/compute_fgt_dt.Rd
+++ b/man/compute_fgt_dt.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/rg_pip.R
+% Please edit documentation in R/compute_fgt_new.R
 \name{compute_fgt_dt}
 \alias{compute_fgt_dt}
 \title{Title}
diff --git a/man/compute_fgt_dt_old.Rd b/man/compute_fgt_dt_old.Rd
index 82c9babd..58f12e5e 100644
--- a/man/compute_fgt_dt_old.Rd
+++ b/man/compute_fgt_dt_old.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/rg_pip_old.R
+% Please edit documentation in R/compute_fgt_old.R
 \name{compute_fgt_dt_old}
 \alias{compute_fgt_dt_old}
 \title{Title}
diff --git a/man/load_data_list.Rd b/man/load_data_list.Rd
index 26a142d2..8098daa9 100644
--- a/man/load_data_list.Rd
+++ b/man/load_data_list.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/rg_pip.R
+% Please edit documentation in R/compute_fgt_new.R
 \name{load_data_list}
 \alias{load_data_list}
 \title{load survey year files and store them in a list}
diff --git a/man/load_data_list_old.Rd b/man/load_data_list_old.Rd
index b6ea172f..23dc3862 100644
--- a/man/load_data_list_old.Rd
+++ b/man/load_data_list_old.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/rg_pip_old.R
+% Please edit documentation in R/compute_fgt_old.R
 \name{load_data_list_old}
 \alias{load_data_list_old}
 \title{OLD: load survey year files and store them in a list}

From d4dea9f6130df6799fb02e54e8c8fce238c46686 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Fri, 22 Aug 2025 16:01:57 -0400
Subject: [PATCH 053/203] improve efficiency a little

---
 R/compute_fgt_new.R | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/R/compute_fgt_new.R b/R/compute_fgt_new.R
index f4651f21..f79cf6cf 100644
--- a/R/compute_fgt_new.R
+++ b/R/compute_fgt_new.R
@@ -19,10 +19,9 @@ compute_fgt_dt <- function(dt, welfare, weight, povlines, mean_and_med = FALSE)
   colnames(res) <- c("FGT0", "FGT1", "FGT2")
   watts_vec <- numeric(m)
 
-  # Precompute log(w) for efficiency
-  logw <- rep(NA_real_, n)
+  # Precompute log(w) for efficiency (vectorized)
+  logw <- fifelse(w > 0, log(w), NA_real_)
   pos <- w > 0
-  logw[pos] <- log(w[pos])
 
   for (i in seq_along(povlines)) {
     pov <- povlines[i]

From 9b688bb1cc8d4589b608fed1347206b18cdb519c Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Fri, 22 Aug 2025 16:30:56 -0400
Subject: [PATCH 054/203] more optimization

---
 R/compute_fgt_new.R | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/R/compute_fgt_new.R b/R/compute_fgt_new.R
index f79cf6cf..f0ce396d 100644
--- a/R/compute_fgt_new.R
+++ b/R/compute_fgt_new.R
@@ -20,8 +20,12 @@ compute_fgt_dt <- function(dt, welfare, weight, povlines, mean_and_med = FALSE)
   watts_vec <- numeric(m)
 
   # Precompute log(w) for efficiency (vectorized)
-  logw <- fifelse(w > 0, log(w), NA_real_)
-  pos <- w > 0
+
+  pos  <- w > 0
+  # logw <- log(w)
+  logw <- copyv(log(w), pos, NA_real_, invert = TRUE) |>
+    suppressWarnings()
+  # logw <- fifelse(w > 0, log(w), NA_real_)
 
   for (i in seq_along(povlines)) {
     pov <- povlines[i]

From b194914ef5ca750bbebec6683be56c47d9b1367d Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Fri, 22 Aug 2025 17:28:44 -0400
Subject: [PATCH 055/203] remove rep.int for efficiency

---
 R/add_attributes_as_columns.R               | 237 -------------------
 R/compute_fgt_new.R                         |  11 +-
 R/fg_pip.R                                  |   5 +-
 R/utils-pipdata.R                           | 245 ++++++++++++++++++++
 man/add_attributes_as_columns_multi.Rd      |   2 +-
 man/add_attributes_as_columns_vectorized.Rd |   2 +-
 man/assign_stat.Rd                          |   2 +-
 7 files changed, 256 insertions(+), 248 deletions(-)
 delete mode 100644 R/add_attributes_as_columns.R

diff --git a/R/add_attributes_as_columns.R b/R/add_attributes_as_columns.R
deleted file mode 100644
index f1cd461f..00000000
--- a/R/add_attributes_as_columns.R
+++ /dev/null
@@ -1,237 +0,0 @@
-#' Add attributes as columns (vectorized, in-place)
-#'
-#' @description
-#' Converts survey attributes on a `data.table`—including
-#' `reporting_level_rows`, `country_code`, `reporting_year`, and `dist_stats`—
-#' into columns using a **loop-free, segment-replication** strategy. Designed
-#' for very large tables and objects loaded via `readRDS()`/`load()`:
-#' uses `setDT()` and `alloc.col()` to ensure in-place assignment.
-#'
-#' @details
-#' The function expects an attribute `reporting_level_rows`, a list with:
-#' - `reporting_level`: character vector of the level label for each segment
-#'   (e.g., `c("rural","urban","rural", ...)`).
-#' - `rows`: integer vector of **cumulative** row-ends (e.g.,
-#'   `c(100000, 200000, 300000, ...)`).
-#'
-#' Segment lengths are computed as `diff(c(0L, rows))`, and `reporting_level`
-#' is replicated with `rep.int(lev, counts)`. Constants `country_code`,
-#' `reporting_year`, and `file` (`paste0(country_code, "_", reporting_year)`)
-#' are added to all rows. If `dist_stats$mean` / `dist_stats$median` are
-#' provided (as named vectors/lists keyed by level), they are mapped by level
-#' name and replicated per segment. If a level is missing from the names,
-#' `NA` values may result for that segment.
-#'
-#' This implementation avoids loops and `findInterval()` edge cases, and
-#' modifies `dt` by reference.
-#'
-#' @param dt A `data.table` carrying the attributes described above.
-#'
-#' @return The same `data.table`, modified by reference, with added columns:
-#'   `reporting_level`, `country_code`, `reporting_year`, `file`, and (if
-#'   present) `mean`, `median`.
-#'
-#' @section Assumptions:
-#' * `length(reporting_level_rows$reporting_level) == length(reporting_level_rows$rows)`.
-#' * `rows` are cumulative and non-decreasing, and their segment lengths sum to `nrow(dt)`.
-#' * If `dist_stats$mean` / `dist_stats$median` have multiple values, their names
-#'   align with the level labels.
-#'
-#' @note For objects loaded from disk (e.g., via `readRDS()`), `alloc.col(dt)`
-#'       ensures there is spare column capacity for by-reference assignment.
-#'
-#' @seealso [add_attributes_as_columns_multi()], [assign_stat()]
-#'
-#' @examples
-#' \dontrun{
-#' library(data.table)
-#' dt <- data.table(weight = 1:6, welfare = runif(6))
-#' attr(dt, "reporting_level_rows") <- list(
-#'   reporting_level = c("rural","urban","rural"),
-#'   rows = c(2L, 4L, 6L)
-#' )
-#' attr(dt, "country_code")   <- "XXY"
-#' attr(dt, "reporting_year") <- 2000L
-#' attr(dt, "dist_stats") <- list(
-#'   mean   = list(rural = 2.5, urban = 5.0),
-#'   median = list(rural = 2.0, urban = 4.5)
-#' )
-#'
-#' add_attributes_as_columns_vectorized(dt)
-#' head(dt)
-#' }
-#'
-#' @import data.table
-#' @export
-add_attributes_as_columns_vectorized <- function(dt) {
-
-  # Ensure proper internal state & spare column capacity (handles readRDS/load cases)
-  setDT(dt)        # harmless if already a data.table
-  alloc.col(dt)    # pre-allocate room for new columns
-
-  rl   <- attr(dt, "reporting_level_rows")
-  lev  <- rl$reporting_level
-  rows <- as.integer(rl$rows)
-  n    <- nrow(dt)
-
-  counts <- diff(c(0L, rows))
-  if (sum(counts) != n) cli::cli_abort("Sum of 'rows' in attribute does not equal nrow(dt).")
-
-  # reporting_level: vectorized, no loops, no findInterval edge cases
-  dt[,
-     reporting_level := rep.int(lev, counts)]
-
-  # constants
-  cc <- attr(dt, "country_code")
-  ry <- attr(dt, "reporting_year")
-  dt[, `:=`(
-    country_code   = cc,
-    reporting_year = ry,
-    file           = paste0(cc, "_", ry)
-  )]
-
-  # dist_stats per reporting_level (align by names, then replicate by counts)
-  ds <- attr(dt, "dist_stats")
-  if (length(ds)) {
-    if (!is.null(ds$mean)) {
-      m <- unlist(ds$mean, use.names = TRUE)
-      dt[,
-         mean := rep.int(unname(m[match(lev,
-                                        names(m))]),
-                         counts)]
-    }
-    if (!is.null(ds$median)) {
-      md <- unlist(ds$median, use.names = TRUE)
-      dt[,
-         median := rep.int(unname(md[match(lev,
-                                           names(md))]),
-                           counts)]
-    }
-  }
-
-  dt
-}
-
-
-
-#' Add attributes as columns for multi-segment reporting levels
-#'
-#' @description
-#' Converts attributes on a survey `data.table` (e.g., `reporting_level_rows`,
-#' `country_code`, `reporting_year`, and `dist_stats`) into columns, handling
-#' **multiple alternating segments** (e.g., CHN rural/urban/rural/urban) or
-#' single-segment cases (e.g., ZAF).
-#'
-#' @param dt A `data.table` with attributes:
-#'   - `reporting_level_rows`: list with `reporting_level` (character) and
-#'     `rows` (integer cumulative row ends).
-#'   - `country_code` (character).
-#'   - `reporting_year` (integer/numeric).
-#'   - `dist_stats` (list) optionally containing `mean` and/or `median`, each as
-#'     a named list/vector keyed by reporting level, or a single scalar.
-#'
-#' @return The same `data.table`, modified by reference, with new columns:
-#'   `reporting_level`, `country_code`, `reporting_year`, `file`, and
-#'   optionally `mean`, `median`.
-#'
-#' @examples
-#' # chn2000_cols <- add_attributes_as_columns_multi(chn2000)
-#' # zaf2000_cols <- add_attributes_as_columns_multi(zaf2000)
-#' @import data.table
-#' @export
-add_attributes_as_columns_multi <- function(dt) {
-  # Ensure DT internals and spare capacity for new columns
-  setDT(dt)
-  alloc.col(dt)
-
-  # --- Pull + validate segment metadata ---
-  rl <- attr(dt, "reporting_level_rows")
-  if (is.null(rl) || is.null(rl$reporting_level) || is.null(rl$rows)) {
-    cli::cli_abort("Missing 'reporting_level_rows' attribute with $reporting_level and $rows.")
-  }
-  lev  <- as.character(rl$reporting_level)
-  rows <- as.integer(rl$rows)
-  n    <- nrow(dt)
-
-  if (length(lev) != length(rows)) cli::cli_abort("'reporting_level' and 'rows' lengths differ.")
-  if (length(rows) == 0L) cli::cli_abort("'rows' is empty.")
-  if (any(diff(rows) < 0L)) cli::cli_abort("'rows' must be non-decreasing.")
-  if (rows[length(rows)] != n) cli::cli_abort("Last element of 'rows' must equal nrow(dt).")
-
-  counts <- diff(c(0L, rows))
-  if (any(counts <= 0L)) cli::cli_abort("Computed non-positive segment length(s).")
-
-  # --- reporting_level: vectorized per-segment replication ---
-  dt[, reporting_level := rep.int(lev, counts)]
-
-  # --- constants ---
-  cc <- attr(dt, "country_code")
-  ry <- attr(dt, "reporting_year")
-  dt[, `:=`(
-    country_code   = cc,
-    reporting_year = ry,
-    file           = paste0(cc, "_", ry)
-  )]
-
-  # --- distribution stats ---
-  ds <- attr(dt, "dist_stats")
-  if (length(ds)) {
-    assign_stat(dt, lev, counts, ds$mean,   "mean")
-    assign_stat(dt, lev, counts, ds$median, "median")
-  }
-
-  dt
-}
-
-
-
-#' Assign a per-level statistic to a data.table column (by reference)
-#'
-#' @description
-#' Replicates a statistic per reporting-level segment and assigns it to a new
-#' column in `dt`, **in place**. `stat` can be a scalar (broadcast), a named
-#' vector, or a named list (one value per level).
-#'
-#' @param dt A `data.table`. Modified by reference.
-#' @param lev Character vector of reporting-level labels per segment
-#'   (e.g., `c("rural","urban","rural", ...)`).
-#' @param counts Integer vector of segment lengths matching `lev`
-#'   (e.g., `c(100000, 100000, 100000, ...)`).
-#' @param stat A numeric scalar, named vector, or named list with one value per
-#'   level (names must match `lev` values).
-#' @param colname Name of the column to create/overwrite.
-#'
-#' @return Invisibly returns `dt` (modified by reference).
-#' @examples
-#' # assign_stat(dt, lev, counts, list(rural = 2.6, urban = 5.5), "mean")
-#' @import data.table
-#' @export
-assign_stat <- function(dt, lev, counts, stat, colname) {
-  if (is.null(stat)) return(invisible(dt))
-  n <- nrow(dt)
-
-  v <- if (is.list(stat)) unlist(stat, use.names = TRUE) else stat
-
-  # Single scalar: broadcast
-  if (length(v) == 1L && is.null(names(v))) {
-    dt[, (colname) := rep.int(unname(v), n)]
-    return(invisible(dt))
-  }
-
-  # Need names to map values to levels
-  if (is.null(names(v))) {
-    stop("`stat` has length > 1 but no names; cannot map to levels.")
-  }
-
-  map_idx <- match(lev, names(v))
-  if (anyNA(map_idx)) {
-    missing_levels <- unique(lev[is.na(map_idx)])
-    stop(
-      sprintf("`stat` missing value(s) for level(s): %s",
-              paste(missing_levels, collapse = ", "))
-    )
-  }
-
-  dt[, (colname) := rep.int(unname(v[map_idx]), counts)]
-  invisible(dt)
-}
diff --git a/R/compute_fgt_new.R b/R/compute_fgt_new.R
index f0ce396d..0e066e85 100644
--- a/R/compute_fgt_new.R
+++ b/R/compute_fgt_new.R
@@ -31,7 +31,8 @@ compute_fgt_dt <- function(dt, welfare, weight, povlines, mean_and_med = FALSE)
     pov <- povlines[i]
     poor <- w < pov
     rel_dist <- 1 - (w / pov)
-    rel_dist[!poor] <- 0
+    setv(rel_dist, poor, 0, invert = TRUE)
+    # rel_dist[!poor] <- 0
     res[i, 1] <- fmean(poor, w = wt) # FGT0
     res[i, 2] <- fmean(rel_dist, w = wt) # FGT1
     res[i, 3] <- fmean(rel_dist^2, w = wt) # FGT2
@@ -46,10 +47,10 @@ compute_fgt_dt <- function(dt, welfare, weight, povlines, mean_and_med = FALSE)
   }
 
   if (mean_and_med) {
-    mn  <- funique(dt$mean)
-    med <- funique(dt$median)
-    cy  <- funique(dt$coutnry_code)
-    ry  <- funique(dt$reporting_year)
+    mn  <- ffirst(dt$mean)
+    med <- ffirst(dt$median)
+    cy  <- ffirst(dt$coutnry_code)
+    ry  <- ffirst(dt$reporting_year)
     out <- data.table(
       povline          = povlines,
       headcount        = res[, 1],
diff --git a/R/fg_pip.R b/R/fg_pip.R
index e8c3edb2..f177a995 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -75,9 +75,8 @@ fg_pip <- function(country,
   res <- lapply(lt,
                 process_dt,
                 povline      = povline,
-                mean_and_med = TRUE)
-  res <- rbindlist(res,
-                   fill = TRUE)
+                mean_and_med = TRUE) |>
+  rbindlist(fill = TRUE)
 
   # ZP Add: join to metadata
   #-------------------------
diff --git a/R/utils-pipdata.R b/R/utils-pipdata.R
index 16b996ce..6c961cd3 100644
--- a/R/utils-pipdata.R
+++ b/R/utils-pipdata.R
@@ -47,3 +47,248 @@ transform_input <- function(input_list){
   output_list <- unlist(output_list, recursive = FALSE)
   return(output_list)
 }
+
+
+#' Add attributes as columns (vectorized, in-place)
+#'
+#' @description
+#' Converts survey attributes on a `data.table`—including
+#' `reporting_level_rows`, `country_code`, `reporting_year`, and `dist_stats`—
+#' into columns using a **loop-free, segment-replication** strategy. Designed
+#' for very large tables and objects loaded via `readRDS()`/`load()`:
+#' uses `setDT()` and `alloc.col()` to ensure in-place assignment.
+#'
+#' @details
+#' The function expects an attribute `reporting_level_rows`, a list with:
+#' - `reporting_level`: character vector of the level label for each segment
+#'   (e.g., `c("rural","urban","rural", ...)`).
+#' - `rows`: integer vector of **cumulative** row-ends (e.g.,
+#'   `c(100000, 200000, 300000, ...)`).
+#'
+#' Segment lengths are computed as `diff(c(0L, rows))`, and `reporting_level`
+#' is replicated with `rep.int(lev, counts)`. Constants `country_code`,
+#' `reporting_year`, and `file` (`paste0(country_code, "_", reporting_year)`)
+#' are added to all rows. If `dist_stats$mean` / `dist_stats$median` are
+#' provided (as named vectors/lists keyed by level), they are mapped by level
+#' name and replicated per segment. If a level is missing from the names,
+#' `NA` values may result for that segment.
+#'
+#' This implementation avoids loops and `findInterval()` edge cases, and
+#' modifies `dt` by reference.
+#'
+#' @param dt A `data.table` carrying the attributes described above.
+#'
+#' @return The same `data.table`, modified by reference, with added columns:
+#'   `reporting_level`, `country_code`, `reporting_year`, `file`, and (if
+#'   present) `mean`, `median`.
+#'
+#' @section Assumptions:
+#' * `length(reporting_level_rows$reporting_level) == length(reporting_level_rows$rows)`.
+#' * `rows` are cumulative and non-decreasing, and their segment lengths sum to `nrow(dt)`.
+#' * If `dist_stats$mean` / `dist_stats$median` have multiple values, their names
+#'   align with the level labels.
+#'
+#' @note For objects loaded from disk (e.g., via `readRDS()`), `alloc.col(dt)`
+#'       ensures there is spare column capacity for by-reference assignment.
+#'
+#' @seealso [add_attributes_as_columns_multi()], [assign_stat()]
+#'
+#' @examples
+#' \dontrun{
+#' library(data.table)
+#' dt <- data.table(weight = 1:6, welfare = runif(6))
+#' attr(dt, "reporting_level_rows") <- list(
+#'   reporting_level = c("rural","urban","rural"),
+#'   rows = c(2L, 4L, 6L)
+#' )
+#' attr(dt, "country_code")   <- "XXY"
+#' attr(dt, "reporting_year") <- 2000L
+#' attr(dt, "dist_stats") <- list(
+#'   mean   = list(rural = 2.5, urban = 5.0),
+#'   median = list(rural = 2.0, urban = 4.5)
+#' )
+#'
+#' add_attributes_as_columns_vectorized(dt)
+#' head(dt)
+#' }
+#'
+#' @import data.table
+#' @export
+add_attributes_as_columns_vectorized <- function(dt) {
+
+  # Ensure proper internal state & spare column capacity (handles readRDS/load cases)
+  setDT(dt)        # harmless if already a data.table
+  alloc.col(dt)    # pre-allocate room for new columns
+
+  rl   <- attr(dt, "reporting_level_rows")
+  lev  <- rl$reporting_level
+  rows <- as.integer(rl$rows)
+  n    <- fnrow(dt)
+
+  counts <- diff(c(0L, rows))
+  if (sum(counts) != n) cli::cli_abort("Sum of 'rows' in attribute does not equal nrow(dt).")
+
+  # reporting_level: optimized assignment by range
+  reporting_level_vec <- character(n)
+  start <- 1L
+  for (i in seq_along(lev)) {
+    end <- rows[i]
+    reporting_level_vec[start:end] <- lev[i]
+    start <- end + 1L
+  }
+  dt[, reporting_level := reporting_level_vec]
+
+  # constants
+  cc <- attr(dt, "country_code")
+  ry <- attr(dt, "reporting_year")
+  dt[, `:=`(
+    country_code   = cc,
+    reporting_year = ry,
+    file           = paste0(cc, "_", ry)
+  )]
+
+  # dist_stats per reporting_level (align by names, then replicate by counts)
+  ds <- attr(dt, "dist_stats")
+  if (length(ds)) {
+    if (!is.null(ds$mean)) {
+      m <- unlist(ds$mean, use.names = TRUE)
+      dt[,
+         mean := rep.int(unname(m[match(lev,
+                                        names(m))]),
+                         counts)]
+    }
+    if (!is.null(ds$median)) {
+      md <- unlist(ds$median, use.names = TRUE)
+      dt[,
+         median := rep.int(unname(md[match(lev,
+                                           names(md))]),
+                           counts)]
+    }
+  }
+
+  dt
+}
+
+
+
+#' Add attributes as columns for multi-segment reporting levels
+#'
+#' @description
+#' Converts attributes on a survey `data.table` (e.g., `reporting_level_rows`,
+#' `country_code`, `reporting_year`, and `dist_stats`) into columns, handling
+#' **multiple alternating segments** (e.g., CHN rural/urban/rural/urban) or
+#' single-segment cases (e.g., ZAF).
+#'
+#' @param dt A `data.table` with attributes:
+#'   - `reporting_level_rows`: list with `reporting_level` (character) and
+#'     `rows` (integer cumulative row ends).
+#'   - `country_code` (character).
+#'   - `reporting_year` (integer/numeric).
+#'   - `dist_stats` (list) optionally containing `mean` and/or `median`, each as
+#'     a named list/vector keyed by reporting level, or a single scalar.
+#'
+#' @return The same `data.table`, modified by reference, with new columns:
+#'   `reporting_level`, `country_code`, `reporting_year`, `file`, and
+#'   optionally `mean`, `median`.
+#'
+#' @examples
+#' # chn2000_cols <- add_attributes_as_columns_multi(chn2000)
+#' # zaf2000_cols <- add_attributes_as_columns_multi(zaf2000)
+#' @import data.table
+#' @export
+add_attributes_as_columns_multi <- function(dt) {
+  # Ensure DT internals and spare capacity for new columns
+  setDT(dt)
+  alloc.col(dt)
+
+  # --- Pull + validate segment metadata ---
+  rl <- attr(dt, "reporting_level_rows")
+  if (is.null(rl) || is.null(rl$reporting_level) || is.null(rl$rows)) {
+    cli::cli_abort("Missing 'reporting_level_rows' attribute with $reporting_level and $rows.")
+  }
+  lev  <- as.character(rl$reporting_level)
+  rows <- as.integer(rl$rows)
+  n    <- nrow(dt)
+
+  if (length(lev) != length(rows)) cli::cli_abort("'reporting_level' and 'rows' lengths differ.")
+  if (length(rows) == 0L) cli::cli_abort("'rows' is empty.")
+  if (any(diff(rows) < 0L)) cli::cli_abort("'rows' must be non-decreasing.")
+  if (rows[length(rows)] != n) cli::cli_abort("Last element of 'rows' must equal nrow(dt).")
+
+  counts <- diff(c(0L, rows))
+  if (any(counts <= 0L)) cli::cli_abort("Computed non-positive segment length(s).")
+
+  # --- reporting_level: vectorized per-segment replication ---
+  dt[, reporting_level := rep.int(lev, counts)]
+
+  # --- constants ---
+  cc <- attr(dt, "country_code")
+  ry <- attr(dt, "reporting_year")
+  dt[, `:=`(
+    country_code   = cc,
+    reporting_year = ry,
+    file           = paste0(cc, "_", ry)
+  )]
+
+  # --- distribution stats ---
+  ds <- attr(dt, "dist_stats")
+  if (length(ds)) {
+    assign_stat(dt, lev, counts, ds$mean,   "mean")
+    assign_stat(dt, lev, counts, ds$median, "median")
+  }
+
+  dt
+}
+
+
+
+#' Assign a per-level statistic to a data.table column (by reference)
+#'
+#' @description
+#' Replicates a statistic per reporting-level segment and assigns it to a new
+#' column in `dt`, **in place**. `stat` can be a scalar (broadcast), a named
+#' vector, or a named list (one value per level).
+#'
+#' @param dt A `data.table`. Modified by reference.
+#' @param lev Character vector of reporting-level labels per segment
+#'   (e.g., `c("rural","urban","rural", ...)`).
+#' @param counts Integer vector of segment lengths matching `lev`
+#'   (e.g., `c(100000, 100000, 100000, ...)`).
+#' @param stat A numeric scalar, named vector, or named list with one value per
+#'   level (names must match `lev` values).
+#' @param colname Name of the column to create/overwrite.
+#'
+#' @return Invisibly returns `dt` (modified by reference).
+#' @examples
+#' # assign_stat(dt, lev, counts, list(rural = 2.6, urban = 5.5), "mean")
+#' @import data.table
+#' @export
+assign_stat <- function(dt, lev, counts, stat, colname) {
+  if (is.null(stat)) return(invisible(dt))
+  n <- nrow(dt)
+
+  v <- if (is.list(stat)) unlist(stat, use.names = TRUE) else stat
+
+  # Single scalar: broadcast
+  if (length(v) == 1L && is.null(names(v))) {
+    dt[, (colname) := rep.int(unname(v), n)]
+    return(invisible(dt))
+  }
+
+  # Need names to map values to levels
+  if (is.null(names(v))) {
+    stop("`stat` has length > 1 but no names; cannot map to levels.")
+  }
+
+  map_idx <- match(lev, names(v))
+  if (anyNA(map_idx)) {
+    missing_levels <- unique(lev[is.na(map_idx)])
+    stop(
+      sprintf("`stat` missing value(s) for level(s): %s",
+              paste(missing_levels, collapse = ", "))
+    )
+  }
+
+  dt[, (colname) := rep.int(unname(v[map_idx]), counts)]
+  invisible(dt)
+}
diff --git a/man/add_attributes_as_columns_multi.Rd b/man/add_attributes_as_columns_multi.Rd
index 7df2c2ed..27a0d5a7 100644
--- a/man/add_attributes_as_columns_multi.Rd
+++ b/man/add_attributes_as_columns_multi.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/add_attributes_as_columns.R
+% Please edit documentation in R/utils-pipdata.R
 \name{add_attributes_as_columns_multi}
 \alias{add_attributes_as_columns_multi}
 \title{Add attributes as columns for multi-segment reporting levels}
diff --git a/man/add_attributes_as_columns_vectorized.Rd b/man/add_attributes_as_columns_vectorized.Rd
index 66c51a00..40e8008d 100644
--- a/man/add_attributes_as_columns_vectorized.Rd
+++ b/man/add_attributes_as_columns_vectorized.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/add_attributes_as_columns.R
+% Please edit documentation in R/utils-pipdata.R
 \name{add_attributes_as_columns_vectorized}
 \alias{add_attributes_as_columns_vectorized}
 \title{Add attributes as columns (vectorized, in-place)}
diff --git a/man/assign_stat.Rd b/man/assign_stat.Rd
index 987b08f4..36cb81b9 100644
--- a/man/assign_stat.Rd
+++ b/man/assign_stat.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/add_attributes_as_columns.R
+% Please edit documentation in R/utils-pipdata.R
 \name{assign_stat}
 \alias{assign_stat}
 \title{Assign a per-level statistic to a data.table column (by reference)}

From ed78664e3e34574321719d5d9fc6d6d0b89638b7 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Fri, 22 Aug 2025 18:00:49 -0400
Subject: [PATCH 056/203] slight optimization

---
 R/fg_pip.R        |  9 +++++----
 R/utils-pipdata.R | 20 +++++++++-----------
 2 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/R/fg_pip.R b/R/fg_pip.R
index f177a995..1d824164 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -72,10 +72,11 @@ fg_pip <- function(country,
 
   # ZP Add: do fgt estimations using `res <- lapply(lt, process_dt, povline = povline)`
   #-------------------------
-  res <- lapply(lt,
-                process_dt,
-                povline      = povline,
-                mean_and_med = TRUE) |>
+  res <- lapply(lt, \(x) {
+    process_dt(x,
+               povline      = povline,
+               mean_and_med = TRUE)
+    }) |>
   rbindlist(fill = TRUE)
 
   # ZP Add: join to metadata
diff --git a/R/utils-pipdata.R b/R/utils-pipdata.R
index 6c961cd3..5e76f2c0 100644
--- a/R/utils-pipdata.R
+++ b/R/utils-pipdata.R
@@ -118,7 +118,7 @@ add_attributes_as_columns_vectorized <- function(dt) {
 
   # Ensure proper internal state & spare column capacity (handles readRDS/load cases)
   setDT(dt)        # harmless if already a data.table
-  alloc.col(dt)    # pre-allocate room for new columns
+  setalloccol(dt)    # pre-allocate room for new columns... #AC, I am still not sure about this.
 
   rl   <- attr(dt, "reporting_level_rows")
   lev  <- rl$reporting_level
@@ -151,18 +151,16 @@ add_attributes_as_columns_vectorized <- function(dt) {
   ds <- attr(dt, "dist_stats")
   if (length(ds)) {
     if (!is.null(ds$mean)) {
-      m <- unlist(ds$mean, use.names = TRUE)
-      dt[,
-         mean := rep.int(unname(m[match(lev,
-                                        names(m))]),
-                         counts)]
+      for (l in lev) {
+        dt[reporting_level == l,
+           mean := ds$mean[[l]]]
+      }
     }
     if (!is.null(ds$median)) {
-      md <- unlist(ds$median, use.names = TRUE)
-      dt[,
-         median := rep.int(unname(md[match(lev,
-                                           names(md))]),
-                           counts)]
+      for (l in lev) {
+        dt[reporting_level == l,
+           median := ds$median[[l]]]
+      }
     }
   }
 

From 563cc34cef7ebc840a2cbe3b563bcb24a158ffb8 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Fri, 22 Aug 2025 18:24:40 -0400
Subject: [PATCH 057/203] more optimization

---
 R/utils-pipdata.R | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/R/utils-pipdata.R b/R/utils-pipdata.R
index 5e76f2c0..57f01c96 100644
--- a/R/utils-pipdata.R
+++ b/R/utils-pipdata.R
@@ -149,18 +149,22 @@ add_attributes_as_columns_vectorized <- function(dt) {
 
   # dist_stats per reporting_level (align by names, then replicate by counts)
   ds <- attr(dt, "dist_stats")
+
+
+
   if (length(ds)) {
-    if (!is.null(ds$mean)) {
-      for (l in lev) {
-        dt[reporting_level == l,
-           mean := ds$mean[[l]]]
-      }
-    }
-    if (!is.null(ds$median)) {
-      for (l in lev) {
-        dt[reporting_level == l,
-           median := ds$median[[l]]]
-      }
+    dstats <- c("mean", "median")
+    for (l in lev) {
+      wrl <- whichv(dt$reporting_level, l)
+      ld <- lapply(dstats, \(d) {
+        if (is.null(ds[[d]][[l]])) {
+          NA
+        } else {
+          ds[[d]][[l]]
+        }
+      })
+
+      dt[wrl, (dstats) := ld]
     }
   }
 

From 95e9f21689837b9d2682ae71df4e5e6dd4815e7f Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Fri, 22 Aug 2025 19:08:06 -0400
Subject: [PATCH 058/203] init reformat

---
 R/compute_fgt_new.R | 57 +++++++++++++++++++++++++++++++++++++++++++++
 R/fg_pip.R          | 14 ++++++++---
 R/utils-pipdata.R   | 19 +++++++--------
 3 files changed, 77 insertions(+), 13 deletions(-)

diff --git a/R/compute_fgt_new.R b/R/compute_fgt_new.R
index 0e066e85..f4d0135c 100644
--- a/R/compute_fgt_new.R
+++ b/R/compute_fgt_new.R
@@ -75,6 +75,63 @@ compute_fgt_dt <- function(dt, welfare, weight, povlines, mean_and_med = FALSE)
 }
 
 
+
+# Efficient FGT calculation for vectors (No data.table)
+#'
+#' @param w character: welfare variable name
+#' @param wt character: weight variable name
+#' @param povlines double: vector with poverty lines
+#'
+#' @return data.table with estimates poverty estimates
+#' @keywords internal
+compute_fgt <- function(w, wt, povlines) {
+  n   <- length(w)
+  m   <- length(povlines)
+
+  # Pre-allocate result matrix
+  res <- matrix(NA_real_, nrow = m, ncol = 3)
+  colnames(res) <- c("FGT0", "FGT1", "FGT2")
+  watts_vec <- numeric(m)
+
+  # Precompute log(w) for efficiency (vectorized)
+
+  pos  <- w > 0
+  # logw <- log(w)
+  logw <- copyv(log(w), pos, NA_real_, invert = TRUE) |>
+    suppressWarnings()
+  # logw <- fifelse(w > 0, log(w), NA_real_)
+
+  for (i in seq_along(povlines)) {
+    pov <- povlines[i]
+    poor <- w < pov
+    rel_dist <- 1 - (w / pov)
+    setv(rel_dist, poor, 0, invert = TRUE)
+    # rel_dist[!poor] <- 0
+    res[i, 1] <- fmean(poor, w = wt) # FGT0
+    res[i, 2] <- fmean(rel_dist, w = wt) # FGT1
+    res[i, 3] <- fmean(rel_dist^2, w = wt) # FGT2
+
+    # Optimized Watts index calculation
+    keep <- poor & pos
+    if (any(keep, na.rm = TRUE)) {
+      watts_vec[i] <- (fsum((log(pov) - logw[keep]) * wt[keep])) / fsum(wt)
+    } else {
+      watts_vec[i] <- 0
+    }
+  }
+
+
+  data.table(
+      povline          = povlines,
+      headcount        = res[, 1],
+      poverty_gap      = res[, 2],
+      poverty_severity = res[, 3],
+      watts            = watts_vec)
+
+}
+
+
+
 process_dt <- function(dt, povline, mean_and_med = FALSE) {
   dt[, compute_fgt_dt(.SD, "welfare", "weight", povline, mean_and_med),
      by = .(file, reporting_level)]
diff --git a/R/fg_pip.R b/R/fg_pip.R
index 1d824164..a2dc64a0 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -66,9 +66,17 @@ fg_pip <- function(country,
     load_list_refy(input_list = full_list,
                    path       = fs::path(data_dir, "lineup_data"))
 
-  lt <- lapply(lt, \(x) {
-                   add_attributes_as_columns_vectorized(x)
-               })
+  # lt <- lapply(lt, \(x) {
+  #                  add_attributes_as_columns_vectorized(x)
+  #              })
+
+  lt_att <- lapply(lt, \(.) {
+    list(
+      dist_stats = attr(., "dt_dist_stats"),
+      rl_rows    = attr(., "reporting_level_rows")
+    )
+  })
+
 
   # ZP Add: do fgt estimations using `res <- lapply(lt, process_dt, povline = povline)`
   #-------------------------
diff --git a/R/utils-pipdata.R b/R/utils-pipdata.R
index 57f01c96..3459c1b9 100644
--- a/R/utils-pipdata.R
+++ b/R/utils-pipdata.R
@@ -8,16 +8,15 @@
 load_list_refy <- \(input_list, path){
   input_list <- transform_input(input_list)
 
-  dl <- lapply(input_list, FUN = function(x) {
-    qs::qread(file = fs::path(path, paste0(x$country_code, "_",
-                                                x$year),
-                              ext = "qs"))
-    })
+  inames <- lapply(input_list, \(x) {
+    paste0(x$country_code, "_", x$year)
+  })
+
+  dl <- lapply(inames, \(x) {
+    qs::qread(file = fs::path(path, x, ext = "qs"))
+    }) |>
+    setNames(inames)
 
-  names(dl) <- vapply(input_list, \(x) {
-    paste0(x$country_code, x$year)
-    },
-    FUN.VALUE = character(1))
   dl
 }
 
@@ -151,7 +150,7 @@ add_attributes_as_columns_vectorized <- function(dt) {
   ds <- attr(dt, "dist_stats")
 
 
-
+  # AC: I think this is not necessary at this stage. Can we merge at a later point?
   if (length(ds)) {
     dstats <- c("mean", "median")
     for (l in lev) {

From d5b3370642fdddbf2ee60c94f438649cf85f747c Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Fri, 22 Aug 2025 22:06:25 -0400
Subject: [PATCH 059/203] new optimization

---
 R/compute_fgt_new.R | 45 ++++++++++++++++++++++++++++++++++++++++++---
 R/fg_pip.R          | 19 +++++++------------
 R/utils-pipdata.R   | 45 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 94 insertions(+), 15 deletions(-)

diff --git a/R/compute_fgt_new.R b/R/compute_fgt_new.R
index f4d0135c..e78d8296 100644
--- a/R/compute_fgt_new.R
+++ b/R/compute_fgt_new.R
@@ -97,9 +97,9 @@ compute_fgt <- function(w, wt, povlines) {
 
   pos  <- w > 0
   # logw <- log(w)
-  logw <- copyv(log(w), pos, NA_real_, invert = TRUE) |>
-    suppressWarnings()
-  # logw <- fifelse(w > 0, log(w), NA_real_)
+  # logw <- copyv(log(w), pos, NA_real_, invert = TRUE) |>
+  #   suppressWarnings()
+  logw <- fifelse(w > 0, log(w), NA_real_)
 
   for (i in seq_along(povlines)) {
     pov <- povlines[i]
@@ -130,8 +130,47 @@ compute_fgt <- function(w, wt, povlines) {
 
 }
 
+#' compute FGT using indices by reporting level
+#'
+#' This function is intended to be used inside [map_fgt]
+#'
+#' @param x data.table from lt list, with welfare and weight vectors
+#' @param y list of indices for each reporting level
+#' @param nx name of data table. Usuall country code and year in the form "CCC_YYYY"
+#'
+#' @return data.table with FGT estimates by reporting level
+#' @keywords  internal
+DT_fgt_by_rl <- \(x, y, nx) {
+  DT_fgt <- lapply(names(y), \(rl) {
+
+    idx <- y[[rl]]
+    w   <- x[idx, welfare]
+    wt  <- x[idx, weight]
+    RL  <- compute_fgt(w = w, wt = wt, povlines = povline)
+    RL[, reporting_level := rl]
+
+  }) |>
+    rbindlist(fill = TRUE)
 
 
+  DT_fgt[, `:=`(
+    country_code   = gsub("([^_]+)(_.+)", "\\1", nx),
+    reporting_year = gsub("(.+_)([^_]+)", "\\2", nx)
+  )]
+}
+
+#' map over list of data.tables and indices to compute FGT by reporting_level
+#'
+#' @param lt list of data.tables with welfare and weight data
+#' @param l_rl_rows list of indeces
+#'
+#' @return data.table with all measured
+#' @keywords internal
+map_fgt <- \(lt, l_rl_rows) {
+  Map(DT_fgt_by_rl, lt, l_rl_rows, names(lt)) |>
+    rbindlist(fill = TRUE)
+}
+
 process_dt <- function(dt, povline, mean_and_med = FALSE) {
   dt[, compute_fgt_dt(.SD, "welfare", "weight", povline, mean_and_med),
      by = .(file, reporting_level)]
diff --git a/R/fg_pip.R b/R/fg_pip.R
index a2dc64a0..036c21c3 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -70,22 +70,17 @@ fg_pip <- function(country,
   #                  add_attributes_as_columns_vectorized(x)
   #              })
 
-  lt_att <- lapply(lt, \(.) {
-    list(
-      dist_stats = attr(., "dt_dist_stats"),
-      rl_rows    = attr(., "reporting_level_rows")
-    )
-  })
+  # Extract some attributes
+  lt_att <- get_lt_attr(lt)
+
+  # get rows indices
+  l_rl_rows <- get_rl_rows(lt_att)
+
 
 
   # ZP Add: do fgt estimations using `res <- lapply(lt, process_dt, povline = povline)`
   #-------------------------
-  res <- lapply(lt, \(x) {
-    process_dt(x,
-               povline      = povline,
-               mean_and_med = TRUE)
-    }) |>
-  rbindlist(fill = TRUE)
+  res <- map_fgt(lt, l_rl_rows)
 
   # ZP Add: join to metadata
   #-------------------------
diff --git a/R/utils-pipdata.R b/R/utils-pipdata.R
index 3459c1b9..45f65b24 100644
--- a/R/utils-pipdata.R
+++ b/R/utils-pipdata.R
@@ -293,3 +293,48 @@ assign_stat <- function(dt, lev, counts, stat, colname) {
   dt[, (colname) := rep.int(unname(v[map_idx]), counts)]
   invisible(dt)
 }
+
+
+
+
+
+#' extract rows indices
+#'
+#' @param a list with attributes from lt
+#'
+#' @return names list with indices for reporting level
+#' @keywords internal
+get_rl_rows_single <- function(a) {
+  rl <- a$rl_rows
+  rl_rows <- vector("list", length(rl$reporting_level))
+
+  start <- 1L
+  for (i in seq_along(rl$reporting_level)) {
+    end <- rl$rows[i]
+    rl_rows[[i]] <- start:end
+    start <- end + 1L
+  }
+  setNames(rl_rows, rl$reporting_level)
+}
+
+
+#' apply get_rl_rows_single
+#' @rdname get_rl_rows_single
+get_rl_rows <- \(lt_att) {
+  lapply(lt_att, get_rl_rows_single)
+}
+
+
+
+#' Get some attributes from lt lis
+#'
+#' @param lt list
+#' @keywords internal
+get_lt_attr <- function(lt) {
+  lapply(lt, \(.) {
+    list(
+      dist_stats = attr(., "dt_dist_stats"),
+      rl_rows    = attr(., "reporting_level_rows")
+    )
+  })
+}

From 0040957620aaa12fe483a193d9e272211d4feb6c Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Fri, 22 Aug 2025 22:24:23 -0400
Subject: [PATCH 060/203] optimize how to estimate FGT by reporting level

---
 R/fg_pip.R        | 25 ++++++-------------------
 R/utils-pipdata.R | 14 ++++++++++++++
 2 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/R/fg_pip.R b/R/fg_pip.R
index 036c21c3..b121b722 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -76,17 +76,14 @@ fg_pip <- function(country,
   # get rows indices
   l_rl_rows <- get_rl_rows(lt_att)
 
+  # get data.table with dist stats
+  dt_dist_stats <- get_dt_dist_stats(la_att)
 
 
   # ZP Add: do fgt estimations using `res <- lapply(lt, process_dt, povline = povline)`
   #-------------------------
   res <- map_fgt(lt, l_rl_rows)
 
-  # ZP Add: join to metadata
-  #-------------------------
-  metadata[,
-           file := basename(path)]
-
   # try metadata unique code
   tmp_metadata <- metadata
   # Handle multiple distribution types (for aggregated distributions)
@@ -112,25 +109,19 @@ fg_pip <- function(country,
   # Remove duplicate rows by reporting_year (keep only one row per
   # reporting_year)
   tmp_metadata_unique <- funique(tmp_metadata)
-  tmp_metadata_unique[,
-                      file := paste0(country_code,
-                                     "_",
-                                     reporting_year)]
+
 
   out <- join(res,
               tmp_metadata_unique,
-              on            = c("file",
+              on            = c("country_code", "reporting_year",
                                 "reporting_level"),
               how           = "left", # ZP: change from full to left,
                                       #  this rm nowcast years - i.e. years not included
                                       #  as lineup years
               validate      = "m:1",
               drop.dup.cols = TRUE,
-              verbose       = 0)
-
-  out[, `:=`(
-    file   = NULL
-  )]
+              verbose       = 0,
+              overid        = 2)
 
   setnames(out,
            "povline",
@@ -141,10 +132,6 @@ fg_pip <- function(country,
                               use_new_lineup_version = lkup$use_new_lineup_version)
 
 
-  # Fix issue with rounding of poverty lines
-  out[,
-      poverty_line := round(poverty_line, digits = 3) ]
-
   # Formatting. MUST be done in data.table tom modify by reference
   out[, path := as.character(path)]
 
diff --git a/R/utils-pipdata.R b/R/utils-pipdata.R
index 45f65b24..e3ee1f62 100644
--- a/R/utils-pipdata.R
+++ b/R/utils-pipdata.R
@@ -325,6 +325,20 @@ get_rl_rows <- \(lt_att) {
 }
 
 
+#' get data.table with distribution stats
+#'
+#' this is a loop over lt attributes
+#'
+#' @param lt_att list of attributes of lt list
+#'
+#' @return data.table
+#' @keywords internal
+get_dt_dist_stats <- \(lt_att) {
+  lapply(lt_att, \(.) {
+    .$dist_stats
+  }) |>
+    rbindlist(fill = TRUE)
+}
 
 #' Get some attributes from lt lis
 #'

From f8b718ce7e419c33191c7a585c6ea29730509642 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Fri, 22 Aug 2025 22:28:29 -0400
Subject: [PATCH 061/203] merge dist stats

---
 R/fg_pip.R | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/R/fg_pip.R b/R/fg_pip.R
index b121b722..0b6de622 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -77,12 +77,23 @@ fg_pip <- function(country,
   l_rl_rows <- get_rl_rows(lt_att)
 
   # get data.table with dist stats
-  dt_dist_stats <- get_dt_dist_stats(la_att)
+  dt_dist_stats <- get_dt_dist_stats(lt_att)
 
 
   # ZP Add: do fgt estimations using `res <- lapply(lt, process_dt, povline = povline)`
   #-------------------------
-  res <- map_fgt(lt, l_rl_rows)
+  fgt <- map_fgt(lt, l_rl_rows)
+
+  # add dist stats
+  res <- join(fgt, dt_dist_stats,
+              on            = c("country_code", "reporting_year",
+                                "reporting_level"),
+              how           = "left",
+              validate      = "m:1",
+              drop.dup.cols = TRUE,
+              verbose       = 0,
+              overid        = 2)
+
 
   # try metadata unique code
   tmp_metadata <- metadata
@@ -132,7 +143,7 @@ fg_pip <- function(country,
                               use_new_lineup_version = lkup$use_new_lineup_version)
 
 
-  # Formatting. MUST be done in data.table tom modify by reference
+  # Formatting. MUST be done in data.table to modify by reference
   out[, path := as.character(path)]
 
   if ("max_year" %in% names(out)) {

From 4b564d07717caae5ef823cc04c2352e1ed0f45f0 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Fri, 22 Aug 2025 22:54:39 -0400
Subject: [PATCH 062/203] update

---
 R/fg_pip.R | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/R/fg_pip.R b/R/fg_pip.R
index 0b6de622..df0987cd 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -94,9 +94,11 @@ fg_pip <- function(country,
               verbose       = 0,
               overid        = 2)
 
+  # convert reporting year to numeris
+  res[, reporting_year := as.numeric(reporting_year)]
 
   # try metadata unique code
-  tmp_metadata <- metadata
+  tmp_metadata <- copy(metadata) # I think we can avoid this inefficiency.
   # Handle multiple distribution types (for aggregated distributions)
   if (length(unique(tmp_metadata$distribution_type)) > 1) {
     tmp_metadata[, distribution_type := "mixed"]

From e362307091e8b0f498f8ead815da78ec54ba9ff2 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Sat, 23 Aug 2025 10:34:49 -0400
Subject: [PATCH 063/203] create pipenv enviroment

---
 R/fg_pip.R          | 34 ++++++++++++++++++++--------------
 R/pip_new_lineups.R |  6 ++++++
 2 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/R/fg_pip.R b/R/fg_pip.R
index df0987cd..86bdcebc 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -12,7 +12,8 @@ fg_pip <- function(country,
                    welfare_type,
                    reporting_level,
                    ppp,
-                   lkup) {
+                   lkup,
+                   pipenv = NULL) {
 
   valid_regions       <- lkup$query_controls$region$values
   interpolation_list  <- lkup$interpolation_list
@@ -76,23 +77,28 @@ fg_pip <- function(country,
   # get rows indices
   l_rl_rows <- get_rl_rows(lt_att)
 
-  # get data.table with dist stats
-  dt_dist_stats <- get_dt_dist_stats(lt_att)
+  if (exists("pipenv", envir = .GlobalEnv)) {
+    # get data.table with dist stats
+    dt_dist_stats <- get_dt_dist_stats(lt_att)
+    # Assign them to pipenv to use later
+    pipenv <- get("pipenv", envir = .GlobalEnv)
+    rlang::env_poke(pipenv, "dt_dist_stats", dt_dist_stats)
+  }
 
 
   # ZP Add: do fgt estimations using `res <- lapply(lt, process_dt, povline = povline)`
   #-------------------------
-  fgt <- map_fgt(lt, l_rl_rows)
-
-  # add dist stats
-  res <- join(fgt, dt_dist_stats,
-              on            = c("country_code", "reporting_year",
-                                "reporting_level"),
-              how           = "left",
-              validate      = "m:1",
-              drop.dup.cols = TRUE,
-              verbose       = 0,
-              overid        = 2)
+  res <- map_fgt(lt, l_rl_rows)
+
+  # # add dist stats
+  # res <- join(fgt, dt_dist_stats,
+  #             on            = c("country_code", "reporting_year",
+  #                               "reporting_level"),
+  #             how           = "left",
+  #             validate      = "m:1",
+  #             drop.dup.cols = TRUE,
+  #             verbose       = 0,
+  #             overid        = 2)
 
   # convert reporting year to numeris
   res[, reporting_year := as.numeric(reporting_year)]
diff --git a/R/pip_new_lineups.R b/R/pip_new_lineups.R
index aea166e5..cb51cd6b 100644
--- a/R/pip_new_lineups.R
+++ b/R/pip_new_lineups.R
@@ -76,6 +76,12 @@ pip_new_lineups <- function(country         = "ALL",
   povline         <- round(povline, digits = 3)
 
 
+  # Create temp environment
+  pipenv <- rlang::env()
+  assign("pipenv", pipenv, envir = .GlobalEnv)
+  on.exit(rm("pipenv", envir = .GlobalEnv), add = TRUE)
+
+
 
   # TEMPORARY UNTIL SELECTION MECHANISM IS BEING IMPROVED
   country <- toupper(country)

From 6d637b489be317d9469f634ab0da2be5b14d6963 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Sat, 23 Aug 2025 10:46:09 -0400
Subject: [PATCH 064/203] remove mean and median from dickdb

---
 R/duckdb_func.R | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/R/duckdb_func.R b/R/duckdb_func.R
index 663056c9..4758db15 100644
--- a/R/duckdb_func.R
+++ b/R/duckdb_func.R
@@ -217,8 +217,8 @@ update_master_file <- function(dat,
     keep_vars <- c(
       "interpolation_id",
       "poverty_line",
-      "mean",
-      "median",
+
+
       "headcount",
       "poverty_gap",
       "poverty_severity",
@@ -233,8 +233,8 @@ update_master_file <- function(dat,
       "cache_id",
       "reporting_level",
       "poverty_line",
-      "mean",
-      "median",
+
+
       "headcount",
       "poverty_gap",
       "poverty_severity",
@@ -339,8 +339,7 @@ create_duckdb_file <- function(cache_file_path) {
                  cache_id VARCHAR,
                  reporting_level   VARCHAR,
                  poverty_line   DOUBLE,
-                 mean   DOUBLE,
-                 median DOUBLE,
+
                  headcount    DOUBLE,
                  poverty_gap   DOUBLE,
                  poverty_severity  DOUBLE,
@@ -350,8 +349,7 @@ create_duckdb_file <- function(cache_file_path) {
   DBI::dbExecute(con, "CREATE OR REPLACE table fg_master_file (
                  interpolation_id VARCHAR,
                  poverty_line   DOUBLE,
-                 mean   DOUBLE,
-                 median DOUBLE,
+
                  headcount    DOUBLE,
                  poverty_gap   DOUBLE,
                  poverty_severity  DOUBLE,

From 0315985517c04b045f65f03ef8a3f687b467d4b5 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Sat, 23 Aug 2025 11:14:19 -0400
Subject: [PATCH 065/203] add just mean and median

---
 R/fg_pip.R    | 44 ++++++++++++++++++++++++++++++++++++--------
 man/fg_pip.Rd |  3 ++-
 2 files changed, 38 insertions(+), 9 deletions(-)

diff --git a/R/fg_pip.R b/R/fg_pip.R
index 86bdcebc..e4e575d7 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -77,18 +77,19 @@ fg_pip <- function(country,
   # get rows indices
   l_rl_rows <- get_rl_rows(lt_att)
 
-  if (exists("pipenv", envir = .GlobalEnv)) {
-    # get data.table with dist stats
-    dt_dist_stats <- get_dt_dist_stats(lt_att)
-    # Assign them to pipenv to use later
-    pipenv <- get("pipenv", envir = .GlobalEnv)
-    rlang::env_poke(pipenv, "dt_dist_stats", dt_dist_stats)
-  }
+  # if (exists("pipenv", envir = .GlobalEnv)) {
+  #   # get data.table with dist stats
+  #   dt_dist_stats <- get_dt_dist_stats(lt_att)
+  #   # Assign them to pipenv to use later
+  #   pipenv <- get("pipenv", envir = .GlobalEnv)
+  #   rlang::env_poke(pipenv, "dt_dist_stats", dt_dist_stats)
+  # }
 
 
   # ZP Add: do fgt estimations using `res <- lapply(lt, process_dt, povline = povline)`
   #-------------------------
-  res <- map_fgt(lt, l_rl_rows)
+  fgt <- map_fgt(lt, l_rl_rows) |>
+    funique() # TO REMOVE
 
   # # add dist stats
   # res <- join(fgt, dt_dist_stats,
@@ -103,6 +104,10 @@ fg_pip <- function(country,
   # convert reporting year to numeris
   res[, reporting_year := as.numeric(reporting_year)]
 
+  # Add just mean and median
+  res <- fg_get_mean_median(fgt, lkup)
+
+
   # try metadata unique code
   tmp_metadata <- copy(metadata) # I think we can avoid this inefficiency.
   # Handle multiple distribution types (for aggregated distributions)
@@ -318,3 +323,26 @@ create_full_list <- function(country, year, refy_lkup, data_present_in_master) {
   full_list
 
 }
+
+
+
+
+#' merge into fgt table the mean and median from dist stats table in lkup
+#'
+#' @param fgt data,table with fgt measures
+#' @param lkup lkup
+#'
+#' @return data.table with with fgt, mean and median
+#' @keywords internal
+fg_get_mean_median <- \(fgt, lkup) {
+  joyn::joyn(x = fgt,
+             y = lkup$lineup_dist_stats[,
+                                        .(country_code, reporting_year,
+                                          reporting_level, mean, median)],
+             by = c('country_code', "reporting_year", "reporting_level"),
+             match_type = "1:1",
+             keep = "left",
+             reportvar = FALSE,
+             verbose = FALSE)
+}
+
diff --git a/man/fg_pip.Rd b/man/fg_pip.Rd
index 9012b01b..17faee02 100644
--- a/man/fg_pip.Rd
+++ b/man/fg_pip.Rd
@@ -12,7 +12,8 @@ fg_pip(
   welfare_type,
   reporting_level,
   ppp,
-  lkup
+  lkup,
+  pipenv = NULL
 )
 }
 \arguments{

From f6bb8e4020670c23224ad574fae97c187422e7a9 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Sat, 23 Aug 2025 11:19:26 -0400
Subject: [PATCH 066/203] super efficient code now

---
 R/fg_pip.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/fg_pip.R b/R/fg_pip.R
index e4e575d7..2a4ddef3 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -102,7 +102,7 @@ fg_pip <- function(country,
   #             overid        = 2)
 
   # convert reporting year to numeris
-  res[, reporting_year := as.numeric(reporting_year)]
+  fgt[, reporting_year := as.numeric(reporting_year)]
 
   # Add just mean and median
   res <- fg_get_mean_median(fgt, lkup)

From bf9f87f096f7f50bb43c0c918497b3c387a12c74 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Sat, 23 Aug 2025 11:20:31 -0400
Subject: [PATCH 067/203] remove idea of the pipenv

---
 R/fg_pip.R          | 18 ------------------
 R/pip_new_lineups.R |  7 -------
 2 files changed, 25 deletions(-)

diff --git a/R/fg_pip.R b/R/fg_pip.R
index 2a4ddef3..dad368bc 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -77,30 +77,12 @@ fg_pip <- function(country,
   # get rows indices
   l_rl_rows <- get_rl_rows(lt_att)
 
-  # if (exists("pipenv", envir = .GlobalEnv)) {
-  #   # get data.table with dist stats
-  #   dt_dist_stats <- get_dt_dist_stats(lt_att)
-  #   # Assign them to pipenv to use later
-  #   pipenv <- get("pipenv", envir = .GlobalEnv)
-  #   rlang::env_poke(pipenv, "dt_dist_stats", dt_dist_stats)
-  # }
-
 
   # ZP Add: do fgt estimations using `res <- lapply(lt, process_dt, povline = povline)`
   #-------------------------
   fgt <- map_fgt(lt, l_rl_rows) |>
     funique() # TO REMOVE
 
-  # # add dist stats
-  # res <- join(fgt, dt_dist_stats,
-  #             on            = c("country_code", "reporting_year",
-  #                               "reporting_level"),
-  #             how           = "left",
-  #             validate      = "m:1",
-  #             drop.dup.cols = TRUE,
-  #             verbose       = 0,
-  #             overid        = 2)
-
   # convert reporting year to numeris
   fgt[, reporting_year := as.numeric(reporting_year)]
 
diff --git a/R/pip_new_lineups.R b/R/pip_new_lineups.R
index cb51cd6b..5b255ec6 100644
--- a/R/pip_new_lineups.R
+++ b/R/pip_new_lineups.R
@@ -76,13 +76,6 @@ pip_new_lineups <- function(country         = "ALL",
   povline         <- round(povline, digits = 3)
 
 
-  # Create temp environment
-  pipenv <- rlang::env()
-  assign("pipenv", pipenv, envir = .GlobalEnv)
-  on.exit(rm("pipenv", envir = .GlobalEnv), add = TRUE)
-
-
-
   # TEMPORARY UNTIL SELECTION MECHANISM IS BEING IMPROVED
   country <- toupper(country)
   if (is.character(year)) {

From aea71b72b8915554ed6b5997dc6e33d5fb688c05 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Sat, 23 Aug 2025 11:49:14 -0400
Subject: [PATCH 068/203] make sure it saves mean and median depending on the
 type of data.

---
 R/duckdb_func.R           | 12 ++++++++----
 R/fg_pip.R                |  2 +-
 man/DT_fgt_by_rl.Rd       | 22 ++++++++++++++++++++++
 man/fg_get_mean_median.Rd | 20 ++++++++++++++++++++
 man/get_dt_dist_stats.Rd  | 18 ++++++++++++++++++
 man/get_lt_attr.Rd        | 15 +++++++++++++++
 man/get_rl_rows_single.Rd | 23 +++++++++++++++++++++++
 man/map_fgt.Rd            | 20 ++++++++++++++++++++
 8 files changed, 127 insertions(+), 5 deletions(-)
 create mode 100644 man/DT_fgt_by_rl.Rd
 create mode 100644 man/fg_get_mean_median.Rd
 create mode 100644 man/get_dt_dist_stats.Rd
 create mode 100644 man/get_lt_attr.Rd
 create mode 100644 man/get_rl_rows_single.Rd
 create mode 100644 man/map_fgt.Rd

diff --git a/R/duckdb_func.R b/R/duckdb_func.R
index 4758db15..5071e1ca 100644
--- a/R/duckdb_func.R
+++ b/R/duckdb_func.R
@@ -217,8 +217,6 @@ update_master_file <- function(dat,
     keep_vars <- c(
       "interpolation_id",
       "poverty_line",
-
-
       "headcount",
       "poverty_gap",
       "poverty_severity",
@@ -233,8 +231,6 @@ update_master_file <- function(dat,
       "cache_id",
       "reporting_level",
       "poverty_line",
-
-
       "headcount",
       "poverty_gap",
       "poverty_severity",
@@ -242,6 +238,14 @@ update_master_file <- function(dat,
     )
   }
 
+  # Get column names from DuckDB table
+  table_info <- DBI::dbGetQuery(write_con, glue("PRAGMA table_info({target_file})"))
+  col_names  <- table_info$name
+  # Add mean and median if present in table
+  if (all(c("mean", "median") %in% col_names)) {
+    keep_vars <- c(keep_vars, "mean", "median")
+  }
+
   # Select variables
   dat <- dat[, ..keep_vars]
 
diff --git a/R/fg_pip.R b/R/fg_pip.R
index dad368bc..2ec21575 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -83,7 +83,7 @@ fg_pip <- function(country,
   fgt <- map_fgt(lt, l_rl_rows) |>
     funique() # TO REMOVE
 
-  # convert reporting year to numeris
+  # convert reporting year to numeric
   fgt[, reporting_year := as.numeric(reporting_year)]
 
   # Add just mean and median
diff --git a/man/DT_fgt_by_rl.Rd b/man/DT_fgt_by_rl.Rd
new file mode 100644
index 00000000..72663294
--- /dev/null
+++ b/man/DT_fgt_by_rl.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/compute_fgt_new.R
+\name{DT_fgt_by_rl}
+\alias{DT_fgt_by_rl}
+\title{compute FGT using indices by reporting level}
+\usage{
+DT_fgt_by_rl(x, y, nx)
+}
+\arguments{
+\item{x}{data.table from lt list, with welfare and weight vectors}
+
+\item{y}{list of indices for each reporting level}
+
+\item{nx}{name of data table. Usuall country code and year in the form "CCC_YYYY"}
+}
+\value{
+data.table with FGT estimates by reporting level
+}
+\description{
+This function is intended to be used inside \link{map_fgt}
+}
+\keyword{internal}
diff --git a/man/fg_get_mean_median.Rd b/man/fg_get_mean_median.Rd
new file mode 100644
index 00000000..88667176
--- /dev/null
+++ b/man/fg_get_mean_median.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/fg_pip.R
+\name{fg_get_mean_median}
+\alias{fg_get_mean_median}
+\title{merge into fgt table the mean and median from dist stats table in lkup}
+\usage{
+fg_get_mean_median(fgt, lkup)
+}
+\arguments{
+\item{fgt}{data,table with fgt measures}
+
+\item{lkup}{lkup}
+}
+\value{
+data.table with with fgt, mean and median
+}
+\description{
+merge into fgt table the mean and median from dist stats table in lkup
+}
+\keyword{internal}
diff --git a/man/get_dt_dist_stats.Rd b/man/get_dt_dist_stats.Rd
new file mode 100644
index 00000000..f59e6b9f
--- /dev/null
+++ b/man/get_dt_dist_stats.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils-pipdata.R
+\name{get_dt_dist_stats}
+\alias{get_dt_dist_stats}
+\title{get data.table with distribution stats}
+\usage{
+get_dt_dist_stats(lt_att)
+}
+\arguments{
+\item{lt_att}{list of attributes of lt list}
+}
+\value{
+data.table
+}
+\description{
+this is a loop over lt attributes
+}
+\keyword{internal}
diff --git a/man/get_lt_attr.Rd b/man/get_lt_attr.Rd
new file mode 100644
index 00000000..303808ab
--- /dev/null
+++ b/man/get_lt_attr.Rd
@@ -0,0 +1,15 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils-pipdata.R
+\name{get_lt_attr}
+\alias{get_lt_attr}
+\title{Get some attributes from lt lis}
+\usage{
+get_lt_attr(lt)
+}
+\arguments{
+\item{lt}{list}
+}
+\description{
+Get some attributes from lt lis
+}
+\keyword{internal}
diff --git a/man/get_rl_rows_single.Rd b/man/get_rl_rows_single.Rd
new file mode 100644
index 00000000..e11afce9
--- /dev/null
+++ b/man/get_rl_rows_single.Rd
@@ -0,0 +1,23 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils-pipdata.R
+\name{get_rl_rows_single}
+\alias{get_rl_rows_single}
+\alias{get_rl_rows}
+\title{extract rows indices}
+\usage{
+get_rl_rows_single(a)
+
+get_rl_rows(lt_att)
+}
+\arguments{
+\item{a}{list with attributes from lt}
+}
+\value{
+names list with indices for reporting level
+}
+\description{
+extract rows indices
+
+apply get_rl_rows_single
+}
+\keyword{internal}
diff --git a/man/map_fgt.Rd b/man/map_fgt.Rd
new file mode 100644
index 00000000..a192f3f2
--- /dev/null
+++ b/man/map_fgt.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/compute_fgt_new.R
+\name{map_fgt}
+\alias{map_fgt}
+\title{map over list of data.tables and indices to compute FGT by reporting_level}
+\usage{
+map_fgt(lt, l_rl_rows)
+}
+\arguments{
+\item{lt}{list of data.tables with welfare and weight data}
+
+\item{l_rl_rows}{list of indeces}
+}
+\value{
+data.table with all measured
+}
+\description{
+map over list of data.tables and indices to compute FGT by reporting_level
+}
+\keyword{internal}

From 84ab37fd44ec6fcd47cddf74473b5861690cb5a8 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Sat, 23 Aug 2025 12:22:42 -0400
Subject: [PATCH 069/203] update

---
 R/fg_pip.R | 2 +-
 R/utils.R  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/fg_pip.R b/R/fg_pip.R
index 2ec21575..07f0e9df 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -322,7 +322,7 @@ fg_get_mean_median <- \(fgt, lkup) {
                                         .(country_code, reporting_year,
                                           reporting_level, mean, median)],
              by = c('country_code', "reporting_year", "reporting_level"),
-             match_type = "1:1",
+             match_type = "m:1", # multiple povlines
              keep = "left",
              reportvar = FALSE,
              verbose = FALSE)
diff --git a/R/utils.R b/R/utils.R
index 65f2df74..33969942 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -304,7 +304,7 @@ add_dist_stats <- function(df, lkup, fill_gaps) {
                  by               = c("country_code",
                                       "reporting_level",
                                       "reporting_year"),
-                 match_type       = "1:1",
+                 match_type       = "m:1", # multiple poverty lines
                  keep_common_vars = FALSE,
                  reportvar        = FALSE,
                  verbose          = FALSE,

From 851b83c7d53548c2a36d534667c64ecad934bac5 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Sat, 23 Aug 2025 12:46:24 -0400
Subject: [PATCH 070/203] init new strcuture for aggregates

---
 NAMESPACE                            |  1 +
 R/pip_agg.R                          | 54 ++++++++++++++++++++++++++++
 R/pip_grp.R                          |  6 ++--
 R/pip_grp_logic.R                    |  6 ++--
 R/pip_grp_new.R                      | 14 ++++++++
 R/utils-pipdata.R                    |  3 +-
 inst/plumber/v1/endpoints.R          |  2 +-
 man/{pip_grp_logic.Rd => pip_agg.Rd} | 25 +++++++++++--
 man/pip_gg.Rd                        | 22 ++++++++++++
 9 files changed, 123 insertions(+), 10 deletions(-)
 create mode 100644 R/pip_agg.R
 create mode 100644 R/pip_grp_new.R
 rename man/{pip_grp_logic.Rd => pip_agg.Rd} (70%)
 create mode 100644 man/pip_gg.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 2ee13c9d..133f86b1 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -28,6 +28,7 @@ export(is_empty)
 export(is_forked)
 export(load_inter_cache)
 export(pip)
+export(pip_agg)
 export(pip_grp)
 export(pip_grp_logic)
 export(pip_new_lineups)
diff --git a/R/pip_agg.R b/R/pip_agg.R
new file mode 100644
index 00000000..5c4421dc
--- /dev/null
+++ b/R/pip_agg.R
@@ -0,0 +1,54 @@
+#' Logic for computing new aggregate
+#'
+#' @inheritParams pip
+#' @return data.table
+#' @examples
+#' \dontrun{
+#' # Create lkups
+#' }
+#' @export
+pip_agg <- function(country         = "ALL",
+                    year            = "ALL",
+                    povline         = 1.9,
+                    group_by        = c("wb", "none"),
+                    welfare_type    = c("all", "consumption", "income"),
+                    reporting_level = c("all", "national"),
+                    lkup,
+                    censor          = TRUE,
+                    lkup_hash       = lkup$cache_data_id$hash_pip_grp,
+                    additional_ind  = FALSE) {
+
+  # Should pip_old or pip_new be used?
+  #-------------------------------------
+  use_new <- lkup$use_new_lineup_version
+
+  # Run correct function
+  #-------------------------------------
+  out <- if (use_new) {
+    pip_grp_new(country         = country,
+                year            = year,
+                povline         = povline,
+                group_by        = group_by,
+                welfare_type    = welfare_type,
+                reporting_level = reporting_level,
+                lkup            = lkup,
+                censor          = censor,
+                lkup_hash       = lkup_hash,
+                additional_ind  = additional_ind)
+  } else {
+    pip_grp_logic(country         = country,
+                  year            = year,
+                  povline         = povline,
+                  group_by        = group_by,
+                  welfare_type    = welfare_type,
+                  reporting_level = reporting_level,
+                  lkup            = lkup,
+                  censor          = censor,
+                  lkup_hash       = lkup_hash,
+                  additional_ind  = additional_ind)
+  }
+
+  # Return
+  #-------------------------------------
+  out
+}
diff --git a/R/pip_grp.R b/R/pip_grp.R
index 8fc68331..91037580 100644
--- a/R/pip_grp.R
+++ b/R/pip_grp.R
@@ -50,7 +50,7 @@ pip_grp <- function(country         = "ALL",
     reporting_level <- "national"
   }
 
-  out <- fg_pip(
+  out <- fg_pip_old(
     country         = country,
     year            = year,
     povline         = povline,
@@ -208,7 +208,7 @@ pip_aggregate <- function(df, by = NULL, return_cols) {
 }
 
 #' Aggregate by predefined groups
-#' @param df data.frame: Response from `fg_pip()` or `rg_pip()`.
+#' @param df data.frame: Response from `fg_pip_old()` or `rg_pip()`.
 #' @param group_lkup data.frame: Group lkup table (pop_region)
 #' @param country character: Selected countries / regions
 #' @param return_cols list: lkup$return_cols$pip_grp object. Controls returned
@@ -308,7 +308,7 @@ compute_world_aggregates <- function(rgn, cols) {
 
 
 #' Filter relevant rows for aggregating by predefined groups
-#' @param df data.frame: Response from `fg_pip()`
+#' @param df data.frame: Response from `fg_pip_old()`
 #' @noRd
 filter_for_aggregate_by <- function(df) {
   # This algorithm is incorrect, but should mostly work as a first iteration
diff --git a/R/pip_grp_logic.R b/R/pip_grp_logic.R
index 9cc9bc34..5503b4c7 100644
--- a/R/pip_grp_logic.R
+++ b/R/pip_grp_logic.R
@@ -1,6 +1,6 @@
-#' Logic for computing new aggregate
+#' Old way to estimate aggregate data
+#' @rdname pip_agg
 #'
-#' @inheritParams pip
 #' @return data.table
 #' @examples
 #' \dontrun{
@@ -73,7 +73,7 @@ pip_grp_logic <- function(country         = "ALL",
     ## STEP 3.2: Compute fg_pip for ALL required countries ----
     ## This will then be re-used in various part of the function
     ## This is to avoid re-computing and re-loading the same data over and over
-    fg_pip_master <- fg_pip(
+    fg_pip_master <- fg_pip_old(
       country         = c(lcv$md_off_reg, lcv$user_off_reg),
       year            = year,
       povline         = povline,
diff --git a/R/pip_grp_new.R b/R/pip_grp_new.R
new file mode 100644
index 00000000..c4a2045e
--- /dev/null
+++ b/R/pip_grp_new.R
@@ -0,0 +1,14 @@
+#' New way to estimate Aggregate data
+#' @rdname pip_gg
+pip_grp_new <- \(country         = "ALL",
+                 year            = "ALL",
+                 povline         = 1.9,
+                 group_by        = c("wb", "none"),
+                 welfare_type    = c("all", "consumption", "income"),
+                 reporting_level = c("all", "national"),
+                 lkup,
+                 censor          = TRUE,
+                 lkup_hash       = lkup$cache_data_id$hash_pip_grp,
+                 additional_ind  = FALSE) {
+
+}
diff --git a/R/utils-pipdata.R b/R/utils-pipdata.R
index e3ee1f62..d430d49b 100644
--- a/R/utils-pipdata.R
+++ b/R/utils-pipdata.R
@@ -150,7 +150,8 @@ add_attributes_as_columns_vectorized <- function(dt) {
   ds <- attr(dt, "dist_stats")
 
 
-  # AC: I think this is not necessary at this stage. Can we merge at a later point?
+  # This block processes distribution statistics (mean, median) for each reporting level.
+  # If this is not required at this stage, consider removing it or deferring it to a later step.
   if (length(ds)) {
     dstats <- c("mean", "median")
     for (l in lev) {
diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R
index 626f4b6a..77c83c07 100644
--- a/inst/plumber/v1/endpoints.R
+++ b/inst/plumber/v1/endpoints.R
@@ -282,7 +282,7 @@ function(req, res) {
   params$format  <- NULL
   params$version <- NULL
 
-  out <- do.call(pipapi::pip_grp_logic, params)
+  out <- do.call(pipapi::pip_agg, params)
   out
 }
 
diff --git a/man/pip_grp_logic.Rd b/man/pip_agg.Rd
similarity index 70%
rename from man/pip_grp_logic.Rd
rename to man/pip_agg.Rd
index a7d861f1..9c197ecb 100644
--- a/man/pip_grp_logic.Rd
+++ b/man/pip_agg.Rd
@@ -1,9 +1,23 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/pip_grp_logic.R
-\name{pip_grp_logic}
+% Please edit documentation in R/pip_agg.R, R/pip_grp_logic.R
+\name{pip_agg}
+\alias{pip_agg}
 \alias{pip_grp_logic}
 \title{Logic for computing new aggregate}
 \usage{
+pip_agg(
+  country = "ALL",
+  year = "ALL",
+  povline = 1.9,
+  group_by = c("wb", "none"),
+  welfare_type = c("all", "consumption", "income"),
+  reporting_level = c("all", "national"),
+  lkup,
+  censor = TRUE,
+  lkup_hash = lkup$cache_data_id$hash_pip_grp,
+  additional_ind = FALSE
+)
+
 pip_grp_logic(
   country = "ALL",
   year = "ALL",
@@ -41,13 +55,20 @@ sub-groups}
 FALSE}
 }
 \value{
+data.table
+
 data.table
 }
 \description{
 Logic for computing new aggregate
+
+Old way to estimate aggregate data
 }
 \examples{
 \dontrun{
 # Create lkups
 }
+\dontrun{
+# Create lkups
+}
 }
diff --git a/man/pip_gg.Rd b/man/pip_gg.Rd
new file mode 100644
index 00000000..ce32bdd5
--- /dev/null
+++ b/man/pip_gg.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/pip_grp_new.R
+\name{pip_grp_new}
+\alias{pip_grp_new}
+\title{New way to estimate Aggregate data}
+\usage{
+pip_grp_new(
+  country = "ALL",
+  year = "ALL",
+  povline = 1.9,
+  group_by = c("wb", "none"),
+  welfare_type = c("all", "consumption", "income"),
+  reporting_level = c("all", "national"),
+  lkup,
+  censor = TRUE,
+  lkup_hash = lkup$cache_data_id$hash_pip_grp,
+  additional_ind = FALSE
+)
+}
+\description{
+New way to estimate Aggregate data
+}

From eb796c1e1048c5f0525d6548f03a0885ee122a2b Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Sat, 23 Aug 2025 12:53:37 -0400
Subject: [PATCH 071/203] small progress.

---
 R/pip_agg.R     |  2 --
 R/pip_grp_new.R | 15 +++++++++++++--
 man/pip_gg.Rd   |  2 --
 3 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/R/pip_agg.R b/R/pip_agg.R
index 5c4421dc..52b18bcd 100644
--- a/R/pip_agg.R
+++ b/R/pip_agg.R
@@ -28,12 +28,10 @@ pip_agg <- function(country         = "ALL",
     pip_grp_new(country         = country,
                 year            = year,
                 povline         = povline,
-                group_by        = group_by,
                 welfare_type    = welfare_type,
                 reporting_level = reporting_level,
                 lkup            = lkup,
                 censor          = censor,
-                lkup_hash       = lkup_hash,
                 additional_ind  = additional_ind)
   } else {
     pip_grp_logic(country         = country,
diff --git a/R/pip_grp_new.R b/R/pip_grp_new.R
index c4a2045e..8b76c1b7 100644
--- a/R/pip_grp_new.R
+++ b/R/pip_grp_new.R
@@ -3,12 +3,23 @@
 pip_grp_new <- \(country         = "ALL",
                  year            = "ALL",
                  povline         = 1.9,
-                 group_by        = c("wb", "none"),
                  welfare_type    = c("all", "consumption", "income"),
                  reporting_level = c("all", "national"),
                  lkup,
                  censor          = TRUE,
-                 lkup_hash       = lkup$cache_data_id$hash_pip_grp,
                  additional_ind  = FALSE) {
 
+  welfare_type    <- match.arg(welfare_type)
+  reporting_level <- match.arg(reporting_level)
+
+  # Custom aggregations only supported at the national level
+  # subgroups aggregations only supported for "all" countries
+  country <- toupper(country)
+  year    <- toupper(year)
+  reporting_level <- "all"
+  if (!all(country %in% c("ALL", lkup$query_controls$region$values))) {
+    country <- "ALL"
+  }
+
+
 }
diff --git a/man/pip_gg.Rd b/man/pip_gg.Rd
index ce32bdd5..285bcec0 100644
--- a/man/pip_gg.Rd
+++ b/man/pip_gg.Rd
@@ -8,12 +8,10 @@ pip_grp_new(
   country = "ALL",
   year = "ALL",
   povline = 1.9,
-  group_by = c("wb", "none"),
   welfare_type = c("all", "consumption", "income"),
   reporting_level = c("all", "national"),
   lkup,
   censor = TRUE,
-  lkup_hash = lkup$cache_data_id$hash_pip_grp,
   additional_ind = FALSE
 )
 }

From 1f6ea6539e9875548117eee8baa71015e7e2f46c Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Mon, 25 Aug 2025 12:08:49 -0400
Subject: [PATCH 072/203] init function to subset country codes

---
 R/pip_grp_new.R | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/R/pip_grp_new.R b/R/pip_grp_new.R
index 8b76c1b7..4b6e0a2b 100644
--- a/R/pip_grp_new.R
+++ b/R/pip_grp_new.R
@@ -23,3 +23,25 @@ pip_grp_new <- \(country         = "ALL",
 
 
 }
+
+
+
+get_country_code_subset <- function(dt, country) {
+  # Find all *_code columns except 'country_code'
+  code_cols <- setdiff(grep("_code$", names(dt), value = TRUE), "country_code")
+  
+  # Initialize result vector
+  result <- character(0)
+  
+  # For each code column, check for matches and collect country_code
+  for (col in code_cols) {
+    # Find rows where the code column matches any value in country
+    idx <- dt[[col]] %in% country
+    if (any(idx, na.rm = TRUE)) {
+      result <- c(result, dt[idx, country_code])
+    }
+  }
+  
+  # Return unique country_code values
+  funique(result)
+}
\ No newline at end of file

From 0ed1bf417cfdc3c6a4f59e1d0438fd0d86132734 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Mon, 25 Aug 2025 12:34:29 -0400
Subject: [PATCH 073/203] document new get_country_code_subset function

---
 R/pip_grp_new.R                | 34 +++++++++++++++++++++++++++++-----
 man/get_country_code_subset.Rd | 28 ++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+), 5 deletions(-)
 create mode 100644 man/get_country_code_subset.Rd

diff --git a/R/pip_grp_new.R b/R/pip_grp_new.R
index 4b6e0a2b..8baff230 100644
--- a/R/pip_grp_new.R
+++ b/R/pip_grp_new.R
@@ -21,18 +21,42 @@ pip_grp_new <- \(country         = "ALL",
     country <- "ALL"
   }
 
+  # Select countries to estimate poverty
+  cts <- copy(lkup$aux_files$country_list)
+  country_code  <-  if (country != "ALL") {
+    get_country_code_subset(dt = cts, country = country)
+  } else {
+    "ALL"
+  }
 
-}
 
 
+}
+
+#' Subset country_code values based on matches in *_code columns
+#'
+#' This function searches all columns in a data.table ending with '_code' (except 'country_code')
+#' and returns a unique character vector of 'country_code' values for rows where any of those
+#' columns match a value in the provided 'country' vector. The input data.table 'dt' should be
+#' 'lkup$aux_files$country_list', which contains country and region codes for subsetting.
+#'
+#' @param dt A data.table, typically lkup$aux_files$country_list, containing country_code and other *_code columns.
+#' @param country Character vector of country or region codes to match against *_code columns.
+#'
+#' @return A unique character vector of country_code values corresponding to matches in any *_code column.
+#' @examples
+#' \dontrun{
+#' dt <- lkup$aux_files$country_list
+#' get_country_code_subset(dt, c("USA", "EAP"))
+#' }
 get_country_code_subset <- function(dt, country) {
   # Find all *_code columns except 'country_code'
   code_cols <- setdiff(grep("_code$", names(dt), value = TRUE), "country_code")
-  
+
   # Initialize result vector
   result <- character(0)
-  
+
   # For each code column, check for matches and collect country_code
   for (col in code_cols) {
     # Find rows where the code column matches any value in country
@@ -41,7 +65,7 @@ get_country_code_subset <- function(dt, country) {
       result <- c(result, dt[idx, country_code])
     }
   }
-  
+
   # Return unique country_code values
   funique(result)
-}
\ No newline at end of file
+}
diff --git a/man/get_country_code_subset.Rd b/man/get_country_code_subset.Rd
new file mode 100644
index 00000000..60d3be69
--- /dev/null
+++ b/man/get_country_code_subset.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/pip_grp_new.R
+\name{get_country_code_subset}
+\alias{get_country_code_subset}
+\title{Subset country_code values based on matches in *_code columns}
+\usage{
+get_country_code_subset(dt, country)
+}
+\arguments{
+\item{dt}{A data.table, typically lkup$aux_files$country_list, containing country_code and other *_code columns.}
+
+\item{country}{Character vector of country or region codes to match against *_code columns.}
+}
+\value{
+A unique character vector of country_code values corresponding to matches in any *_code column.
+}
+\description{
+This function searches all columns in a data.table ending with '_code' (except 'country_code')
+and returns a unique character vector of 'country_code' values for rows where any of those
+columns match a value in the provided 'country' vector. The input data.table 'dt' should be
+'lkup$aux_files$country_list', which contains country and region codes for subsetting.
+}
+\examples{
+\dontrun{
+dt <- lkup$aux_files$country_list
+get_country_code_subset(dt, c("USA", "EAP"))
+}
+}

From 72de2ca11e2401522db05bb4930d1b630c120d0f Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Mon, 25 Aug 2025 12:55:55 -0400
Subject: [PATCH 074/203] add list_code_column_values

---
 R/pip_grp_new.R | 61 +++++++++++++++++++++++++++++++++++++------------
 1 file changed, 46 insertions(+), 15 deletions(-)

diff --git a/R/pip_grp_new.R b/R/pip_grp_new.R
index 8baff230..cafe746c 100644
--- a/R/pip_grp_new.R
+++ b/R/pip_grp_new.R
@@ -34,38 +34,69 @@ pip_grp_new <- \(country         = "ALL",
 
 }
 
-#' Subset country_code values based on matches in *_code columns
+#' Subset country_code values based on matches in *_code columns and country_code
 #'
-#' This function searches all columns in a data.table ending with '_code' (except 'country_code')
-#' and returns a unique character vector of 'country_code' values for rows where any of those
-#' columns match a value in the provided 'country' vector. The input data.table 'dt' should be
-#' 'lkup$aux_files$country_list', which contains country and region codes for subsetting.
+#' This function searches all columns in a data.table ending with '_code' (except 'country_code'),
+#' as well as 'country_code' itself, and returns a unique character vector of 'country_code' values
+#' for rows where any of those columns match a value in the provided 'country' vector. If any value
+#' in 'country' is not found in any *_code column or in 'country_code', an error is thrown. The input
+#' data.table 'dt' should be 'lkup$aux_files$country_list', which contains country and region codes for subsetting.
 #'
 #' @param dt A data.table, typically lkup$aux_files$country_list, containing country_code and other *_code columns.
-#' @param country Character vector of country or region codes to match against *_code columns.
+#' @param country Character vector of country or region codes to match against *_code columns and country_code.
 #'
-#' @return A unique character vector of country_code values corresponding to matches in any *_code column.
+#' @return A unique character vector of country_code values corresponding to matches in any *_code column or country_code.
 #' @examples
 #' \dontrun{
 #' dt <- lkup$aux_files$country_list
 #' get_country_code_subset(dt, c("USA", "EAP"))
 #' }
 get_country_code_subset <- function(dt, country) {
-  # Find all *_code columns except 'country_code'
-  code_cols <- setdiff(grep("_code$", names(dt), value = TRUE), "country_code")
-
-  # Initialize result vector
+  code_cols <- grep("_code$", names(dt), value = TRUE)
   result <- character(0)
+  matched <- logical(length(country))
 
-  # For each code column, check for matches and collect country_code
   for (col in code_cols) {
-    # Find rows where the code column matches any value in country
     idx <- dt[[col]] %in% country
     if (any(idx, na.rm = TRUE)) {
       result <- c(result, dt[idx, country_code])
+      matched <- matched | country %in% dt[[col]]
     }
   }
-
-  # Return unique country_code values
+  # Also check country_code itself
+  idx_cc <- dt$country_code %in% country
+  if (any(idx_cc, na.rm = TRUE)) {
+    result <- c(result, dt[idx_cc, country_code])
+    matched <- matched | country %in% dt$country_code
+  }
+  # Error if any country not matched
+  if (any(!matched)) {
+    cli::cli_abort(
+      "The following values in {.arg country} were not found in any *_code column or country_code:
+      {country[!matched]}")
+  }
   funique(result)
 }
+
+#' List values in each *_code column that match the country vector
+#'
+#' Returns a named list where each element is the vector of unique values in each *_code column
+#' that are present in the provided 'country' vector.
+#'
+#' @param dt A data.table, typically lkup$aux_files$country_list.
+#' @param country Character vector of country or region codes to match against *_code columns.
+#'
+#' @return A named list of unique values for each *_code column that match 'country'.
+#' @examples
+#' \dontrun{
+#' dt <- lkup$aux_files$country_list
+#' list_code_column_values(dt, c("USA", "EAP"))
+#' }
+list_code_column_values <- function(dt, country) {
+  code_cols <- grep("_code$", names(dt), value = TRUE)
+  lapply(code_cols, \(col) {
+    dt[get(col) %in% country, ..col] |>
+      funique()
+  }) |>
+    setNames(code_cols)
+}

From e9bfeb39926404b454facb39807a3642ce43c3c3 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Mon, 25 Aug 2025 12:56:11 -0400
Subject: [PATCH 075/203] document

---
 man/get_country_code_subset.Rd | 15 ++++++++-------
 man/list_code_column_values.Rd | 26 ++++++++++++++++++++++++++
 2 files changed, 34 insertions(+), 7 deletions(-)
 create mode 100644 man/list_code_column_values.Rd

diff --git a/man/get_country_code_subset.Rd b/man/get_country_code_subset.Rd
index 60d3be69..a1acd4dc 100644
--- a/man/get_country_code_subset.Rd
+++ b/man/get_country_code_subset.Rd
@@ -2,23 +2,24 @@
 % Please edit documentation in R/pip_grp_new.R
 \name{get_country_code_subset}
 \alias{get_country_code_subset}
-\title{Subset country_code values based on matches in *_code columns}
+\title{Subset country_code values based on matches in *_code columns and country_code}
 \usage{
 get_country_code_subset(dt, country)
 }
 \arguments{
 \item{dt}{A data.table, typically lkup$aux_files$country_list, containing country_code and other *_code columns.}
 
-\item{country}{Character vector of country or region codes to match against *_code columns.}
+\item{country}{Character vector of country or region codes to match against *_code columns and country_code.}
 }
 \value{
-A unique character vector of country_code values corresponding to matches in any *_code column.
+A unique character vector of country_code values corresponding to matches in any *_code column or country_code.
 }
 \description{
-This function searches all columns in a data.table ending with '_code' (except 'country_code')
-and returns a unique character vector of 'country_code' values for rows where any of those
-columns match a value in the provided 'country' vector. The input data.table 'dt' should be
-'lkup$aux_files$country_list', which contains country and region codes for subsetting.
+This function searches all columns in a data.table ending with '_code' (except 'country_code'),
+as well as 'country_code' itself, and returns a unique character vector of 'country_code' values
+for rows where any of those columns match a value in the provided 'country' vector. If any value
+in 'country' is not found in any *_code column or in 'country_code', an error is thrown. The input
+data.table 'dt' should be 'lkup$aux_files$country_list', which contains country and region codes for subsetting.
 }
 \examples{
 \dontrun{
diff --git a/man/list_code_column_values.Rd b/man/list_code_column_values.Rd
new file mode 100644
index 00000000..2b285c88
--- /dev/null
+++ b/man/list_code_column_values.Rd
@@ -0,0 +1,26 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/pip_grp_new.R
+\name{list_code_column_values}
+\alias{list_code_column_values}
+\title{List values in each *_code column that match the country vector}
+\usage{
+list_code_column_values(dt, country)
+}
+\arguments{
+\item{dt}{A data.table, typically lkup$aux_files$country_list.}
+
+\item{country}{Character vector of country or region codes to match against *_code columns.}
+}
+\value{
+A named list of unique values for each *_code column that match 'country'.
+}
+\description{
+Returns a named list where each element is the vector of unique values in each *_code column
+that are present in the provided 'country' vector.
+}
+\examples{
+\dontrun{
+dt <- lkup$aux_files$country_list
+list_code_column_values(dt, c("USA", "EAP"))
+}
+}

From 55feea82c7b7d203877fb00889fec9204c15474d Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Mon, 25 Aug 2025 16:21:34 -0400
Subject: [PATCH 076/203] add pip_agg instead of pip_grp_logic

---
 R/pip_grp.R               |  2 +-
 R/pip_grp_new.R           | 46 +++++++++++++++++++++
 R/pip_new_lineups.R       | 85 ++++++++++++++++++++++-----------------
 R/ui_poverty_indicators.R |  2 +-
 R/zzz.R                   |  2 +
 man/pip_aggregate_by.Rd   | 22 ++++++++++
 6 files changed, 119 insertions(+), 40 deletions(-)
 create mode 100644 man/pip_aggregate_by.Rd

diff --git a/R/pip_grp.R b/R/pip_grp.R
index 91037580..b8b9abdc 100644
--- a/R/pip_grp.R
+++ b/R/pip_grp.R
@@ -213,7 +213,7 @@ pip_aggregate <- function(df, by = NULL, return_cols) {
 #' @param country character: Selected countries / regions
 #' @param return_cols list: lkup$return_cols$pip_grp object. Controls returned
 #' columns
-#' @noRd
+#' @keywords internal
 pip_aggregate_by <- function(df,
                              group_lkup,
                              country = "ALL",
diff --git a/R/pip_grp_new.R b/R/pip_grp_new.R
index cafe746c..1dcf557a 100644
--- a/R/pip_grp_new.R
+++ b/R/pip_grp_new.R
@@ -30,6 +30,52 @@ pip_grp_new <- \(country         = "ALL",
   }
 
 
+  out <- fg_pip(
+    country         = country_code,
+    year            = year,
+    povline         = povline,
+    popshare        = NULL,
+    welfare_type    = welfare_type,
+    reporting_level = reporting_level,
+    ppp             = NULL,
+    lkup           = lkup)
+
+  out <- treat_cache_and_main(out)
+
+  # return empty dataframe if no metadata is found
+  if (nrow(out) == 0) {
+    return(pipapi::empty_response_grp)
+  }
+
+  # Handles aggregated distributions (like CHN and IND)
+  if (tolower(reporting_level) %in% c("national", "all")) {
+    out <- add_agg_stats(out,
+                         return_cols = lkup$return_cols$ag_average_poverty_stats)
+  }
+
+  add_vars_out_of_pipeline(out, fill_gaps = TRUE, lkup = lkup)
+
+  # Handle potential (insignificant) difference in poverty_line values that
+  # may mess-up the grouping
+  # I don't think we need this out$poverty_line already has the correct values additionally,
+  # since povline is vectorized the below line does not work as expected
+  #out$poverty_line <- povline
+
+  # Handle aggregations with sub-groups
+
+    out <- pip_aggregate_by(
+      df = out,
+      group_lkup = lkup[["pop_region"]],
+      country = country,
+      return_cols = lkup$return_cols$pip_grp
+    )
+
+    out <- estimate_type_var(out,lkup)
+
+    # Censor regional values
+    if (censor) {
+      out <- censor_rows(out, lkup[["censored"]], type = "regions")
+    }
 
 
 }
diff --git a/R/pip_new_lineups.R b/R/pip_new_lineups.R
index 5b255ec6..04d70acb 100644
--- a/R/pip_new_lineups.R
+++ b/R/pip_new_lineups.R
@@ -141,45 +141,8 @@ pip_new_lineups <- function(country         = "ALL",
 
   # Cache new data
   #---------------------------------------------
-  cached_data <- if (is.null(out$data_in_cache)) {
-    NULL
-  } else if (is.data.frame(out$data_in_cache)) {
-    if (fnrow(out$data_in_cache) == 0) {
-      NULL
-    } else {
-      qDT(out$data_in_cache)
-    }
-  } else {
-    cli::cli_abort("{.code out$data_in_cache} must be NULL or data.frame not {.field {class(out$data_in_cache)}}")
-  }
-
-  main_data   <- qDT(out$main_data)
-
-  if (nrow(main_data) > 0) {
-
-    if (is.null(out$data_in_cache)) {
-
-      out <- main_data
-    } else {
+  out <- treat_cache_and_main(out)
 
-      if (fill_gaps) {
-
-        cached_data <- fg_remove_duplicates(cached_data,
-                                            use_new_lineup_version = lkup$use_new_lineup_version)
-      }
-
-      out <- main_data |>
-        rowbind(cached_data)
-    }
-
-    update_master_file(main_data, cache_file_path, fill_gaps)
-
-  } else {
-    out <- cached_data
-  }
-  if (!is.data.table(out)) {
-    setDT(out)
-  }
   # Early return for empty table---------------
   if (nrow(out) == 0) return(pipapi::empty_response)
 
@@ -310,3 +273,49 @@ pip_new_lineups <- function(country         = "ALL",
   # return -------------
   return(out)
 }
+
+
+
+
+
+treat_cache_and_main <- \(out) {
+  cached_data <- if (is.null(out$data_in_cache)) {
+    NULL
+  } else if (is.data.frame(out$data_in_cache)) {
+    if (fnrow(out$data_in_cache) == 0) {
+      NULL
+    } else {
+      qDT(out$data_in_cache)
+    }
+  } else {
+    cli::cli_abort("{.code out$data_in_cache} must be NULL or data.frame not {.field {class(out$data_in_cache)}}")
+  }
+
+  main_data   <- qDT(out$main_data)
+
+  if (nrow(main_data) > 0) {
+
+    if (is.null(out$data_in_cache)) {
+
+      out <- main_data
+    } else {
+
+      if (fill_gaps) {
+
+        cached_data <- fg_remove_duplicates(cached_data,
+                                            use_new_lineup_version = lkup$use_new_lineup_version)
+      }
+
+      out <- main_data |>
+        rowbind(cached_data)
+    }
+
+    update_master_file(main_data, cache_file_path, fill_gaps)
+
+  } else {
+    out <- cached_data
+  }
+
+
+  setDT(out)
+}
diff --git a/R/ui_poverty_indicators.R b/R/ui_poverty_indicators.R
index 2588da6b..f2758a13 100644
--- a/R/ui_poverty_indicators.R
+++ b/R/ui_poverty_indicators.R
@@ -75,7 +75,7 @@ ui_pc_regional <- function(country   = "ALL",
     year <- toupper(year)
   }
 
-  out <- pip_grp_logic(country         = country,
+  out <- pip_agg(country         = country,
                        year            = year,
                        group_by        = "wb",
                        reporting_level = "national",
diff --git a/R/zzz.R b/R/zzz.R
index e534cced..576b58a4 100644
--- a/R/zzz.R
+++ b/R/zzz.R
@@ -19,6 +19,8 @@ pipapi_default_options <- list(
 
     pip <<- memoise::memoise(pip, cache = cd, omit_args = "lkup")
     ui_hp_stacked <<- memoise::memoise(ui_hp_stacked, cache = cd, omit_args = "lkup")
+    pip_agg <<- memoise::memoise(pip_agg, cache = cd, omit_args = "lkup")
+    pip_grp_new <<- memoise::memoise(pip_grp_new, cache = cd, omit_args = "lkup")
     pip_grp_logic <<- memoise::memoise(pip_grp_logic, cache = cd, omit_args = "lkup")
     pip_grp <<- memoise::memoise(pip_grp, cache = cd, omit_args = "lkup")
     ui_cp_charts <<- memoise::memoise(ui_cp_charts, cache = cd, omit_args = "lkup")
diff --git a/man/pip_aggregate_by.Rd b/man/pip_aggregate_by.Rd
new file mode 100644
index 00000000..98bf18f7
--- /dev/null
+++ b/man/pip_aggregate_by.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/pip_grp.R
+\name{pip_aggregate_by}
+\alias{pip_aggregate_by}
+\title{Aggregate by predefined groups}
+\usage{
+pip_aggregate_by(df, group_lkup, country = "ALL", return_cols)
+}
+\arguments{
+\item{df}{data.frame: Response from \code{fg_pip_old()} or \code{rg_pip()}.}
+
+\item{group_lkup}{data.frame: Group lkup table (pop_region)}
+
+\item{country}{character: Selected countries / regions}
+
+\item{return_cols}{list: lkup$return_cols$pip_grp object. Controls returned
+columns}
+}
+\description{
+Aggregate by predefined groups
+}
+\keyword{internal}

From 8ef8492e79516d7efbef1b9fc389954a4a2c4223 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Mon, 25 Aug 2025 16:32:43 -0400
Subject: [PATCH 077/203] format

---
 R/zzz.R | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/R/zzz.R b/R/zzz.R
index 576b58a4..9bb99af6 100644
--- a/R/zzz.R
+++ b/R/zzz.R
@@ -17,13 +17,13 @@ pipapi_default_options <- list(
                              max_size = as.numeric(Sys.getenv("PIPAPI_CACHE_MAX_SIZE")),
                              prune_rate = 50)
 
-    pip <<- memoise::memoise(pip, cache = cd, omit_args = "lkup")
-    ui_hp_stacked <<- memoise::memoise(ui_hp_stacked, cache = cd, omit_args = "lkup")
-    pip_agg <<- memoise::memoise(pip_agg, cache = cd, omit_args = "lkup")
-    pip_grp_new <<- memoise::memoise(pip_grp_new, cache = cd, omit_args = "lkup")
-    pip_grp_logic <<- memoise::memoise(pip_grp_logic, cache = cd, omit_args = "lkup")
-    pip_grp <<- memoise::memoise(pip_grp, cache = cd, omit_args = "lkup")
-    ui_cp_charts <<- memoise::memoise(ui_cp_charts, cache = cd, omit_args = "lkup")
+    pip            <<- memoise::memoise(pip, cache = cd, omit_args = "lkup")
+    ui_hp_stacked  <<- memoise::memoise(ui_hp_stacked, cache = cd, omit_args = "lkup")
+    pip_agg        <<- memoise::memoise(pip_agg, cache = cd, omit_args = "lkup")
+    pip_grp_new    <<- memoise::memoise(pip_grp_new, cache = cd, omit_args = "lkup")
+    pip_grp_logic  <<- memoise::memoise(pip_grp_logic, cache = cd, omit_args = "lkup")
+    pip_grp        <<- memoise::memoise(pip_grp, cache = cd, omit_args = "lkup")
+    ui_cp_charts   <<- memoise::memoise(ui_cp_charts, cache = cd, omit_args = "lkup")
     ui_cp_download <<- memoise::memoise(ui_cp_download, cache = cd, omit_args = "lkup")
     ui_cp_key_indicators <<- memoise::memoise(ui_cp_key_indicators, cache = cd, omit_args = "lkup")
     pos = 1L

From 61170c4346f80235a1169066e1158de9e2203590 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Tue, 26 Aug 2025 10:30:36 -0400
Subject: [PATCH 078/203] add pipapienv

---
 R/aaa.R             | 1 +
 R/compute_fgt_new.R | 4 +++-
 R/pip_new_lineups.R | 4 ++--
 R/utils-pipdata.R   | 3 ++-
 4 files changed, 8 insertions(+), 4 deletions(-)
 create mode 100644 R/aaa.R

diff --git a/R/aaa.R b/R/aaa.R
new file mode 100644
index 00000000..cd685993
--- /dev/null
+++ b/R/aaa.R
@@ -0,0 +1 @@
+.pipapienv <-  new.env(parent = emptyenv())
diff --git a/R/compute_fgt_new.R b/R/compute_fgt_new.R
index e78d8296..9e9de70b 100644
--- a/R/compute_fgt_new.R
+++ b/R/compute_fgt_new.R
@@ -99,7 +99,9 @@ compute_fgt <- function(w, wt, povlines) {
   # logw <- log(w)
   # logw <- copyv(log(w), pos, NA_real_, invert = TRUE) |>
   #   suppressWarnings()
-  logw <- fifelse(w > 0, log(w), NA_real_)
+  # logw <- fifelse(w > 0, log(w), NA_real_)
+  logw <- log(w) |>
+    suppressWarnings()
 
   for (i in seq_along(povlines)) {
     pov <- povlines[i]
diff --git a/R/pip_new_lineups.R b/R/pip_new_lineups.R
index 04d70acb..33efc76f 100644
--- a/R/pip_new_lineups.R
+++ b/R/pip_new_lineups.R
@@ -141,7 +141,7 @@ pip_new_lineups <- function(country         = "ALL",
 
   # Cache new data
   #---------------------------------------------
-  out <- treat_cache_and_main(out)
+  out <- treat_cache_and_main(out, cache_file_path)
 
   # Early return for empty table---------------
   if (nrow(out) == 0) return(pipapi::empty_response)
@@ -278,7 +278,7 @@ pip_new_lineups <- function(country         = "ALL",
 
 
-treat_cache_and_main <- \(out) {
+treat_cache_and_main <- \(out, cache_file_path) {
   cached_data <- if (is.null(out$data_in_cache)) {
     NULL
   } else if (is.data.frame(out$data_in_cache)) {
diff --git a/R/utils-pipdata.R b/R/utils-pipdata.R
index d430d49b..597663d6 100644
--- a/R/utils-pipdata.R
+++ b/R/utils-pipdata.R
@@ -13,7 +13,8 @@ load_list_refy <- \(input_list, path){
   })
 
   dl <- lapply(inames, \(x) {
-    qs::qread(file = fs::path(path, x, ext = "qs"))
+    qs::qread(file = fs::path(path, x, ext = "qs"),
+              nthreads = 4)
     }) |>
     setNames(inames)
 

From e4f08835e4cc094b8da357da0a323dfadcdb44d5 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Tue, 26 Aug 2025 10:32:20 -0400
Subject: [PATCH 079/203] add functions to manage pipapienv

---
 NAMESPACE                 |  3 +++
 R/pipapi-env.R            | 42 +++++++++++++++++++++++++++++++++++++++
 man/get_from_pipapienv.Rd | 21 ++++++++++++++++++++
 man/get_pipapienv.Rd      | 17 ++++++++++++++++
 man/set_in_pipapienv.Rd   | 22 ++++++++++++++++++++
 5 files changed, 105 insertions(+)
 create mode 100644 R/pipapi-env.R
 create mode 100644 man/get_from_pipapienv.Rd
 create mode 100644 man/get_pipapienv.Rd
 create mode 100644 man/set_in_pipapienv.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 133f86b1..96b78000 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -16,11 +16,13 @@ export(get_aux_table)
 export(get_aux_table_ui)
 export(get_caller_names)
 export(get_ctr_alt_agg)
+export(get_from_pipapienv)
 export(get_grp_to_compute)
 export(get_impl_ctrs)
 export(get_md_vars)
 export(get_param_values)
 export(get_pip_version)
+export(get_pipapienv)
 export(get_user_alt_gt)
 export(get_user_x_code)
 export(get_valid_aux_long_format_tables)
@@ -40,6 +42,7 @@ export(return_if_exists)
 export(select_off_alt_agg)
 export(select_reporting_level)
 export(select_user_aggs)
+export(set_in_pipapienv)
 export(start_api)
 export(ui_cp_charts)
 export(ui_cp_download)
diff --git a/R/pipapi-env.R b/R/pipapi-env.R
new file mode 100644
index 00000000..630b277f
--- /dev/null
+++ b/R/pipapi-env.R
@@ -0,0 +1,42 @@
+# Getter function: Returns the entire .pipapienv environment
+#' Get the entire .pipapienv environment
+#'
+#' @return The .pipapienv environment
+#' @export
+#'
+#' @examples
+#' env <- get_pipapienv()
+get_pipapienv <- function() {
+  .pipapienv
+}
+
+# Getter for a specific key from .pipapienv
+#' Get a value from .pipapienv
+#'
+#' @param key A character string representing the key
+#'
+#' @return The value associated with the key in .pipapienv
+#' @export
+#'
+#' @examples
+#' set_in_pipapienv("example_key", 42)
+#' get_from_pipapienv("example_key") # returns 42
+get_from_pipapienv <- function(key) {
+  rlang::env_get(.pipapienv, key, default = NULL) # Returns NULL if key doesn't exist
+}
+
+# Setter function: Assign a value in .pipapienv
+#' Set a value in .pipapienv
+#'
+#' @param key A character string representing the key
+#' @param value The value to store in .pipapienv
+#'
+#' @return The assigned value (invisibly)
+#' @export
+#'
+#' @examples
+#' set_in_pipapienv("example_key", 42)
+set_in_pipapienv <- function(key, value) {
+  rlang::env_poke(.pipapienv, key, value)
+  invisible(value)  # Return value invisibly to avoid clutter in console
+}
diff --git a/man/get_from_pipapienv.Rd b/man/get_from_pipapienv.Rd
new file mode 100644
index 00000000..2876f683
--- /dev/null
+++ b/man/get_from_pipapienv.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/pipapi-env.R
+\name{get_from_pipapienv}
+\alias{get_from_pipapienv}
+\title{Get a value from .pipapienv}
+\usage{
+get_from_pipapienv(key)
+}
+\arguments{
+\item{key}{A character string representing the key}
+}
+\value{
+The value associated with the key in .pipapienv
+}
+\description{
+Get a value from .pipapienv
+}
+\examples{
+set_in_pipapienv("example_key", 42)
+get_from_pipapienv("example_key") # returns 42
+}
diff --git a/man/get_pipapienv.Rd b/man/get_pipapienv.Rd
new file mode 100644
index 00000000..933e7044
--- /dev/null
+++ b/man/get_pipapienv.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/pipapi-env.R
+\name{get_pipapienv}
+\alias{get_pipapienv}
+\title{Get the entire .pipapienv environment}
+\usage{
+get_pipapienv()
+}
+\value{
+The .pipapienv environment
+}
+\description{
+Get the entire .pipapienv environment
+}
+\examples{
+env <- get_pipapienv()
+}
diff --git a/man/set_in_pipapienv.Rd b/man/set_in_pipapienv.Rd
new file mode 100644
index 00000000..127287cb
--- /dev/null
+++ b/man/set_in_pipapienv.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/pipapi-env.R
+\name{set_in_pipapienv}
+\alias{set_in_pipapienv}
+\title{Set a value in .pipapienv}
+\usage{
+set_in_pipapienv(key, value)
+}
+\arguments{
+\item{key}{A character string representing the key}
+
+\item{value}{The value to store in .pipapienv}
+}
+\value{
+The assigned value (invisibly)
+}
+\description{
+Set a value in .pipapienv
+}
+\examples{
+set_in_pipapienv("example_key", 42)
+}

From 01accd633d8ae7889e94ecdeb377f5e2837d9a76 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Tue, 26 Aug 2025 10:36:27 -0400
Subject: [PATCH 080/203] implement loading of qs using multiple cores

---
 DESCRIPTION       | 3 ++-
 R/utils-pipdata.R | 4 +++-
 R/zzz.R           | 8 ++++++++
 3 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index fa2ae7c1..0f625a0e 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -70,7 +70,8 @@ Imports:
     DBI,
     duckdb,
     jsonlite,
-    digest
+    digest,
+    parallel
 Remotes:
     PIP-Technical-Team/wbpip@DEV
 Depends: 
diff --git a/R/utils-pipdata.R b/R/utils-pipdata.R
index 597663d6..33ce4df6 100644
--- a/R/utils-pipdata.R
+++ b/R/utils-pipdata.R
@@ -12,9 +12,11 @@ load_list_refy <- \(input_list, path){
     paste0(x$country_code, "_", x$year)
   })
 
+  cores <- get_from_pipapienv("cores_to_use")
+
   dl <- lapply(inames, \(x) {
     qs::qread(file = fs::path(path, x, ext = "qs"),
-              nthreads = 4)
+              nthreads = cores)
     }) |>
     setNames(inames)
 
diff --git a/R/zzz.R b/R/zzz.R
index 9bb99af6..d3c42be7 100644
--- a/R/zzz.R
+++ b/R/zzz.R
@@ -35,6 +35,14 @@ pipapi_default_options <- list(
   toset <- !(names(pipapi_default_options) %in% names(op))
   if (any(toset)) options(pipapi_default_options[toset])
 
+
+  # set multi threats
+  available_cores <- parallel::detectCores() - 1
+
+  cores_to_use <- max(available_cores, 1) |>
+    min(8)
+  set_in_pipapienv("cores_to_use", cores_to_use)
+
   invisible()
 
 }

From 22679d1025d3cf065aac457b8ea09dcc2183d93a Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Tue, 26 Aug 2025 14:52:42 -0400
Subject: [PATCH 081/203] updates on pov_from_DT

---
 R/compute_fgt_new.R | 107 +++++++++++++++++++++++++++++++++++++++++++-
 R/fg_pip.R          |   6 +++
 R/pip_grp_new.R     |   7 +++
 man/DT_fgt_by_rl.Rd |  22 ---------
 man/map_fgt.Rd      |  21 +++++++--
 5 files changed, 136 insertions(+), 27 deletions(-)
 delete mode 100644 man/DT_fgt_by_rl.Rd

diff --git a/R/compute_fgt_new.R b/R/compute_fgt_new.R
index 9e9de70b..14cd897b 100644
--- a/R/compute_fgt_new.R
+++ b/R/compute_fgt_new.R
@@ -103,6 +103,8 @@ compute_fgt <- function(w, wt, povlines) {
   logw <- log(w) |>
     suppressWarnings()
 
+  tot_pop <- fsum(wt)
+
   for (i in seq_along(povlines)) {
     pov <- povlines[i]
     poor <- w < pov
@@ -116,7 +118,7 @@ compute_fgt <- function(w, wt, povlines) {
     # Optimized Watts index calculation
     keep <- poor & pos
     if (any(keep, na.rm = TRUE)) {
-      watts_vec[i] <- (fsum((log(pov) - logw[keep]) * wt[keep])) / fsum(wt)
+      watts_vec[i] <- (fsum((log(pov) - logw[keep]) * wt[keep])) / tot_pop
     } else {
       watts_vec[i] <- 0
     }
@@ -140,7 +142,7 @@ compute_fgt <- function(w, wt, povlines) {
 #' @param y list of indices for each reporting level
 #' @param nx name of data table. Usuall country code and year in the form "CCC_YYYY"
 #'
-#' @return data.table with FGT estimates by reporting level
+#' @rdname map_fgt
 #' @keywords  internal
 DT_fgt_by_rl <- \(x, y, nx) {
   DT_fgt <- lapply(names(y), \(rl) {
@@ -161,6 +163,36 @@ DT_fgt_by_rl <- \(x, y, nx) {
   )]
 }
 
+
+
+#' jkoin reporting level and lt list into one data.table
+#'
+#' @rdname map_fgt
+lt_to_dt <- \(x, y, nx) {
+  DT <- lapply(names(y), \(rl) {
+
+    idx <- y[[rl]]
+    x[idx, reporting_level := rl]
+
+  }) |>
+    rbindlist(fill = TRUE)
+
+
+  DT[, `:=`(
+    country_code   = gsub("([^_]+)(_.+)", "\\1", nx),
+    reporting_year = gsub("(.+_)([^_]+)", "\\2", nx)
+  )]
+}
+
+#' Map lt_to_dt
+#'
+#' @rdname map_fgt
+map_lt_to_dt <- \(lt, l_rl_rows) {
+  Map(lt_to_dt, lt, l_rl_rows, names(lt)) |>
+    rbindlist(fill = TRUE)
+}
+
+
 #' map over list of data.tables and indices to compute FGT by reporting_level
 #'
 #' @param lt list of data.tables with welfare and weight data
@@ -233,3 +265,74 @@ load_data_list <- \(metadata) {
 
 }
 
+
+
+pov_from_DT <- function(DT, povline, g) {
+  w       <- DT$welfare
+  wt      <- DT$weight
+  n_pov   <- length(povline)
+
+  ng      <- (g)
+  grp_ids <- (g)
+
+  # Precompute log(w) for efficiency
+  pos <- w > 0
+  logw <- fifelse(pos, log(w), NA_real_)
+
+  # Prepare result lists
+  fgt0 <- vector("list", n_pov)
+  fgt1 <- vector("list", n_pov)
+  fgt2 <- vector("list", n_pov)
+  watts <- vector("list", n_pov)
+
+  for (i in seq_along(povline)) {
+    pov <- povline[i]
+    poor <- w < pov
+    rel_dist <- fifelse(poor, 1 - w/pov, 0)
+    keep <- poor & pos
+    watts_val <- fmean((log(pov) - logw) * keep, g = g, w = wt)
+    fgt0[[i]] <- fmean(poor, g = g, w = wt)
+    fgt1[[i]] <- fmean(rel_dist, g = g, w = wt)
+    fgt2[[i]] <- fmean(rel_dist^2, g = g, w = wt)
+    watts[[i]] <- watts_val
+  }
+
+  data.table(
+    povline = rep(povline, each = ng),
+    group_id = rep(grp_ids, times = n_pov),
+    fgt0 = unlist(fgt0),
+    fgt1 = unlist(fgt1),
+    fgt2 = unlist(fgt2),
+    watts = unlist(watts)
+  )
+}
+
+
+
+
+
+# pov_from_DT2 <- function(DT, povline, g) {
+#   fgt0 <- numeric(length(povline))
+#   fgt1 <- numeric(length(povline))
+#   fgt2 <- numeric(length(povline))
+#   w <- DT$welfare
+#   wt <- DT$weight
+#
+#
+#   for (i in seq_along(povline)) {
+#     pov <- povline[i]
+#     poor <- w < pov
+#     rel_dist <- fifelse(poor, 1 - w/pov, 0)
+#     fgt0[i] <- fmean(poor, g = g, w = wt)
+#     fgt1[i] <- fmean(rel_dist, g = g, w = wt)
+#     fgt2[i] <- fmean(rel_dist^2, g = g, w = wt)
+#   }
+#
+#   list(fgt0 = fgt0, fgt1 = fgt1, fgt2 = fgt2)
+# }
+
+
+
+
+
+
diff --git a/R/fg_pip.R b/R/fg_pip.R
index 07f0e9df..5cab3696 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -78,6 +78,12 @@ fg_pip <- function(country,
   l_rl_rows <- get_rl_rows(lt_att)
 
 
+  DT <- map_lt_to_dt(lt, l_rl_rows)
+  g  <- GRP(DT,  ~ country_code + reporting_year + reporting_level)
+
+
+
+
   # ZP Add: do fgt estimations using `res <- lapply(lt, process_dt, povline = povline)`
   #-------------------------
   fgt <- map_fgt(lt, l_rl_rows) |>
diff --git a/R/pip_grp_new.R b/R/pip_grp_new.R
index 1dcf557a..51f5240e 100644
--- a/R/pip_grp_new.R
+++ b/R/pip_grp_new.R
@@ -30,6 +30,13 @@ pip_grp_new <- \(country         = "ALL",
   }
 
 
+
+
+
+
+
+
+
   out <- fg_pip(
     country         = country_code,
     year            = year,
diff --git a/man/DT_fgt_by_rl.Rd b/man/DT_fgt_by_rl.Rd
deleted file mode 100644
index 72663294..00000000
--- a/man/DT_fgt_by_rl.Rd
+++ /dev/null
@@ -1,22 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/compute_fgt_new.R
-\name{DT_fgt_by_rl}
-\alias{DT_fgt_by_rl}
-\title{compute FGT using indices by reporting level}
-\usage{
-DT_fgt_by_rl(x, y, nx)
-}
-\arguments{
-\item{x}{data.table from lt list, with welfare and weight vectors}
-
-\item{y}{list of indices for each reporting level}
-
-\item{nx}{name of data table. Usuall country code and year in the form "CCC_YYYY"}
-}
-\value{
-data.table with FGT estimates by reporting level
-}
-\description{
-This function is intended to be used inside \link{map_fgt}
-}
-\keyword{internal}
diff --git a/man/map_fgt.Rd b/man/map_fgt.Rd
index a192f3f2..19935f6b 100644
--- a/man/map_fgt.Rd
+++ b/man/map_fgt.Rd
@@ -1,12 +1,27 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/compute_fgt_new.R
-\name{map_fgt}
+\name{DT_fgt_by_rl}
+\alias{DT_fgt_by_rl}
+\alias{lt_to_dt}
+\alias{map_lt_to_dt}
 \alias{map_fgt}
-\title{map over list of data.tables and indices to compute FGT by reporting_level}
+\title{compute FGT using indices by reporting level}
 \usage{
+DT_fgt_by_rl(x, y, nx)
+
+lt_to_dt(x, y, nx)
+
+map_lt_to_dt(lt, l_rl_rows)
+
 map_fgt(lt, l_rl_rows)
 }
 \arguments{
+\item{x}{data.table from lt list, with welfare and weight vectors}
+
+\item{y}{list of indices for each reporting level}
+
+\item{nx}{name of data table. Usuall country code and year in the form "CCC_YYYY"}
+
 \item{lt}{list of data.tables with welfare and weight data}
 
 \item{l_rl_rows}{list of indeces}
@@ -15,6 +30,6 @@ map_fgt(lt, l_rl_rows)
 data.table with all measured
 }
 \description{
-map over list of data.tables and indices to compute FGT by reporting_level
+This function is intended to be used inside \link{map_fgt}
 }
 \keyword{internal}

From b3a7c0500b3be5a9515ae8247f66fe6a1b8250e3 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Wed, 27 Aug 2025 18:09:47 -0400
Subject: [PATCH 082/203] change aggregation to collapse

---
 R/compute_fgt_new.R                           | 39 +++++++----
 R/fg_pip.R                                    | 31 ++-------
 R/pip_grp.R                                   | 68 +++++++------------
 R/pip_grp_new.R                               | 17 +++--
 R/pip_new_lineups.R                           | 41 ++++++-----
 R/utils.R                                     | 36 ++++++++++
 ..._get_mean_median.Rd => get_mean_median.Rd} | 10 +--
 man/map_fgt.Rd                                |  8 +--
 8 files changed, 134 insertions(+), 116 deletions(-)
 rename man/{fg_get_mean_median.Rd => get_mean_median.Rd} (66%)

diff --git a/R/compute_fgt_new.R b/R/compute_fgt_new.R
index 14cd897b..8879d018 100644
--- a/R/compute_fgt_new.R
+++ b/R/compute_fgt_new.R
@@ -144,7 +144,7 @@ compute_fgt <- function(w, wt, povlines) {
 #'
 #' @rdname map_fgt
 #' @keywords  internal
-DT_fgt_by_rl <- \(x, y, nx) {
+DT_fgt_by_rl <- \(x, y, nx, povline) {
   DT_fgt <- lapply(names(y), \(rl) {
 
     idx <- y[[rl]]
@@ -168,7 +168,7 @@ DT_fgt_by_rl <- \(x, y, nx) {
 #' jkoin reporting level and lt list into one data.table
 #'
 #' @rdname map_fgt
-lt_to_dt <- \(x, y, nx) {
+lt_to_dt <- \(x, y, nx, povline) {
   DT <- lapply(names(y), \(rl) {
 
     idx <- y[[rl]]
@@ -187,8 +187,9 @@ lt_to_dt <- \(x, y, nx) {
 #' Map lt_to_dt
 #'
 #' @rdname map_fgt
-map_lt_to_dt <- \(lt, l_rl_rows) {
-  Map(lt_to_dt, lt, l_rl_rows, names(lt)) |>
+map_lt_to_dt <- \(lt, l_rl_rows, povline) {
+  Map(lt_to_dt, lt, l_rl_rows, names(lt),
+      MoreArgs = list(povline = povline)) |>
     rbindlist(fill = TRUE)
 }
 
@@ -200,8 +201,9 @@ map_lt_to_dt <- \(lt, l_rl_rows) {
 #'
 #' @return data.table with all measured
 #' @keywords internal
-map_fgt <- \(lt, l_rl_rows) {
-  Map(DT_fgt_by_rl, lt, l_rl_rows, names(lt)) |>
+map_fgt <- \(lt, l_rl_rows, povline) {
+  Map(DT_fgt_by_rl, lt, l_rl_rows, names(lt),
+      MoreArgs = list(povline = povline)) |>
     rbindlist(fill = TRUE)
 }
 
@@ -267,13 +269,13 @@ load_data_list <- \(metadata) {
 
 
-pov_from_DT <- function(DT, povline, g) {
+pov_from_DT <- function(DT, povline, g, cores = 1) {
   w       <- DT$welfare
   wt      <- DT$weight
   n_pov   <- length(povline)
 
-  ng      <- (g)
-  grp_ids <- (g)
+  ng      <- g$N.groups
+  grp_ids <- g$groups
 
   # Precompute log(w) for efficiency
   pos <- w > 0
@@ -290,21 +292,28 @@ pov_from_DT <- function(DT, povline, g) {
     poor <- w < pov
     rel_dist <- fifelse(poor, 1 - w/pov, 0)
     keep <- poor & pos
-    watts_val <- fmean((log(pov) - logw) * keep, g = g, w = wt)
-    fgt0[[i]] <- fmean(poor, g = g, w = wt)
-    fgt1[[i]] <- fmean(rel_dist, g = g, w = wt)
-    fgt2[[i]] <- fmean(rel_dist^2, g = g, w = wt)
+    watts_val <- fmean((log(pov) - logw) * keep,
+                       g = g, w = wt, nthreads  = cores )
+    fgt0[[i]] <- fmean(poor, g = g, w = wt,
+                       nthreads  = cores)
+    fgt1[[i]] <- fmean(rel_dist, g = g, w = wt,
+                       nthreads  = cores)
+    fgt2[[i]] <- fmean(rel_dist^2, g = g, w = wt,
+                       nthreads  = cores)
     watts[[i]] <- watts_val
   }
 
-  data.table(
+  out <- data.table(
     povline = rep(povline, each = ng),
-    group_id = rep(grp_ids, times = n_pov),
     fgt0 = unlist(fgt0),
     fgt1 = unlist(fgt1),
     fgt2 = unlist(fgt2),
     watts = unlist(watts)
   )
+  # Repeat group columns for each povline
+  grp_dt <- grp_ids[rep(seq_len(ng), times = n_pov)]
+  add_vars(out, pos = "front") <- grp_dt
+  out
 }
 
 
diff --git a/R/fg_pip.R b/R/fg_pip.R
index 5cab3696..c367f207 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -78,22 +78,25 @@ fg_pip <- function(country,
   l_rl_rows <- get_rl_rows(lt_att)
 
 
-  DT <- map_lt_to_dt(lt, l_rl_rows)
-  g  <- GRP(DT,  ~ country_code + reporting_year + reporting_level)
+  # DT <- map_lt_to_dt(lt, l_rl_rows)
+  # setorder(DT, country_code, reporting_year, reporting_level, welfare)
+  # g  <- GRP(DT,
+  #           ~ country_code + reporting_year + reporting_level,
+  #           sort = TRUE)
 
 
   # ZP Add: do fgt estimations using `res <- lapply(lt, process_dt, povline = povline)`
   #-------------------------
-  fgt <- map_fgt(lt, l_rl_rows) |>
+  fgt <- map_fgt(lt, l_rl_rows, povline) |>
     funique() # TO REMOVE
 
   # convert reporting year to numeric
   fgt[, reporting_year := as.numeric(reporting_year)]
 
   # Add just mean and median
-  res <- fg_get_mean_median(fgt, lkup)
+  res <- get_mean_median(fgt, lkup, fill_gaps = TRUE)
 
 
   # try metadata unique code
@@ -314,23 +317,3 @@ create_full_list <- function(country, year, refy_lkup, data_present_in_master) {
 
 
-
-#' merge into fgt table the mean and median from dist stats table in lkup
-#'
-#' @param fgt data,table with fgt measures
-#' @param lkup lkup
-#'
-#' @return data.table with with fgt, mean and median
-#' @keywords internal
-fg_get_mean_median <- \(fgt, lkup) {
-  joyn::joyn(x = fgt,
-             y = lkup$lineup_dist_stats[,
-                                        .(country_code, reporting_year,
-                                          reporting_level, mean, median)],
-             by = c('country_code', "reporting_year", "reporting_level"),
-             match_type = "m:1", # multiple povlines
-             keep = "left",
-             reportvar = FALSE,
-             verbose = FALSE)
-}
-
diff --git a/R/pip_grp.R b/R/pip_grp.R
index b8b9abdc..8d50b238 100644
--- a/R/pip_grp.R
+++ b/R/pip_grp.R
@@ -228,43 +228,31 @@ pip_aggregate_by <- function(df,
   to_keep <- all_cols[!all_cols %in% c("pop_in_poverty",
                                        "estimate_type")]
 
-  df <- df[, .SD, .SDcols = to_keep]
-
-  group_lkup <- group_lkup[, c("region_code",
-                               "reporting_year",
-                               "reporting_pop")]
+  df <- df[, .SD, .SDcols = to_keep] # I think we can ommit this part
 
   # Compute stats weighted average by groups
-  rgn <- df[, lapply(.SD, stats::weighted.mean,
-                     w = reporting_pop,
-                     na.rm = TRUE),
-            by = .(region_name,
-                   region_code,
-                   reporting_year,
-                   poverty_line),
-            .SDcols = weighted_cols
-            ]
-
-  rgn <- group_lkup[rgn,
-                    on = .(region_code, reporting_year),
-                    allow.cartesian = TRUE
-  ]
+  rgn <- df |>
+    fgroup_by(region_name,
+              region_code,
+              reporting_year,
+              poverty_line) |>
+    fselect(c(weighted_cols, "reporting_pop")) |>
+    fmean(w = reporting_pop, stub = FALSE)
 
   if (any(c("ALL", "WLD") %in% country)) {
     # Compute world aggregates
-    wld <- compute_world_aggregates(rgn = rgn,
-                                    cols = weighted_cols)
+    wld <- compute_world_aggregates(rgn = rgn)
     if (length(country) == 1) {
       if (country == "WLD") {
         # Return only world aggregate
         out <- wld
       } else if (country == "ALL") {
         # Combine with other regional aggregates
-        out <- rbind(rgn, wld, fill = TRUE)
+        out <- rowbind(rgn, wld, fill = TRUE)
       }
     } else {
       # Combine with other regional aggregates
-      out <- rbind(rgn, wld, fill = TRUE)
+      out <- rowbind(rgn, wld, fill = TRUE)
       # Return selection only
       if (!"ALL" %in% country) {
         out <- out[region_code %in% country, ]
@@ -283,24 +271,16 @@ pip_aggregate_by <- function(df,
 }
 
 
-compute_world_aggregates <- function(rgn, cols) {
+compute_world_aggregates <- function(rgn, cols = NULL) {
   # Compute stats
   # Grouping by poverty line as well since we now have vectorized poverty line values
-  wld <- rgn[, lapply(.SD,
-                      stats::weighted.mean,
-                      w = reporting_pop,
-                      na.rm = TRUE),
-             by = .(reporting_year, poverty_line),
-             .SDcols = cols
-  ]
-  # Compute yearly population WLD totals
-  tmp <- rgn[, .(reporting_pop = sum(reporting_pop)),
-             by = .(reporting_year, poverty_line)]
-
-
-  wld <- wld[tmp, on = .(reporting_year = reporting_year, poverty_line = poverty_line)]
-  wld[["region_code"]] <- "WLD"
-  wld[["region_name"]] <- "World"
+  wld <- rgn |>
+    fgroup_by(reporting_year,
+              poverty_line) |>
+    num_vars() |>
+    fmean(w = reporting_pop, stub = FALSE) |>
+    ftransform(region_code = "WLD",
+               region_name = "World")
 
   return(wld)
 
@@ -316,10 +296,10 @@ filter_for_aggregate_by <- function(df) {
   # If nationally representative survey is available, use it
   # Otherwise, use whatever is available
 
-  out <- df[, check := length(reporting_level),
-            by = c("country_code", "reporting_year", "poverty_line")]
-  out <- out[out$check == 1 | (out$check > 1 & reporting_level == "national"), ]
-
-  return(out)
+  df[, check := length(reporting_level),
+      by = c("country_code", "reporting_year", "poverty_line")
+      ][
+        check == 1 | (check > 1 & reporting_level == "national"),
+        ]
 
 }
diff --git a/R/pip_grp_new.R b/R/pip_grp_new.R
index 51f5240e..184c0583 100644
--- a/R/pip_grp_new.R
+++ b/R/pip_grp_new.R
@@ -29,14 +29,6 @@ pip_grp_new <- \(country         = "ALL",
     "ALL"
   }
 
-
-
-
-
-
-
-
-
   out <- fg_pip(
     country         = country_code,
     year            = year,
@@ -47,7 +39,14 @@ pip_grp_new <- \(country         = "ALL",
     ppp             = NULL,
     lkup           = lkup)
 
-  out <- treat_cache_and_main(out)
+  cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb")
+  if (!file.exists(cache_file_path)) {
+    # Create an empty duckdb file
+    create_duckdb_file(cache_file_path)
+  }
+  out <- treat_cache_and_main(out,
+                              cache_file_path = cache_file_path,
+                              lkup = lkup, fill_gaps = fill_gaps)
 
   # return empty dataframe if no metadata is found
   if (nrow(out) == 0) {
diff --git a/R/pip_new_lineups.R b/R/pip_new_lineups.R
index 33efc76f..0d3cb528 100644
--- a/R/pip_new_lineups.R
+++ b/R/pip_new_lineups.R
@@ -141,7 +141,9 @@ pip_new_lineups <- function(country         = "ALL",
 
   # Cache new data
   #---------------------------------------------
-  out <- treat_cache_and_main(out, cache_file_path)
+  out <- treat_cache_and_main(out,
+                              cache_file_path = cache_file_path,
+                              lkup = lkup, fill_gaps = fill_gaps)
 
   # Early return for empty table---------------
   if (nrow(out) == 0) return(pipapi::empty_response)
@@ -276,36 +278,43 @@ pip_new_lineups <- function(country         = "ALL",
 
 
+treat_cache_and_main <- \(out, cache_file_path,
+                          lkup, fill_gaps) {
 
-
-treat_cache_and_main <- \(out, cache_file_path) {
-  cached_data <- if (is.null(out$data_in_cache)) {
+    # early return of cache data if not available.
+  cached_data <-
+    if (is.null(out$data_in_cache)) {
     NULL
   } else if (is.data.frame(out$data_in_cache)) {
+
     if (fnrow(out$data_in_cache) == 0) {
       NULL
     } else {
-      qDT(out$data_in_cache)
+      ft <- qDT(out$data_in_cache)
+      if (fill_gaps) {
+        ft <-
+          fg_remove_duplicates(ft,
+                               use_new_lineup_version = lkup$use_new_lineup_version)
+
+      }
+
+      # Add just mean and median
+      get_mean_median(ft, lkup, fill_gaps = fill_gaps)
+
     }
   } else {
-    cli::cli_abort("{.code out$data_in_cache} must be NULL or data.frame not {.field {class(out$data_in_cache)}}")
+    cli::cli_abort(
+      "{.code out$data_in_cache} must be NULL or data.frame not
+      {.field {class(out$data_in_cache)}}"
+    )
   }
 
   main_data   <- qDT(out$main_data)
 
   if (nrow(main_data) > 0) {
-
-    if (is.null(out$data_in_cache)) {
-
+    if (is.null(cached_data)) {
       out <- main_data
     } else {
-
-      if (fill_gaps) {
-
-        cached_data <- fg_remove_duplicates(cached_data,
-                                            use_new_lineup_version = lkup$use_new_lineup_version)
-      }
-
       out <- main_data |>
         rowbind(cached_data)
     }
diff --git a/R/utils.R b/R/utils.R
index 33969942..0d84c2ac 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -1462,3 +1462,39 @@ unnest_dt_longer <- function(tbl, cols) {
 
 
+
+
+
+#' merge into fgt table the mean and median from dist stats table in lkup
+#'
+#' @param fgt data,table with fgt measures
+#' @param lkup lkup
+#' @param fill_gaps logical. whether to use lineup estimates
+#'
+#' @return data.table with with fgt, mean and median
+#' @keywords internal
+get_mean_median <- \(fgt, lkup, fill_gaps) {
+
+  if (isFALSE(lkup$use_new_lineup_version)) return(fgt)
+
+  if (fill_gaps) {
+    dist <- lkup$lineup_dist_stats[,
+                                   .(country_code, reporting_year,
+                                     reporting_level, mean, median)]
+  } else {
+    dist <- lkup$dist_stats[,
+                            .(country_code, reporting_year,
+                              reporting_level, mean,
+                              median = survey_median_ppp)]
+
+  }
+  joyn::joyn(x = fgt,
+             y = dist,
+             by = c('country_code', "reporting_year", "reporting_level"),
+             match_type = "m:1", # multiple povlines
+             keep = "left",
+             reportvar = FALSE,
+             verbose = FALSE)
+}
+
+
diff --git a/man/fg_get_mean_median.Rd b/man/get_mean_median.Rd
similarity index 66%
rename from man/fg_get_mean_median.Rd
rename to man/get_mean_median.Rd
index 88667176..f4fed163 100644
--- a/man/fg_get_mean_median.Rd
+++ b/man/get_mean_median.Rd
@@ -1,15 +1,17 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/fg_pip.R
-\name{fg_get_mean_median}
-\alias{fg_get_mean_median}
+% Please edit documentation in R/utils.R
+\name{get_mean_median}
+\alias{get_mean_median}
 \title{merge into fgt table the mean and median from dist stats table in lkup}
 \usage{
-fg_get_mean_median(fgt, lkup)
+get_mean_median(fgt, lkup, fill_gaps)
 }
 \arguments{
 \item{fgt}{data,table with fgt measures}
 
 \item{lkup}{lkup}
+
+\item{fill_gaps}{logical. whether to use lineup estimates}
 }
 \value{
 data.table with with fgt, mean and median
diff --git a/man/map_fgt.Rd b/man/map_fgt.Rd
index 19935f6b..e4d1c224 100644
--- a/man/map_fgt.Rd
+++ b/man/map_fgt.Rd
@@ -7,13 +7,13 @@
 \alias{map_fgt}
 \title{compute FGT using indices by reporting level}
 \usage{
-DT_fgt_by_rl(x, y, nx)
+DT_fgt_by_rl(x, y, nx, povline)
 
-lt_to_dt(x, y, nx)
+lt_to_dt(x, y, nx, povline)
 
-map_lt_to_dt(lt, l_rl_rows)
+map_lt_to_dt(lt, l_rl_rows, povline)
 
-map_fgt(lt, l_rl_rows)
+map_fgt(lt, l_rl_rows, povline)
 }
 \arguments{
 \item{x}{data.table from lt list, with welfare and weight vectors}

From 83ed0fff1cbc8c3f8763c787902cebac02b49c05 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Sat, 30 Aug 2025 09:08:47 -0400
Subject: [PATCH 083/203] add validate_country_codes function

---
 R/compute_fgt_new.R |  2 +-
 R/create_lkups.R    |  3 ++-
 R/pip_new_lineups.R | 28 +++++++++++++++++++---------
 3 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/R/compute_fgt_new.R b/R/compute_fgt_new.R
index 8879d018..2d09a1e8 100644
--- a/R/compute_fgt_new.R
+++ b/R/compute_fgt_new.R
@@ -275,7 +275,7 @@ pov_from_DT <- function(DT, povline, g, cores = 1) {
   n_pov   <- length(povline)
 
   ng      <- g$N.groups
-  grp_ids <- g$groups
+  grp_ids <- qDT(g$groups)
 
   # Precompute log(w) for efficiency
   pos <- w > 0
diff --git a/R/create_lkups.R b/R/create_lkups.R
index 531f7933..890a7c04 100644
--- a/R/create_lkups.R
+++ b/R/create_lkups.R
@@ -16,7 +16,8 @@ create_versioned_lkups <-
 
     versions <- names(data_dirs)
     # versions[1] <- "latest_release"
-    versions_paths <- lapply(data_dirs, create_lkups, versions = versions)
+    versions_paths <- mapply(create_lkups, data_dirs, versions,
+                             SIMPLIFY = FALSE, USE.NAMES = FALSE)
     names(versions_paths) <- versions
 
     return(list(versions = versions,
diff --git a/R/pip_new_lineups.R b/R/pip_new_lineups.R
index 0d3cb528..df71ad34 100644
--- a/R/pip_new_lineups.R
+++ b/R/pip_new_lineups.R
@@ -98,13 +98,10 @@ pip_new_lineups <- function(country         = "ALL",
   # **** TO BE REMOVED **** REMOVAL ENDS HERE
 
   # Countries vector ------------
-  lcv <- # List with countries vectors
-    create_countries_vctr(
-      country         =  country,
-      year            =  year,
-      valid_years     =  lkup$valid_years,
-      aux_files       =  lkup$aux_files
-    )
+  validate_country_codes(country = country, lkup = lkup)
+
+
+
   # lcv$est_ctrs has all the country_code that we are interested in
 
   cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb")
@@ -116,7 +113,7 @@ pip_new_lineups <- function(country         = "ALL",
   if (fill_gaps) {
     ## lineup years-----------------
     out <- fg_pip(
-      country            = lcv$est_ctrs,
+      country            = country,
       year               = year,
       povline            = povline,
       popshare           = popshare,
@@ -128,7 +125,7 @@ pip_new_lineups <- function(country         = "ALL",
   } else {
     ## survey years ------------------
     out <- rg_pip(
-      country         = lcv$est_ctrs,
+      country         = country,
       year            = year,
       povline         = povline,
       popshare        = popshare,
@@ -328,3 +325,16 @@ treat_cache_and_main <- \(out, cache_file_path,
 
   setDT(out)
 }
+
+
+
+validate_country_codes <- \(country, lkup) {
+  cls <- lkup$aux_files$country_list$country_code |>
+    unique()
+
+  if (any(!country %in% cls)) {
+    wcls <- which(!country %in% cls)
+    cli::cli_abort("{.field {country[wcls]}} {?is/are} not {?a/} valid country code{?s}")
+  }
+  invisible(TRUE)
+}

From c0b5da3b4e4d4379d8371ee26ff62611316e447e Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Tue, 2 Sep 2025 10:09:32 -0400
Subject: [PATCH 084/203] temporal fix to include CMD info

---
 R/create_lkups.R | 111 ++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 95 insertions(+), 16 deletions(-)

diff --git a/R/create_lkups.R b/R/create_lkups.R
index 890a7c04..e547e277 100644
--- a/R/create_lkups.R
+++ b/R/create_lkups.R
@@ -195,6 +195,83 @@ create_lkups <- function(data_dir, versions) {
       refy_lkup      <- fst::read_fst(refy_lkup_path,
                                       as.data.table = TRUE)
 
+
+      # ZP ADD - CREATE OBJECT: lineup years
+      #___________________________________________________________________________
+      lineup_years_path <-
+        fs::path(data_dir,
+                 "estimations/lineup_years.fst")
+
+      lineup_years <- fst::read_fst(lineup_years_path) |>
+        as.list() # Why Is this a list?
+
+
+
+      # --- START NOTE AC> Include here the refy_lkup for CMD
+      ncountries <- nrow(country_list)
+      ly <- lineup_years$lineup_years
+
+
+      # Add reporting year
+      cmd2 <- CJ(country_list$country_code,
+                 ly, c("national", "urban", "rural"))
+
+
+      cmd <-
+        country_list[rep(1:.N,
+                         each = length(ly))
+                     ][,
+                       reporting_year := rep(ly, times = ncountries)
+                       # Add reporting level
+                       ][rep(1:.N, each = 3),
+                         ][,
+                           reporting_level :=
+                             rep(c("national", "urban", "rural"),
+                                 times = (length(ly)*ncountries))]
+
+      cmd <- joyn::joyn(x          = cmd,
+                         y          = refy_lkup,
+                         by         = c('country_code',
+                                        'reporting_year',
+                                        'reporting_level'),
+                         keep       = "anti",
+                         # reportvar  = FALSE,
+                         match_type = "1:1")
+
+      # Delete unnecessary reporting levels
+      cmd <- cmd[(reporting_level == "national" & .joyn == "x")
+                 ][,
+                   .joyn := NULL]
+
+      # build some variables
+      cmd[,
+           `:=`(
+               cache_id = paste(country_code, reporting_year,"NOSVY_D1_CON_CMD",
+                                sep = "_"),
+               survey_coverage = "national",
+               welfare_type = "consumption",
+               distribution_type = "CMD distribution",
+               is_interpolated = FALSE,
+               is_used_for_line_up = TRUE,
+               is_used_for_aggregation = FALSE,
+               estimation_type = "CMD estimation",
+               display_cp = "0",
+               monotonic = TRUE, # ?
+               same_direction = TRUE, # NA ?
+               relative_distance = 1,
+               lineup_approach = "CMD",
+               mult_factor = 1
+
+             )]
+
+      # Append lineup and CMD info
+
+      refy_lkup <- rbindlist(list(refy_lkup, cmd),
+                             use.names = TRUE,
+                             fill = TRUE)
+
+
+      # Create additional variables
       refy_lkup[ ,
                  path := {
                    fs::path(data_dir,
@@ -212,10 +289,10 @@ create_lkups <- function(data_dir, versions) {
                                           reporting_level,
                                           sep = "_")]
 
-      if ("region_code" %in% names(refy_lkup)) {
-        refy_lkup[,
-                  region_code := NULL]
-      }
+      # if ("region_code" %in% names(refy_lkup)) {
+      #   refy_lkup[,
+      #             region_code := NULL]
+      # }
 
 
       refy_lkup[,
@@ -229,14 +306,24 @@ create_lkups <- function(data_dir, versions) {
                                                collapse = "|"),
                 by = .(interpolation_id)]
 
-      refy_lkup <- joyn::joyn(x          = refy_lkup,
-                              y          = countries,
-                              by         = 'country_code',
+
+      # Temporal fix
+      refy_lkup <- joyn::joyn(refy_lkup, country_list,
+                              by         = c('country_code',
+                                             'reporting_year',
+                                             'reporting_level'),
                               keep       = "left",
                               reportvar  = FALSE,
-                              match_type = "m:1")
+                              match_type = "1:1",
+                              update_values = TRUE)
+
 
 
+
+      # --- END inclussion of CMD data.
+
+      refy_lkup <- refy_lkup[reporting_year %in% lineup_years$lineup_years, ]
+
       gv(refy_lkup,
          c("monotonic",
            "same_direction",
@@ -256,15 +343,7 @@ create_lkups <- function(data_dir, versions) {
            "relative_distance")) <- NULL
 
 
-      # ZP ADD - CREATE OBJECT: lineup years
-      #___________________________________________________________________________
-      lineup_years_path <-
-        fs::path(data_dir,
-                 "estimations/lineup_years.fst")
 
-      lineup_years <- fst::read_fst(lineup_years_path) |>
-        as.list()
-      refy_lkup <- refy_lkup[reporting_year %in% lineup_years$lineup_years, ]
 
       # ZP ADD - CREATE OBJECT: lineup dist stats
       #___________________________________________________________________________

From cb756d3e4b91078478ba2e746be80bbc561084cc Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Tue, 2 Sep 2025 10:10:37 -0400
Subject: [PATCH 085/203] improve efficiency

---
 R/create_lkups.R | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/R/create_lkups.R b/R/create_lkups.R
index e547e277..4cd9e306 100644
--- a/R/create_lkups.R
+++ b/R/create_lkups.R
@@ -213,21 +213,12 @@ create_lkups <- function(data_dir, versions) {
 
 
       # Add reporting year
-      cmd2 <- CJ(country_list$country_code,
+      cmd <- CJ(country_list$country_code,
                  ly, c("national", "urban", "rural"))
 
-
-      cmd <-
-        country_list[rep(1:.N,
-                         each = length(ly))
-                     ][,
-                       reporting_year := rep(ly, times = ncountries)
-                       # Add reporting level
-                       ][rep(1:.N, each = 3),
-                         ][,
-                           reporting_level :=
-                             rep(c("national", "urban", "rural"),
-                                 times = (length(ly)*ncountries))]
+      setnames(cmd, new = c('country_code',
+                            'reporting_year',
+                            'reporting_level'))
 
       cmd <- joyn::joyn(x          = cmd,
                          y          = refy_lkup,

From 11c5e440390c476f9a1e230044701d19383c5927 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Tue, 2 Sep 2025 10:14:08 -0400
Subject: [PATCH 086/203] tempooral fix complete

---
 R/create_lkups.R | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/R/create_lkups.R b/R/create_lkups.R
index 4cd9e306..d4d26413 100644
--- a/R/create_lkups.R
+++ b/R/create_lkups.R
@@ -300,12 +300,10 @@ create_lkups <- function(data_dir, versions) {
 
       # Temporal fix
       refy_lkup <- joyn::joyn(refy_lkup, country_list,
-                              by         = c('country_code',
-                                             'reporting_year',
-                                             'reporting_level'),
+                              by         = "country_code",
                               keep       = "left",
                               reportvar  = FALSE,
-                              match_type = "1:1",
+                              match_type = "m:1",
                               update_values = TRUE)
 
 
From 6cfcca9e55338d33305b15390df4cb3da3c4734f Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Tue, 2 Sep 2025 16:35:50 -0400
Subject: [PATCH 087/203] update population

---
 R/create_lkups.R | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/R/create_lkups.R b/R/create_lkups.R
index d4d26413..2e30c646 100644
--- a/R/create_lkups.R
+++ b/R/create_lkups.R
@@ -307,6 +307,25 @@ create_lkups <- function(data_dir, versions) {
                               update_values = TRUE)
 
 
+      # merge population
+      popl <- pivot(pop,
+            ids = c("country_code", "data_level"),
+            names = list(variable = "reporting_year",
+                         value = "reporting_pop"),
+            how = "longer") |>
+        frename(data_level = reporting_level)
+
+      refy_lkup <- joyn::joyn(refy_lkup, popl,
+                              by         = c('country_code',
+                                             'reporting_year',
+                                             'reporting_level'),
+                              keep       = "left",
+                              reportvar  = FALSE,
+                              match_type = "1:1",
+                              update_values = TRUE)
+
+
+
 
 
       # --- END inclussion of CMD data.

From ea094e1c0eb554e67682513774fbbccd51e9c410 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Wed, 3 Sep 2025 12:08:09 -0400
Subject: [PATCH 088/203] make sure we don't have factors in years of
 population

---
 R/create_lkups.R | 1 +
 1 file changed, 1 insertion(+)

diff --git a/R/create_lkups.R b/R/create_lkups.R
index 2e30c646..7556f2ad 100644
--- a/R/create_lkups.R
+++ b/R/create_lkups.R
@@ -313,6 +313,7 @@ create_lkups <- function(data_dir, versions) {
             names = list(variable = "reporting_year",
                          value = "reporting_pop"),
             how = "longer") |>
+        ftransform(reporting_year = as_integer_factor(reporting_year)) |>
         frename(data_level = reporting_level)
 
       refy_lkup <- joyn::joyn(refy_lkup, popl,

From 9ad8a6dd40cd7b56abe49fd6e811e84202e92bb3 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Thu, 4 Sep 2025 17:15:55 -0400
Subject: [PATCH 089/203] fix bug in country_list

---
 R/create_lkups.R |  1 +
 R/pip_grp_new.R  |  4 ++--
 man/pip_agg.Rd   | 16 +++++++++++++++-
 man/pip_gg.Rd    | 20 --------------------
 4 files changed, 18 insertions(+), 23 deletions(-)
 delete mode 100644 man/pip_gg.Rd

diff --git a/R/create_lkups.R b/R/create_lkups.R
index 7556f2ad..e335511a 100644
--- a/R/create_lkups.R
+++ b/R/create_lkups.R
@@ -92,6 +92,7 @@ create_lkups <- function(data_dir, versions) {
   ## country_list ----
   cl_lkup_path    <- fs::path(data_dir, "_aux/country_list.fst")
   country_list     <- fst::read_fst(cl_lkup_path, as.data.table = TRUE)
+  data.table::setnames(country_list, 'region', 'region_name') # Why is this necessary?
 
   ## countries ----
   cts_path <- fs::path(data_dir, "_aux/countries.fst")
diff --git a/R/pip_grp_new.R b/R/pip_grp_new.R
index 184c0583..352e652d 100644
--- a/R/pip_grp_new.R
+++ b/R/pip_grp_new.R
@@ -1,5 +1,5 @@
 #' New way to estimate Aggregate data
-#' @rdname pip_gg
+#' @rdname pip_agg
 pip_grp_new <- \(country         = "ALL",
                  year            = "ALL",
                  povline         = 1.9,
@@ -46,7 +46,7 @@ pip_grp_new <- \(country         = "ALL",
   }
   out <- treat_cache_and_main(out,
                               cache_file_path = cache_file_path,
-                              lkup = lkup, fill_gaps = fill_gaps)
+                              lkup = lkup, fill_gaps = TRUE)
 
   # return empty dataframe if no metadata is found
   if (nrow(out) == 0) {
diff --git a/man/pip_agg.Rd b/man/pip_agg.Rd
index 9c197ecb..3eb27292 100644
--- a/man/pip_agg.Rd
+++ b/man/pip_agg.Rd
@@ -1,8 +1,9 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/pip_agg.R, R/pip_grp_logic.R
+% Please edit documentation in R/pip_agg.R, R/pip_grp_logic.R, R/pip_grp_new.R
 \name{pip_agg}
 \alias{pip_agg}
 \alias{pip_grp_logic}
+\alias{pip_grp_new}
 \title{Logic for computing new aggregate}
 \usage{
 pip_agg(
@@ -30,6 +31,17 @@ pip_grp_logic(
   lkup_hash = lkup$cache_data_id$hash_pip_grp,
   additional_ind = FALSE
 )
+
+pip_grp_new(
+  country = "ALL",
+  year = "ALL",
+  povline = 1.9,
+  welfare_type = c("all", "consumption", "income"),
+  reporting_level = c("all", "national"),
+  lkup,
+  censor = TRUE,
+  additional_ind = FALSE
+)
 }
 \arguments{
 \item{country}{character: Country ISO 3 codes}
@@ -63,6 +75,8 @@ data.table
 Logic for computing new aggregate
 
 Old way to estimate aggregate data
+
+New way to estimate Aggregate data
 }
 \examples{
 \dontrun{
diff --git a/man/pip_gg.Rd b/man/pip_gg.Rd
deleted file mode 100644
index 285bcec0..00000000
--- a/man/pip_gg.Rd
+++ /dev/null
@@ -1,20 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/pip_grp_new.R
-\name{pip_grp_new}
-\alias{pip_grp_new}
-\title{New way to estimate Aggregate data}
-\usage{
-pip_grp_new(
-  country = "ALL",
-  year = "ALL",
-  povline = 1.9,
-  welfare_type = c("all", "consumption", "income"),
-  reporting_level = c("all", "national"),
-  lkup,
-  censor = TRUE,
-  additional_ind = FALSE
-)
-}
-\description{
-New way to estimate Aggregate data
-}

From 772de99c58ebcd8ee5e32eaa3b82b0b5640d0112 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Thu, 4 Sep 2025 20:17:31 -0400
Subject: [PATCH 090/203] preliminary results... working but ugly

---
 R/fg_pip.R              | 73 +++++------------------------------------
 R/utils-pipdata.R       |  6 +---
 man/create_full_list.Rd | 12 ++-----
 3 files changed, 12 insertions(+), 79 deletions(-)

diff --git a/R/fg_pip.R b/R/fg_pip.R
index c367f207..8d46dc31 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -58,10 +58,7 @@ fg_pip <- function(country,
                 data_in_cache = data_present_in_master))
   }
 
-  full_list <- create_full_list(country                = country,
-                                year                   = year,
-                                refy_lkup              = refy_lkup,
-                                data_present_in_master = data_present_in_master)
+  full_list <- create_full_list(metadata = metadata)
 
   lt <-
     load_list_refy(input_list = full_list,
@@ -251,68 +248,14 @@ fg_assign_nas_values_to_dup_cols <- function(df,
 
 #' Create full list for fg data load, not including country-years in cache
 #'
-#' @param country Country selected in [fg_pip] function
-#' @param year Year/s selected in [fg_pip] function
-#' @param refy_lkup reference year lkup table with full lineups and new method
-#' @param data_present_in_master cache data
-#'
-#' @return list
-create_full_list <- function(country, year, refy_lkup, data_present_in_master) {
-
-  if (!is.null(data_present_in_master)) {
-    data_not_in_cache <-
-      joyn::anti_join(x         =  refy_lkup,
-                      y         =  data_present_in_master,
-                      by        = c("country_code", "reporting_year"),
-                      reportvar = FALSE,
-                      verbose   = FALSE)
-  } else {
-    data_not_in_cache <- refy_lkup
-  }
-
-  # Extract unique combinations of country-year
-  if (any(c("ALL", "WLD") %in% country)) {
-    cntry <- data_not_in_cache$country_code |>
-      funique()
-  } else {
-    cntry <- data_not_in_cache[country_code %in% country,
-    ]$country_code |>
-      funique()
-  }
-  if (any(c("ALL") %in% year)) {
-    yr <- data_not_in_cache$reporting_year |>
-      unique()
-  } else {
-    yr <- data_not_in_cache[reporting_year %in% year,
-    ]$reporting_year |>
-      funique()
-  }
-  dtemp <-
-    data_not_in_cache |>
-    fsubset(country_code     %in% cntry &
-              reporting_year %in% yr) |>
-    fsubset(reporting_year %in% lkup$valid_years$lineup_years) |>
-    fselect(country_code,
-            year = reporting_year) |>
-    funique()
-
-  # Split years by country
-  full_list <- dtemp[,
-                     .(year = list(year)),
-                     by = country_code
-                     ][,
-                       .(country_code, year = year)
-                     ]
-
-  # Convert to desired structure
-  full_list <- list(
-    country_code = full_list$country_code,
-    year         = lapply(full_list$year,
-                          as.numeric))
-
-
-  full_list
+#' @param metadata data table from subset_lkup()$lkup
+#' @return data.table
+create_full_list <- function(metadata) {
 
+  x <- metadata[, .(country_code, reporting_year)
+                ][,
+                  inames := paste0(country_code, "_", reporting_year)]
+  x
 }
 
 
diff --git a/R/utils-pipdata.R b/R/utils-pipdata.R
index 33ce4df6..86f82230 100644
--- a/R/utils-pipdata.R
+++ b/R/utils-pipdata.R
@@ -6,13 +6,9 @@
 #' @return character vector
 #' @keywords internal
 load_list_refy <- \(input_list, path){
-  input_list <- transform_input(input_list)
-
-  inames <- lapply(input_list, \(x) {
-    paste0(x$country_code, "_", x$year)
-  })
 
   cores <- get_from_pipapienv("cores_to_use")
+  inames <- input_list$inames
 
   dl <- lapply(inames, \(x) {
     qs::qread(file = fs::path(path, x, ext = "qs"),
diff --git a/man/create_full_list.Rd b/man/create_full_list.Rd
index 249ae8b6..5fe3dbc9 100644
--- a/man/create_full_list.Rd
+++ b/man/create_full_list.Rd
@@ -4,19 +4,13 @@
 \alias{create_full_list}
 \title{Create full list for fg data load, not including country-years in cache}
 \usage{
-create_full_list(country, year, refy_lkup, data_present_in_master)
+create_full_list(metadata)
 }
 \arguments{
-\item{country}{Country selected in \link{fg_pip} function}
-
-\item{year}{Year/s selected in \link{fg_pip} function}
-
-\item{refy_lkup}{reference year lkup table with full lineups and new method}
-
-\item{data_present_in_master}{cache data}
+\item{metadata}{data table from subset_lkup()$lkup}
 }
 \value{
-list
+data.table
 }
 \description{
 Create full list for fg data load, not including country-years in cache

From bac2abffa738d11a038039d1e9172ac2b168ef65 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Thu, 4 Sep 2025 21:37:19 -0400
Subject: [PATCH 091/203] fix issue with missclassified CMDs

---
 R/create_lkups.R    | 18 +++++++++++++-----
 R/pip_grp.R         | 30 ++++++++++++++++++++----------
 R/pip_grp_logic.R   |  1 -
 R/pip_grp_new.R     |  1 -
 R/pip_new_lineups.R |  1 -
 R/pip_old.R         |  1 -
 R/pip_old_lineups.R |  1 -
 7 files changed, 33 insertions(+), 20 deletions(-)

diff --git a/R/create_lkups.R b/R/create_lkups.R
index e335511a..9f2a847e 100644
--- a/R/create_lkups.R
+++ b/R/create_lkups.R
@@ -228,12 +228,18 @@ create_lkups <- function(data_dir, versions) {
                                         'reporting_level'),
                          keep       = "anti",
                          # reportvar  = FALSE,
-                         match_type = "1:1")
+                         match_type = "1:1") |>
+        setDT()
+
+      # get number or reporing levels left after anti join
+      # if less than three, it should NOT be CMD (e.g., ARG or CHN)
+      cmd[, n := .N,
+          by = c("country_code", "reporting_year")]
 
       # Delete unnecessary reporting levels
-      cmd <- cmd[(reporting_level == "national" & .joyn == "x")
-                 ][,
-                   .joyn := NULL]
+      cmd <- cmd[(reporting_level == "national" &
+                     .joyn == "x" & n == 3)
+                 ]
 
       # build some variables
       cmd[,
@@ -252,7 +258,9 @@ create_lkups <- function(data_dir, versions) {
                same_direction = TRUE, # NA ?
                relative_distance = 1,
                lineup_approach = "CMD",
-               mult_factor = 1
+               mult_factor = 1,
+               .joyn = NULL,
+               n     = NULL
 
              )]
 
diff --git a/R/pip_grp.R b/R/pip_grp.R
index 8d50b238..5a470a37 100644
--- a/R/pip_grp.R
+++ b/R/pip_grp.R
@@ -86,7 +86,6 @@ pip_grp <- function(country         = "ALL",
 
     out <- pip_aggregate_by(
       df = out,
-      group_lkup = lkup[["pop_region"]],
       country = country,
       return_cols = lkup$return_cols$pip_grp
     )
@@ -209,13 +208,11 @@ pip_aggregate <- function(df, by = NULL, return_cols) {
 
 #' Aggregate by predefined groups
 #' @param df data.frame: Response from `fg_pip_old()` or `rg_pip()`.
-#' @param group_lkup data.frame: Group lkup table (pop_region)
 #' @param country character: Selected countries / regions
 #' @param return_cols list: lkup$return_cols$pip_grp object. Controls returned
 #' columns
 #' @keywords internal
 pip_aggregate_by <- function(df,
-                             group_lkup,
                              country = "ALL",
                              return_cols) {
 
@@ -228,7 +225,7 @@ pip_aggregate_by <- function(df,
   to_keep <- all_cols[!all_cols %in% c("pop_in_poverty",
                                        "estimate_type")]
 
-  df <- df[, .SD, .SDcols = to_keep] # I think we can ommit this part
+  # df <- df[, .SD, .SDcols = to_keep] # I think we can ommit this part
 
   # Compute stats weighted average by groups
   rgn <- df |>
@@ -239,6 +236,19 @@ pip_aggregate_by <- function(df,
     fselect(c(weighted_cols, "reporting_pop")) |>
     fmean(w = reporting_pop, stub = FALSE)
 
+  # Africas aggregation
+  afr <- df |>
+    fgroup_by(africa_split,
+              africa_split_code,
+              reporting_year,
+              poverty_line) |>
+    fselect(c(weighted_cols, "reporting_pop")) |>
+    fmean(w = reporting_pop, stub = FALSE) |>
+    fsubset(!is.na(africa_split_code))
+
+
+
+
   if (any(c("ALL", "WLD") %in% country)) {
     # Compute world aggregates
     wld <- compute_world_aggregates(rgn = rgn)
@@ -248,11 +258,11 @@ pip_aggregate_by <- function(df,
         out <- wld
       } else if (country == "ALL") {
         # Combine with other regional aggregates
-        out <- rowbind(rgn, wld, fill = TRUE)
+        out <- rowbind(rgn, afr, wld, fill = TRUE)
       }
     } else {
       # Combine with other regional aggregates
-      out <- rowbind(rgn, wld, fill = TRUE)
+      out <- rowbind(rgn, afr, wld, fill = TRUE)
       # Return selection only
       if (!"ALL" %in% country) {
         out <- out[region_code %in% country, ]
@@ -260,14 +270,14 @@ pip_aggregate_by <- function(df,
     }
   } else {
     # Return only selected regions
-    out <- rgn
-  }
+    out <- rowbind(rgn, afr, fill = TRUE) |>
+      _[region_code %in% country, ]
 
+  }
 
   # Compute population living in poverty
-  out$pop_in_poverty <- round(out$headcount * out$reporting_pop, 0)
+  out[, pop_in_poverty := round(headcount * reporting_pop, 0)]
 
-  return(out)
 }
 
 
diff --git a/R/pip_grp_logic.R b/R/pip_grp_logic.R
index 5503b4c7..497dc907 100644
--- a/R/pip_grp_logic.R
+++ b/R/pip_grp_logic.R
@@ -315,7 +315,6 @@ pip_grp_helper <- function(lcv_country,
 
     out <- pip_aggregate_by(
       df = out,
-      group_lkup = lkup[["pop_region"]],
       country = country,
       return_cols = lkup$return_cols$pip_grp
     )
diff --git a/R/pip_grp_new.R b/R/pip_grp_new.R
index 352e652d..e4880abb 100644
--- a/R/pip_grp_new.R
+++ b/R/pip_grp_new.R
@@ -71,7 +71,6 @@ pip_grp_new <- \(country         = "ALL",
 
     out <- pip_aggregate_by(
       df = out,
-      group_lkup = lkup[["pop_region"]],
       country = country,
       return_cols = lkup$return_cols$pip_grp
     )
diff --git a/R/pip_new_lineups.R b/R/pip_new_lineups.R
index df71ad34..9687901a 100644
--- a/R/pip_new_lineups.R
+++ b/R/pip_new_lineups.R
@@ -171,7 +171,6 @@ pip_new_lineups <- function(country         = "ALL",
 
     out <- pip_aggregate_by(
       df          = out,
-      group_lkup  = lkup[["pop_region"]],
       return_cols = lkup$return_cols$pip_grp)
 
     # Censor regional values
diff --git a/R/pip_old.R b/R/pip_old.R
index 3fc5cee7..aa9c28fd 100644
--- a/R/pip_old.R
+++ b/R/pip_old.R
@@ -184,7 +184,6 @@ pip_old <- function(country         = "ALL",
 
     out <- pip_aggregate_by(
       df          = out,
-      group_lkup  = lkup[["pop_region"]],
       return_cols = lkup$return_cols$pip_grp
     )
     # Censor regional values
diff --git a/R/pip_old_lineups.R b/R/pip_old_lineups.R
index 8994a3ba..6424d6c5 100644
--- a/R/pip_old_lineups.R
+++ b/R/pip_old_lineups.R
@@ -182,7 +182,6 @@ pip_old_lineups <- function(country         = "ALL",
 
     out <- pip_aggregate_by(
       df          = out,
-      group_lkup  = lkup[["pop_region"]],
       return_cols = lkup$return_cols$pip_grp
     )
     # Censor regional values

From 264e62b82746c755e0b6aba63664cb4d39f2022b Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Thu, 4 Sep 2025 21:51:31 -0400
Subject: [PATCH 092/203] fix small issue

---
 R/pip_grp.R | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/R/pip_grp.R b/R/pip_grp.R
index 5a470a37..ac9040ac 100644
--- a/R/pip_grp.R
+++ b/R/pip_grp.R
@@ -244,7 +244,9 @@ pip_aggregate_by <- function(df,
               poverty_line) |>
     fselect(c(weighted_cols, "reporting_pop")) |>
     fmean(w = reporting_pop, stub = FALSE) |>
-    fsubset(!is.na(africa_split_code))
+    fsubset(!is.na(africa_split_code)) |>
+    frename(africa_split_code = region_code,
+            africa_split      = region_name)
 
 
From 1c7d05474e5b2155136c31fbe350d54bf7bd59e1 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Fri, 5 Sep 2025 09:01:48 -0400
Subject: [PATCH 093/203] set default for data_dir in get_aux_table()

---
 R/get_aux_table.R   | 12 +++++++++++-
 R/pip_grp.R         |  3 ++-
 R/pip_new_lineups.R |  3 ++-
 R/utils-pipdata.R   |  9 ++++++++-
 4 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/R/get_aux_table.R b/R/get_aux_table.R
index 93604585..5f2bae95 100644
--- a/R/get_aux_table.R
+++ b/R/get_aux_table.R
@@ -7,7 +7,17 @@
 #' @return data.frame
 #' @export
 #'
-get_aux_table <- function(data_dir, table, long_format = FALSE) {
+get_aux_table <- function(data_dir = NULL,
+                          table, long_format = FALSE) {
+
+  if (is.null(data_dir)) {
+    if (exists("lkup", inherits = TRUE)) {
+      data_dir <- get("lkup", inherits = TRUE)$data_root
+    } else {
+      cli::cli_abort("{.code data_dir} not defined and {.field lkup} not found.")
+  }
+}
+
   if (long_format && !table %in% get_valid_aux_long_format_tables()) {
     long_format <- FALSE
   }
diff --git a/R/pip_grp.R b/R/pip_grp.R
index ac9040ac..842a1e16 100644
--- a/R/pip_grp.R
+++ b/R/pip_grp.R
@@ -237,6 +237,7 @@ pip_aggregate_by <- function(df,
     fmean(w = reporting_pop, stub = FALSE)
 
   # Africas aggregation
+  if (any(c("ALL", "WLD") %in% country)) {
   afr <- df |>
     fgroup_by(africa_split,
               africa_split_code,
@@ -248,7 +249,7 @@ pip_aggregate_by <- function(df,
     frename(africa_split_code = region_code,
             africa_split      = region_name)
 
-
+  }
 
 
   if (any(c("ALL", "WLD") %in% country)) {
diff --git a/R/pip_new_lineups.R b/R/pip_new_lineups.R
index 9687901a..5153f037 100644
--- a/R/pip_new_lineups.R
+++ b/R/pip_new_lineups.R
@@ -329,7 +329,8 @@ treat_cache_and_main <- \(out, cache_file_path,
 
 validate_country_codes <- \(country, lkup) {
   cls <- lkup$aux_files$country_list$country_code |>
-    unique()
+    unique() |>
+    c("ALL")
 
   if (any(!country %in% cls)) {
     wcls <- which(!country %in% cls)
diff --git a/R/utils-pipdata.R b/R/utils-pipdata.R
index 86f82230..39def155 100644
--- a/R/utils-pipdata.R
+++ b/R/utils-pipdata.R
@@ -10,7 +10,14 @@ load_list_refy <- \(input_list, path){
   cores <- get_from_pipapienv("cores_to_use")
   inames <- input_list$inames
 
-  dl <- lapply(inames, \(x) {
+  seq_flex <- if (interactive()) {
+    cli::cli_progress_along
+  } else {
+    base::seq_along
+  }
+
+  dl <- lapply(seq_flex(inames), \(i) {
+    x <- inames[i]
     qs::qread(file = fs::path(path, x, ext = "qs"),
               nthreads = cores)
     }) |>

From 9397c8e4fd28fa2eeeefa8947b091522544638b4 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Fri, 5 Sep 2025 16:03:25 -0400
Subject: [PATCH 094/203] this is the most recent

---
 R/pip_grp.R | 48 ++++++++++++++++++++++--------------------------
 1 file changed, 22 insertions(+), 26 deletions(-)

diff --git a/R/pip_grp.R b/R/pip_grp.R
index 842a1e16..15281b76 100644
--- a/R/pip_grp.R
+++ b/R/pip_grp.R
@@ -214,7 +214,7 @@ pip_aggregate <- function(df, by = NULL, return_cols) {
 #' @keywords internal
 pip_aggregate_by <- function(df,
                              country = "ALL",
-                             return_cols) {
+                             return_cols = NULL) {
 
   all_cols <- return_cols$cols
   weighted_cols <- return_cols$weighted_average_cols
@@ -236,9 +236,17 @@ pip_aggregate_by <- function(df,
     fselect(c(weighted_cols, "reporting_pop")) |>
     fmean(w = reporting_pop, stub = FALSE)
 
-  # Africas aggregation
+  # World aggregation
   if (any(c("ALL", "WLD") %in% country)) {
-  afr <- df |>
+    # Compute world aggregates
+    wld <- compute_world_aggregates(rgn = rgn)
+  } else {
+    wld <- NULL
+  }
+
+  # Africas aggregation
+  if (any(c("ALL", "AFE", "AFW") %in% country)) {
+  rgn <- df |>
     fgroup_by(africa_split,
               africa_split_code,
               reporting_year,
@@ -247,40 +255,28 @@ pip_aggregate_by <- function(df,
     fmean(w = reporting_pop, stub = FALSE) |>
     fsubset(!is.na(africa_split_code)) |>
     frename(africa_split_code = region_code,
-            africa_split      = region_name)
-
+            africa_split      = region_name) |>
+    rowbind(rgn, fill = TRUE)
   }
 
 
-  if (any(c("ALL", "WLD") %in% country)) {
-    # Compute world aggregates
-    wld <- compute_world_aggregates(rgn = rgn)
     if (length(country) == 1) {
       if (country == "WLD") {
         # Return only world aggregate
-        out <- wld
-      } else if (country == "ALL") {
-        # Combine with other regional aggregates
-        out <- rowbind(rgn, afr, wld, fill = TRUE)
-      }
-    } else {
-      # Combine with other regional aggregates
-      out <- rowbind(rgn, afr, wld, fill = TRUE)
-      # Return selection only
-      if (!"ALL" %in% country) {
-        out <- out[region_code %in% country, ]
+        wld[, pop_in_poverty := round(headcount * reporting_pop, 0)]
+        return(wld)
       }
     }
-  } else {
-    # Return only selected regions
-    out <- rowbind(rgn, afr, fill = TRUE) |>
-      _[region_code %in% country, ]
 
-  }
+  # Combine with other regional aggregates
+    out <- rowbind(rgn, wld, fill = TRUE)
+    out[, pop_in_poverty := round(headcount * reporting_pop, 0)]
 
-  # Compute population living in poverty
-  out[, pop_in_poverty := round(headcount * reporting_pop, 0)]
+    if ("ALL" %in% country) {
+      return(out)
+    }
 
+    out[region_code %in% country, ]
 }
 
 
From 0c6b838d9fdd1949dc96f4caec0db65b76c0970b Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Sat, 6 Sep 2025 10:14:55 -0400
Subject: [PATCH 095/203] fix create_countries_vctr addint lkup

---
 R/create_countries_vctr.R                   |  7 ++++---
 R/pip_grp_logic.R                           |  7 ++++---
 R/pip_old.R                                 |  3 +--
 R/pip_old_lineups.R                         |  3 +--
 tests/testthat/test-create_countries_vctr.R | 21 +++++++--------------
 5 files changed, 17 insertions(+), 24 deletions(-)

diff --git a/R/create_countries_vctr.R b/R/create_countries_vctr.R
index 230118c4..72eb5c8e 100644
--- a/R/create_countries_vctr.R
+++ b/R/create_countries_vctr.R
@@ -6,14 +6,15 @@
 #'
 #' @inheritParams pip
 #' @param valid_years list: Valid years information provided through lkup object
-#' @param aux_files list: List of auxiliary tables provided through lkup object
+#' @param lkup lkup object
 #'
 #' @return a list of vectors with countries and regions code to be used in
 #'   `pip()` and `pip_grp()`
 create_countries_vctr <- function(country,
                                   year,
-                                  valid_years,
-                                  aux_files) {
+                                  lkup) {
+  valid_years <-   lkup$valid_years
+  aux_files   <-   lkup$aux_files
 
   # STEP 1: Setup ----
   ## init Return list ----
diff --git a/R/pip_grp_logic.R b/R/pip_grp_logic.R
index 497dc907..216bb0f1 100644
--- a/R/pip_grp_logic.R
+++ b/R/pip_grp_logic.R
@@ -44,8 +44,7 @@ pip_grp_logic <- function(country         = "ALL",
     create_countries_vctr(
       country         =  country,
       year            =  year,
-      valid_years     =  lkup$valid_years,
-      aux_files       =  lkup$aux_files
+      lkup            = lkup
     )
 
   # use the same names as before to avoid inconsistencies
@@ -84,7 +83,9 @@ pip_grp_logic <- function(country         = "ALL",
       lkup            = lkup
       )
     # For now just rowbinding two dataframes, but we would need to use it more smartly in the future
-    fg_pip_master <- collapse::rowbind(fg_pip_master)
+    fg_pip_master <- collapse::rowbind(fg_pip_master,
+                                       # THis should not be necessary
+                                       fill = TRUE)
 
     if (!data.table::is.data.table(fg_pip_master)) {
       setDT(fg_pip_master)
diff --git a/R/pip_old.R b/R/pip_old.R
index aa9c28fd..ae62bb22 100644
--- a/R/pip_old.R
+++ b/R/pip_old.R
@@ -100,8 +100,7 @@ pip_old <- function(country         = "ALL",
     create_countries_vctr(
       country         =  country,
       year            =  year,
-      valid_years     =  lkup$valid_years,
-      aux_files       =  lkup$aux_files
+      lkup            = lkup
     )
   # lcv$est_ctrs has all the country_code that we are interested in
 
diff --git a/R/pip_old_lineups.R b/R/pip_old_lineups.R
index 6424d6c5..fc01620b 100644
--- a/R/pip_old_lineups.R
+++ b/R/pip_old_lineups.R
@@ -102,8 +102,7 @@ pip_old_lineups <- function(country         = "ALL",
     create_countries_vctr(
       country         =  country,
       year            =  year,
-      valid_years     =  lkup$valid_years,
-      aux_files       =  lkup$aux_files
+      lkup            = lkup
     )
   # lcv$est_ctrs has all the country_code that we are interested in
 
diff --git a/tests/testthat/test-create_countries_vctr.R b/tests/testthat/test-create_countries_vctr.R
index fae4b133..2c4dbc7b 100644
--- a/tests/testthat/test-create_countries_vctr.R
+++ b/tests/testthat/test-create_countries_vctr.R
@@ -457,8 +457,7 @@ test_that("create_vector_countries output the expected object", {
   year = "2010"
   out <- create_countries_vctr(country = country,
                                year    = year,
-                               valid_years = valid_years,
-                               aux_files = aux_files)
+                               lkup            = lkup)
   expect_true(is.list(out))
   expect_equal(sort(names(out)),
                sort(c("ctr_off_reg",
@@ -479,8 +478,7 @@ test_that("create_vector_countries works for countries selection", {
   year = "ALL"
   out <- create_countries_vctr(country = country,
                                year    = year,
-                               valid_years = valid_years,
-                               aux_files = aux_files)
+                               lkup            = lkup)
 
   # Selects all countries with survey data when country="ALL"
   expect_equal(sort(out$est_ctrs), sort(aux_files$countries$country_code))
@@ -492,8 +490,7 @@ test_that("create_vector_countries Returns correct results when country = ALL",
   year = "2010"
   out <- create_countries_vctr(country = country,
                                year    = year,
-                               valid_years = valid_years,
-                               aux_files = aux_files)
+                               lkup            = lkup)
   # Correct official regions
   expect_equal(out$user_off_reg,
                c("ALL",
@@ -514,8 +511,7 @@ test_that("create_vector_countries Returns correct results when country = WLD",
   year = "2010"
   out <- create_countries_vctr(country = country,
                                year    = year,
-                               valid_years = valid_years,
-                               aux_files = aux_files)
+                               lkup            = lkup)
   # Correct official regions
   expect_equal(out$user_off_reg,
                c("ALL",
@@ -534,8 +530,7 @@ test_that("create_vector_countries Returns correct results when country = altern
   year = "ALL"
   out <- create_countries_vctr(country = country,
                                year    = year,
-                               valid_years = valid_years,
-                               aux_files = aux_files)
+                               lkup            = lkup)
 
   # Returns no official region
   expect_true(is.null(out$user_off_reg))
@@ -554,8 +549,7 @@ test_that("create_vector_countries returns correct results when country = offica
   year = "ALL"
   out <- create_countries_vctr(country = country,
                                year    = year,
-                               valid_years = valid_years,
-                               aux_files = aux_files)
+                               lkup            = lkup)
 
   # Returns no official region
   expect_equal(out$user_off_reg, country)
@@ -574,8 +568,7 @@ test_that("create_vector_countries returns correct results when country = aggreg
   year = "ALL"
   out <- create_countries_vctr(country = country,
                                year    = year,
-                               valid_years = valid_years,
-                               aux_files = aux_files)
+                               lkup            = lkup)
 
   # Returns no official region
   expect_equal(out$user_off_reg, off_country)

From f3bd966d44eecf7c51448c0167baba9b135cca28 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Sat, 6 Sep 2025 16:42:22 -0400
Subject: [PATCH 096/203] fix test

---
 tests/testthat/test-create_countries_vctr.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/testthat/test-create_countries_vctr.R b/tests/testthat/test-create_countries_vctr.R
index 2c4dbc7b..fb88a02e 100644
--- a/tests/testthat/test-create_countries_vctr.R
+++ b/tests/testthat/test-create_countries_vctr.R
@@ -568,7 +568,7 @@ test_that("create_vector_countries returns correct results when country = aggreg
   year = "ALL"
   out <- create_countries_vctr(country = country,
                                year    = year,
-                               lkup            = lkup)
+                               lkup    = lkup)
 
   # Returns no official region
   expect_equal(out$user_off_reg, off_country)

From 6d15ac19d896225cf7a6e1bbd27caa7afd061a4a Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Sat, 6 Sep 2025 17:19:47 -0400
Subject: [PATCH 097/203] remporal fix in number of aggregates to include

---
 R/create_countries_vctr.R | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/R/create_countries_vctr.R b/R/create_countries_vctr.R
index 72eb5c8e..4cb985f1 100644
--- a/R/create_countries_vctr.R
+++ b/R/create_countries_vctr.R
@@ -54,7 +54,12 @@ create_countries_vctr <- function(country,
   }
   # STEP 2: Identify regions ----
   ## All available aggregates ----
-  aggs      <- aux_files$regions  ## all aggregates
+  aggs      <- aux_files$regions  |> ## all aggregates
+    # temporal filter
+    _[grouping_type %in% c("region", "regionpcn", "africa_split", "word")]
+
+
+
   ## Official grouping type ----
   off_gt <- c("region", "world") #c("region")
   ## Official valid region codes ----

From 6435d70ad1cb6fe8cc8b17101d300aba5f3ad796 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Sat, 6 Sep 2025 18:00:03 -0400
Subject: [PATCH 098/203] include region and regionpcn in old way

---
 R/fg_pip_old.R    |  3 ++-
 R/pip_grp_logic.R | 12 ++++++------
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/R/fg_pip_old.R b/R/fg_pip_old.R
index b5fab88b..2da40213 100644
--- a/R/fg_pip_old.R
+++ b/R/fg_pip_old.R
@@ -52,7 +52,8 @@ fg_pip_old <- function(country,
 
   # Return empty dataframe if no metadata is found
   if (nrow(metadata) == 0) {
-    return(list(main_data = pipapi::empty_response_fg, data_in_cache = data_present_in_master))
+    return(list(main_data = pipapi::empty_response_fg,
+                data_in_cache = data_present_in_master))
   }
 
   unique_survey_files <- unique(metadata$data_interpolation_id)
diff --git a/R/pip_grp_logic.R b/R/pip_grp_logic.R
index 216bb0f1..e6f54e9b 100644
--- a/R/pip_grp_logic.R
+++ b/R/pip_grp_logic.R
@@ -83,13 +83,13 @@ pip_grp_logic <- function(country         = "ALL",
       lkup            = lkup
       )
     # For now just rowbinding two dataframes, but we would need to use it more smartly in the future
-    fg_pip_master <- collapse::rowbind(fg_pip_master,
-                                       # THis should not be necessary
-                                       fill = TRUE)
+    fg_pip_master <- rowbind(fg_pip_master,
+                           # THis should not be necessary
+                          fill = TRUE) |>
+      setDT()
 
-    if (!data.table::is.data.table(fg_pip_master)) {
-      setDT(fg_pip_master)
-    }
+    fg_pip_master[is.na(mean),
+                  mean := predicted_mean_ppp]
 
     add_vars_out_of_pipeline(fg_pip_master, fill_gaps = TRUE, lkup = lkup)
 

From b6d24cf610235fad266c085cd2de53f298f46053 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Sat, 6 Sep 2025 18:13:55 -0400
Subject: [PATCH 099/203] aggregation working for old, new, and previous... not
 pretty but works

---
 R/pip_grp.R | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/R/pip_grp.R b/R/pip_grp.R
index 15281b76..df0797c2 100644
--- a/R/pip_grp.R
+++ b/R/pip_grp.R
@@ -258,6 +258,20 @@ pip_aggregate_by <- function(df,
             africa_split      = region_name) |>
     rowbind(rgn, fill = TRUE)
   }
+  # Vintage aggregation
+  if ("ALL" %in% country & "regionpcn_code" %in% names(df)) {
+  rgn <- df |>
+    fgroup_by(regionpcn,
+              regionpcn_code,
+              reporting_year,
+              poverty_line) |>
+    fselect(c(weighted_cols, "reporting_pop")) |>
+    fmean(w = reporting_pop, stub = FALSE) |>
+    fsubset(!is.na(regionpcn_code)) |>
+    frename(regionpcn_code = region_code,
+            regionpcn      = region_name) |>
+    rowbind(rgn, fill = TRUE)
+  }
 
 
     if (length(country) == 1) {

From 82804149d96d99e01169676a02519e50a33b5153 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Mon, 8 Sep 2025 18:20:53 -0400
Subject: [PATCH 100/203] first version of cumsum instead of mean

---
 NAMESPACE                    |   1 +
 R/compute_fgt_new.R          | 133 ++++++++++++++++++++++++++++++++++-
 R/fg_pip.R                   |  14 ++--
 R/utils-pipdata.R            |  16 +++--
 man/create_countries_vctr.Rd |   6 +-
 man/fgt_watts_cumsum.Rd      |  21 ++++++
 man/get_aux_table.Rd         |   2 +-
 man/load_list_refy.Rd        |   2 +-
 man/pip_aggregate_by.Rd      |   4 +-
 9 files changed, 176 insertions(+), 23 deletions(-)
 create mode 100644 man/fgt_watts_cumsum.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 96b78000..e9c62630 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -11,6 +11,7 @@ export(citation_from_version)
 export(create_etag_header)
 export(create_return_cols)
 export(create_versioned_lkups)
+export(fgt_watts_cumsum)
 export(fillin_list)
 export(get_aux_table)
 export(get_aux_table_ui)
diff --git a/R/compute_fgt_new.R b/R/compute_fgt_new.R
index 2d09a1e8..8b4f0b0c 100644
--- a/R/compute_fgt_new.R
+++ b/R/compute_fgt_new.R
@@ -1,3 +1,5 @@
+# OLD APPROACH WITH MEAN --------------
+
 # Efficient FGT calculation for a data.table and vector of poverty lines
 #' Title
 #'
@@ -145,7 +147,9 @@ compute_fgt <- function(w, wt, povlines) {
 #' @rdname map_fgt
 #' @keywords  internal
 DT_fgt_by_rl <- \(x, y, nx, povline) {
-  DT_fgt <- lapply(names(y), \(rl) {
+  uni_rl <- names(y) |>
+    unique()
+  DT_fgt <- lapply(uni_rl, \(rl) {
 
     idx <- y[[rl]]
     w   <- x[idx, welfare]
@@ -343,5 +347,132 @@ pov_from_DT <- function(DT, povline, g, cores = 1) {
 
 
+# NEW ARPPOACH USING CUMSUM ------------------
+
+#' compute fgt and watts using cumulative welfare rather than means
+#'
+#' @param y numeric welfare (sorted ascending within the subgroup)
+#' @param w numeric weights (same order as y)
+#' @param lines numeric vector of poverty lines
+#'
+#' @return Returns a data.table with columns: line, fgt0,fgt1,fgt2,watts
+#' @export
+fgt_watts_cumsum <- function(y, w, lines) {
+  # types
+
+  y     <- as.double(y)
+  w     <- as.double(w)
+  lines <- as.double(lines)
+  n <- length(y)
+
+  if (n == 0L) {
+    return(data.table(line = lines,
+                      fgt0 = 0, fgt1 = 0, fgt2 = 0, watts = 0))
+  }
+
+  # total weight (collapse: fsum is very fast)
+  W <- fsum(w)
+
+  # cumulative sums (collapse: fcumsum is multithreaded-aware, very fast)
+  cw    <- fcumsum(w)
+  cwy   <- fcumsum(w * y)
+  cwy2  <- fcumsum(w * (y * y))
+
+  # Watts needs log(y) with y>0; clamp tiny positives for safety
+  y_pos <- pmax(y, 1e-12)
+  cwlog <- fcumsum(w * log(y_pos))
+
+  # index of last obs <= line for each z (0..n)
+  i <- findInterval(lines, y)
+
+  take <- function(cs) {
+    out <- cs[pmax.int(i, 0L)]
+    out[i == 0L] <- 0
+    out
+  }
+
+  cw_i    <- take(cw)
+  cwy_i   <- take(cwy)
+  cwy2_i  <- take(cwy2)
+  cwlog_i <- take(cwlog)
+
+  z    <- lines
+  z2   <- z * z
+  z_s  <- pmax(z, 1e-12)
+  z2_s <- pmax(z2, 1e-24)
+
+  data.table(
+    line  = lines,
+    fgt0  = cw_i / W,
+    fgt1  = (z * cw_i - cwy_i) / (z_s * W),
+    fgt2  = (z2 * cw_i - 2 * z * cwy_i + cwy2_i) / (z2_s * W),
+    watts = (log(z_s) * cw_i - cwlog_i) / W
+  )
+}
+
+
+
+# tl: list of data.tables, each with columns id, reporting_level, welfare, weight
+# Assumes: within each element, data are sorted by welfare within each reporting_level
+fgt_watts_list <- function(tl, lines) {
+  rbindlist(lapply(tl, function(dt) {
+    # compute per (id, reporting_level)
+    dt[, fgt_watts_cumsum(welfare, weight, lines),
+       by = .(id, reporting_level)]
+  }),
+  fill = TRUE)
+}
+
+
+# DT: one big data.table with id, reporting_level, welfare, weight
+# Assumes: within each (id, reporting_level), rows are sorted by welfare
+
+
+# DT <- rbindlist(lt)
+
+
+fgt_watts_dt <- function(DT, lines) {
+  DT[, fgt_watts_cumsum(welfare, weight, lines),
+     by = .(id, reporting_level)]
+}
+
+
+# lines <- c(1:100)
+# rlt <- fgt_watts_list(lt, lines)
+# rdt <- fgt_watts_dt(DT, lines)
+#
+# waldo::compare(rlt, rdt)
+#
+#
+#
+#
+# bench <- microbenchmark::microbenchmark(
+#   times = 50,
+#   lt = {
+#     fgt_watts_list(lt, lines)
+#   },
+#   dt = {
+#     fgt_watts_dt(DT, lines)
+#
+#   }
+# )
+# if (requireNamespace("highcharter")) {
+#   hc_dt <- highcharter::data_to_boxplot(bench,
+#                                         time,
+#                                         expr,
+#                                         add_outliers = FALSE,
+#                                         name = "Time in milliseconds")
+#
+#   highcharter::highchart() |>
+#     highcharter::hc_xAxis(type = "category") |>
+#     highcharter::hc_chart(inverted=TRUE) |>
+#     highcharter::hc_add_series_list(hc_dt)
+#
+# } else {
+#   boxplot(bench, outline = FALSE)
+# }
+#
+
+
 
 
diff --git a/R/fg_pip.R b/R/fg_pip.R
index 8d46dc31..10d993e1 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -61,18 +61,17 @@ fg_pip <- function(country,
   full_list <- create_full_list(metadata = metadata)
 
   lt <-
-    load_list_refy(input_list = full_list,
-                   path       = fs::path(data_dir, "lineup_data"))
+    load_list_refy(input_list = full_list)
 
   # lt <- lapply(lt, \(x) {
   #                  add_attributes_as_columns_vectorized(x)
   #              })
 
   # Extract some attributes
-  lt_att <- get_lt_attr(lt)
+  # lt_att <- get_lt_attr(lt)
 
   # get rows indices
-  l_rl_rows <- get_rl_rows(lt_att)
+  # l_rl_rows <- get_rl_rows(lt_att)
 
 
   # DT <- map_lt_to_dt(lt, l_rl_rows)
@@ -252,10 +251,9 @@ fg_assign_nas_values_to_dup_cols <- function(df,
 #' @return data.table
 create_full_list <- function(metadata) {
 
-  x <- metadata[, .(country_code, reporting_year)
-                ][,
-                  inames := paste0(country_code, "_", reporting_year)]
-  x
+  metadata[, path] |>
+    funique()
+
 }
 
 
diff --git a/R/utils-pipdata.R b/R/utils-pipdata.R
index 39def155..a30d1160 100644
--- a/R/utils-pipdata.R
+++ b/R/utils-pipdata.R
@@ -5,10 +5,11 @@
 #'
 #' @return character vector
 #' @keywords internal
-load_list_refy <- \(input_list, path){
+load_list_refy <- \(input_list){
 
   cores <- get_from_pipapienv("cores_to_use")
-  inames <- input_list$inames
+  inames <- fs::path_file(input_list) |>
+    fs::path_ext_remove()
 
   seq_flex <- if (interactive()) {
     cli::cli_progress_along
@@ -16,10 +17,11 @@ load_list_refy <- \(input_list, path){
     base::seq_along
   }
 
-  dl <- lapply(seq_flex(inames), \(i) {
-    x <- inames[i]
-    qs::qread(file = fs::path(path, x, ext = "qs"),
-              nthreads = cores)
+  dl <- lapply(seq_flex(input_list), \(x) {
+    qfile <- input_list[x]
+    qname <- inames[x]
+    qs::qread(file = qfile, nthreads = cores) |>
+      _[, id := qname]
     }) |>
     setNames(inames)
 
@@ -27,6 +29,8 @@ load_list_refy <- \(input_list, path){
 }
 
 
+
+
 #' transform input list
 #'
 #' @inheritParams load_list_refy
diff --git a/man/create_countries_vctr.Rd b/man/create_countries_vctr.Rd
index 87a18e80..c62f8276 100644
--- a/man/create_countries_vctr.Rd
+++ b/man/create_countries_vctr.Rd
@@ -4,16 +4,16 @@
 \alias{create_countries_vctr}
 \title{Create countries vectors}
 \usage{
-create_countries_vctr(country, year, valid_years, aux_files)
+create_countries_vctr(country, year, lkup)
 }
 \arguments{
 \item{country}{character: Country ISO 3 codes}
 
 \item{year}{integer: Reporting year}
 
-\item{valid_years}{list: Valid years information provided through lkup object}
+\item{lkup}{lkup object}
 
-\item{aux_files}{list: List of auxiliary tables provided through lkup object}
+\item{valid_years}{list: Valid years information provided through lkup object}
 }
 \value{
 a list of vectors with countries and regions code to be used in
diff --git a/man/fgt_watts_cumsum.Rd b/man/fgt_watts_cumsum.Rd
new file mode 100644
index 00000000..74d7ecdf
--- /dev/null
+++ b/man/fgt_watts_cumsum.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/compute_fgt_new.R
+\name{fgt_watts_cumsum}
+\alias{fgt_watts_cumsum}
+\title{compute fgt and watts using cumulative welfare rather than means}
+\usage{
+fgt_watts_cumsum(y, w, lines)
+}
+\arguments{
+\item{y}{numeric welfare (sorted ascending within the subgroup)}
+
+\item{w}{numeric weights (same order as y)}
+
+\item{lines}{numeric vector of poverty lines}
+}
+\value{
+Returns a data.table with columns: line, fgt0,fgt1,fgt2,watts
+}
+\description{
+compute fgt and watts using cumulative welfare rather than means
+}
diff --git a/man/get_aux_table.Rd b/man/get_aux_table.Rd
index 66a90941..f11a9261 100644
--- a/man/get_aux_table.Rd
+++ b/man/get_aux_table.Rd
@@ -4,7 +4,7 @@
 \alias{get_aux_table}
 \title{Return specified auxiliary data}
 \usage{
-get_aux_table(data_dir, table, long_format = FALSE)
+get_aux_table(data_dir = NULL, table, long_format = FALSE)
 }
 \arguments{
 \item{data_dir}{character: Data directory}
diff --git a/man/load_list_refy.Rd b/man/load_list_refy.Rd
index 59a37d7c..b1533a4f 100644
--- a/man/load_list_refy.Rd
+++ b/man/load_list_refy.Rd
@@ -4,7 +4,7 @@
 \alias{load_list_refy}
 \title{load refy list}
 \usage{
-load_list_refy(input_list, path)
+load_list_refy(input_list)
 }
 \arguments{
 \item{input_list}{list. output from \link{create_full_list}}
diff --git a/man/pip_aggregate_by.Rd b/man/pip_aggregate_by.Rd
index 98bf18f7..642567e7 100644
--- a/man/pip_aggregate_by.Rd
+++ b/man/pip_aggregate_by.Rd
@@ -4,13 +4,11 @@
 \alias{pip_aggregate_by}
 \title{Aggregate by predefined groups}
 \usage{
-pip_aggregate_by(df, group_lkup, country = "ALL", return_cols)
+pip_aggregate_by(df, country = "ALL", return_cols = NULL)
 }
 \arguments{
 \item{df}{data.frame: Response from \code{fg_pip_old()} or \code{rg_pip()}.}
 
-\item{group_lkup}{data.frame: Group lkup table (pop_region)}
-
 \item{country}{character: Selected countries / regions}
 
 \item{return_cols}{list: lkup$return_cols$pip_grp object. Controls returned

From 8d736fc8ec36496d3b572caca0393f152ecd0034 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Wed, 10 Sep 2025 11:43:43 -0400
Subject: [PATCH 101/203] small update

---
 R/compute_fgt_new.R | 76 +++++++++++++++++++++++++--------------------
 1 file changed, 42 insertions(+), 34 deletions(-)

diff --git a/R/compute_fgt_new.R b/R/compute_fgt_new.R
index 8b4f0b0c..75a91693 100644
--- a/R/compute_fgt_new.R
+++ b/R/compute_fgt_new.R
@@ -357,57 +357,65 @@ pov_from_DT <- function(DT, povline, g, cores = 1) {
 #'
 #' @return Returns a data.table with columns: line, fgt0,fgt1,fgt2,watts
 #' @export
-fgt_watts_cumsum <- function(y, w, lines) {
-  # types
+fgt_watts_cumsum <- function(y, w, lines,
+                             check_sorted = FALSE) {
+  # 1) (Optional) type normalization: only coerce if needed
+  if (!is.double(y))     y     <- as.double(y)
+  if (!is.double(w))     w     <- as.double(w)
+  if (!is.double(lines)) lines <- as.double(lines)
+
+  # 2) Optional sort check (fail fast during validation)
+  if (check_sorted) {
+    if (is.unsorted(y, strictly = FALSE)) {
+      cli::cli_abort("{.code y} must be sorted ascending within the subgroup.
+                     Set {.code check_sorted=FALSE} to skip.")
+      }
+  }
 
-  y     <- as.double(y)
-  w     <- as.double(w)
-  lines <- as.double(lines)
+  # 3) Handle empty group quickly
   n <- length(y)
-
   if (n == 0L) {
     return(data.table(line = lines,
                       fgt0 = 0, fgt1 = 0, fgt2 = 0, watts = 0))
   }
 
-  # total weight (collapse: fsum is very fast)
-  W <- fsum(w)
-
-  # cumulative sums (collapse: fcumsum is multithreaded-aware, very fast)
-  cw    <- fcumsum(w)
-  cwy   <- fcumsum(w * y)
-  cwy2  <- fcumsum(w * (y * y))
+  # 4) Group totals and prefix sums (collapse is very fast)
+  W     <- fsum(w)             # total weight
+  cw    <- fcumsum(w)          # cum pop
+  cwy   <- fcumsum(w * y)      # cum welfare
+  cwy2  <- fcumsum(w * (y*y))  # cum  pop by welfare sqr
 
-  # Watts needs log(y) with y>0; clamp tiny positives for safety
+  # Watts: use log(y) (y must be positive; clamp tiny to avoid -Inf)
   y_pos <- pmax(y, 1e-12)
-  cwlog <- fcumsum(w * log(y_pos))
+  cwlog <- fcumsum(w * log(y_pos))  # Σ w log y
 
-  # index of last obs <= line for each z (0..n)
+  # 5) For each z in `lines`, i = number of y's ≤ z (0..n)
+  #    findInterval is C-coded; for increasing y, i = count(y ≤ z)
   i <- findInterval(lines, y)
 
-  take <- function(cs) {
-    out <- cs[pmax.int(i, 0L)]
-    out[i == 0L] <- 0
-    out
-  }
+  # 6) Pull the needed prefixes at indices i; use the "prefix trick" for i=0
+  #    so that when no poor, sum is 0: c(0, cs)[i + 1].
+  take <- function(cs) c(0, cs)[i + 1L]
 
-  cw_i    <- take(cw)
-  cwy_i   <- take(cwy)
-  cwy2_i  <- take(cwy2)
-  cwlog_i <- take(cwlog)
+  cw_i    <- take(cw)     # Σ_{y≤z} w
+  cwy_i   <- take(cwy)    # Σ_{y≤z} w y
+  cwy2_i  <- take(cwy2)   # Σ_{y≤z} w y^2
+  cwlog_i <- take(cwlog)  # Σ_{y≤z} w log y
 
+  # 7) Assemble the indices, with small clamps for numerical safety
   z    <- lines
   z2   <- z * z
-  z_s  <- pmax(z, 1e-12)
-  z2_s <- pmax(z2, 1e-24)
+  z_s  <- pmax(z,  1e-12)   # avoid division by 0 in FGT1, Watts
+  z2_s <- pmax(z2, 1e-24)   # avoid division by 0 in FGT2
 
-  data.table(
-    line  = lines,
-    fgt0  = cw_i / W,
-    fgt1  = (z * cw_i - cwy_i) / (z_s * W),
-    fgt2  = (z2 * cw_i - 2 * z * cwy_i + cwy2_i) / (z2_s * W),
-    watts = (log(z_s) * cw_i - cwlog_i) / W
-  )
+  # 8) Closed-form formulas from the algebra above
+  fgt0  <- cw_i / W
+  fgt1  <- (z * cw_i - cwy_i) / (z_s * W)
+  fgt2  <- (z2 * cw_i - 2 * z * cwy_i + cwy2_i) / (z2_s * W)
+  watts <- (log(z_s) * cw_i - cwlog_i) / W
+
+  # 9) Return tidy result
+  data.table(line = lines, fgt0 = fgt0, fgt1 = fgt1, fgt2 = fgt2, watts = watts)
 }
 
 
From fadce923ba15372895782407ac0c470bb3a73046 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Wed, 10 Sep 2025 11:47:07 -0400
Subject: [PATCH 102/203] replace load_list_refy to read from fst files

---
 R/utils-pipdata.R | 33 ++++++++++++++++++---------------
 1 file changed, 18 insertions(+), 15 deletions(-)

diff --git a/R/utils-pipdata.R b/R/utils-pipdata.R
index a30d1160..dc5b823d 100644
--- a/R/utils-pipdata.R
+++ b/R/utils-pipdata.R
@@ -7,25 +7,28 @@
 #' @keywords internal
 load_list_refy <- \(input_list){
 
-  cores <- get_from_pipapienv("cores_to_use")
-  inames <- fs::path_file(input_list) |>
+  id_names <- input_list |>
+    fs::path_file() |>
     fs::path_ext_remove()
 
-  seq_flex <- if (interactive()) {
-    cli::cli_progress_along
-  } else {
-    base::seq_along
-  }
+    seq_flex <- if (interactive()) {
+      cli::cli_progress_along
+    } else {
+      base::seq_along
+    }
+
 
-  dl <- lapply(seq_flex(input_list), \(x) {
-    qfile <- input_list[x]
-    qname <- inames[x]
-    qs::qread(file = qfile, nthreads = cores) |>
-      _[, id := qname]
-    }) |>
-    setNames(inames)
+  lfst <- lapply(seq_flex(input_list),
+                 \(i) {
+                   x <- lup_files[i]
+                   idn <- fs::path_file(x) |>
+                     fs::path_ext_remove()
+                   fst::read_fst(x, as.data.table = TRUE) |>
+                     _[, id := idn]
+                 }) |>
+    setNames(id_names)
 
-  dl
+  lfst
 }
 
 
From 968bbf3a0d6708c38799b08be38dce0801efe070 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Wed, 10 Sep 2025 12:07:05 -0400
Subject: [PATCH 103/203] remove first attempt of cumcum method

---
 R/compute_fgt_new.R | 135 --------------------------------------------
 1 file changed, 135 deletions(-)

diff --git a/R/compute_fgt_new.R b/R/compute_fgt_new.R
index 75a91693..2923e530 100644
--- a/R/compute_fgt_new.R
+++ b/R/compute_fgt_new.R
@@ -349,138 +349,3 @@ pov_from_DT <- function(DT, povline, g, cores = 1) {
 
 # NEW ARPPOACH USING CUMSUM ------------------
 
-#' compute fgt and watts using cumulative welfare rather than means
-#'
-#' @param y numeric welfare (sorted ascending within the subgroup)
-#' @param w numeric weights (same order as y)
-#' @param lines numeric vector of poverty lines
-#'
-#' @return Returns a data.table with columns: line, fgt0,fgt1,fgt2,watts
-#' @export
-fgt_watts_cumsum <- function(y, w, lines,
-                             check_sorted = FALSE) {
-  # 1) (Optional) type normalization: only coerce if needed
-  if (!is.double(y))     y     <- as.double(y)
-  if (!is.double(w))     w     <- as.double(w)
-  if (!is.double(lines)) lines <- as.double(lines)
-
-  # 2) Optional sort check (fail fast during validation)
-  if (check_sorted) {
-    if (is.unsorted(y, strictly = FALSE)) {
-      cli::cli_abort("{.code y} must be sorted ascending within the subgroup.
-                     Set {.code check_sorted=FALSE} to skip.")
-      }
-  }
-
-  # 3) Handle empty group quickly
-  n <- length(y)
-  if (n == 0L) {
-    return(data.table(line = lines,
-                      fgt0 = 0, fgt1 = 0, fgt2 = 0, watts = 0))
-  }
-
-  # 4) Group totals and prefix sums (collapse is very fast)
-  W     <- fsum(w)             # total weight
-  cw    <- fcumsum(w)          # cum pop
-  cwy   <- fcumsum(w * y)      # cum welfare
-  cwy2  <- fcumsum(w * (y*y))  # cum  pop by welfare sqr
-
-  # Watts: use log(y) (y must be positive; clamp tiny to avoid -Inf)
-  y_pos <- pmax(y, 1e-12)
-  cwlog <- fcumsum(w * log(y_pos))  # Σ w log y
-
-  # 5) For each z in `lines`, i = number of y's ≤ z (0..n)
-  #    findInterval is C-coded; for increasing y, i = count(y ≤ z)
-  i <- findInterval(lines, y)
-
-  # 6) Pull the needed prefixes at indices i; use the "prefix trick" for i=0
-  #    so that when no poor, sum is 0: c(0, cs)[i + 1].
-  take <- function(cs) c(0, cs)[i + 1L]
-
-  cw_i    <- take(cw)     # Σ_{y≤z} w
-  cwy_i   <- take(cwy)    # Σ_{y≤z} w y
-  cwy2_i  <- take(cwy2)   # Σ_{y≤z} w y^2
-  cwlog_i <- take(cwlog)  # Σ_{y≤z} w log y
-
-  # 7) Assemble the indices, with small clamps for numerical safety
-  z    <- lines
-  z2   <- z * z
-  z_s  <- pmax(z,  1e-12)   # avoid division by 0 in FGT1, Watts
-  z2_s <- pmax(z2, 1e-24)   # avoid division by 0 in FGT2
-
-  # 8) Closed-form formulas from the algebra above
-  fgt0  <- cw_i / W
-  fgt1  <- (z * cw_i - cwy_i) / (z_s * W)
-  fgt2  <- (z2 * cw_i - 2 * z * cwy_i + cwy2_i) / (z2_s * W)
-  watts <- (log(z_s) * cw_i - cwlog_i) / W
-
-  # 9) Return tidy result
-  data.table(line = lines, fgt0 = fgt0, fgt1 = fgt1, fgt2 = fgt2, watts = watts)
-}
-
-
-
-# tl: list of data.tables, each with columns id, reporting_level, welfare, weight
-# Assumes: within each element, data are sorted by welfare within each reporting_level
-fgt_watts_list <- function(tl, lines) {
-  rbindlist(lapply(tl, function(dt) {
-    # compute per (id, reporting_level)
-    dt[, fgt_watts_cumsum(welfare, weight, lines),
-       by = .(id, reporting_level)]
-  }),
-  fill = TRUE)
-}
-
-
-# DT: one big data.table with id, reporting_level, welfare, weight
-# Assumes: within each (id, reporting_level), rows are sorted by welfare
-
-
-# DT <- rbindlist(lt)
-
-
-fgt_watts_dt <- function(DT, lines) {
-  DT[, fgt_watts_cumsum(welfare, weight, lines),
-     by = .(id, reporting_level)]
-}
-
-
-# lines <- c(1:100)
-# rlt <- fgt_watts_list(lt, lines)
-# rdt <- fgt_watts_dt(DT, lines)
-#
-# waldo::compare(rlt, rdt)
-#
-#
-#
-#
-# bench <- microbenchmark::microbenchmark(
-#   times = 50,
-#   lt = {
-#     fgt_watts_list(lt, lines)
-#   },
-#   dt = {
-#     fgt_watts_dt(DT, lines)
-#
-#   }
-# )
-# if (requireNamespace("highcharter")) {
-#   hc_dt <- highcharter::data_to_boxplot(bench,
-#                                         time,
-#                                         expr,
-#                                         add_outliers = FALSE,
-#                                         name = "Time in milliseconds")
-#
-#   highcharter::highchart() |>
-#     highcharter::hc_xAxis(type = "category") |>
-#     highcharter::hc_chart(inverted=TRUE) |>
-#     highcharter::hc_add_series_list(hc_dt)
-#
-# } else {
-#   boxplot(bench, outline = FALSE)
-# }
-#
-
-
-
-

From fd0dc36b579bdbf3a90323fe71d4c071a6ec7510 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Wed, 10 Sep 2025 12:12:08 -0400
Subject: [PATCH 104/203] new file for new method of FGT using cumsum

---
 R/compute_fgt_new.R |  4 ----
 R/fg_pip.R          |  2 +-
 R/fgt_cumsum.R      | 30 ++++++++++++++++++++++++++++++
 3 files changed, 31 insertions(+), 5 deletions(-)
 create mode 100644 R/fgt_cumsum.R

diff --git a/R/compute_fgt_new.R b/R/compute_fgt_new.R
index 2923e530..79cf0ecf 100644
--- a/R/compute_fgt_new.R
+++ b/R/compute_fgt_new.R
@@ -345,7 +345,3 @@ pov_from_DT <- function(DT, povline, g, cores = 1) {
 # }
 
 
-
-
-# NEW ARPPOACH USING CUMSUM ------------------
-
diff --git a/R/fg_pip.R b/R/fg_pip.R
index 10d993e1..da323540 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -60,7 +60,7 @@ fg_pip <- function(country,
 
   full_list <- create_full_list(metadata = metadata)
 
-  lt <-
+  lfst <-
     load_list_refy(input_list = full_list)
 
   # lt <- lapply(lt, \(x) {
diff --git a/R/fgt_cumsum.R b/R/fgt_cumsum.R
new file mode 100644
index 00000000..477b327e
--- /dev/null
+++ b/R/fgt_cumsum.R
@@ -0,0 +1,30 @@
+#' format the lfst list to usable data to estimate poverty
+#'
+#' @param lfst list from load_list_refy()
+#'
+#' @return list with DT and g (GRP object)
+#' @keywords internal
+format_lfst <- \(lfst) {
+
+  DT <- rbindlist(lfst, fill = TRUE)
+
+  # Convert to factors (is it faster?)
+  DT[, names(.SD) := lapply(.SD, qF),
+     .SDcols = c("id", "reporting_level")]
+
+  # fix
+  # DT[index == 0,
+  #    names(.SD) := 0,
+  #    .SDcols = is.numeric]
+  #
+  # DT <- DT[!grepl("^CHN_", id)]
+
+  ## Grouping ----------
+
+  g <- GRP(DT, ~ id + reporting_level, sort = FALSE)
+
+  list(DT = DT,
+       g = g)
+}
+
+

From 30c50b9eca3324dedf3d05d3769fd7a6eac5afd2 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Wed, 10 Sep 2025 14:21:23 -0400
Subject: [PATCH 105/203] document helpers for fgt_cumcum

---
 NAMESPACE               |   2 +-
 R/fgt_cumsum.R          | 272 ++++++++++++++++++++++++++++++++++++++++
 R/utils-pipdata.R       |  33 -----
 man/build_pair_dict.Rd  |  35 ++++++
 man/decode_pairs.Rd     |  38 ++++++
 man/dot-get_pairs_df.Rd |  22 ++++
 man/encode_pairs.Rd     |  41 ++++++
 man/fgt_watts_cumsum.Rd |  21 ----
 man/format_lfst.Rd      |  38 ++++++
 man/get_total_pop.Rd    |  26 ++++
 man/load_list_refy.Rd   |  22 +++-
 man/transform_input.Rd  |   2 +-
 man/update_pair_dict.Rd |  24 ++++
 13 files changed, 518 insertions(+), 58 deletions(-)
 create mode 100644 man/build_pair_dict.Rd
 create mode 100644 man/decode_pairs.Rd
 create mode 100644 man/dot-get_pairs_df.Rd
 create mode 100644 man/encode_pairs.Rd
 delete mode 100644 man/fgt_watts_cumsum.Rd
 create mode 100644 man/format_lfst.Rd
 create mode 100644 man/get_total_pop.Rd
 create mode 100644 man/update_pair_dict.Rd

diff --git a/NAMESPACE b/NAMESPACE
index e9c62630..9386e37b 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -11,7 +11,6 @@ export(citation_from_version)
 export(create_etag_header)
 export(create_return_cols)
 export(create_versioned_lkups)
-export(fgt_watts_cumsum)
 export(fillin_list)
 export(get_aux_table)
 export(get_aux_table_ui)
@@ -24,6 +23,7 @@ export(get_md_vars)
 export(get_param_values)
 export(get_pip_version)
 export(get_pipapienv)
+export(get_total_pop)
 export(get_user_alt_gt)
 export(get_user_x_code)
 export(get_valid_aux_long_format_tables)
diff --git a/R/fgt_cumsum.R b/R/fgt_cumsum.R
index 477b327e..dc5b9b7c 100644
--- a/R/fgt_cumsum.R
+++ b/R/fgt_cumsum.R
@@ -1,9 +1,18 @@
+# MAIN -------------------------------
+
 #' format the lfst list to usable data to estimate poverty
 #'
 #' @param lfst list from load_list_refy()
 #'
 #' @return list with DT and g (GRP object)
 #' @keywords internal
+#' Format lfst list for poverty estimation
+#'
+#' Takes a list from load_list_refy() and returns a list with a data.table and a GRP object for grouped operations.
+#'
+#' @param lfst List from load_list_refy().
+#' @return List with elements: DT (data.table) and g (GRP object).
+#' @keywords internal
 format_lfst <- \(lfst) {
 
   DT <- rbindlist(lfst, fill = TRUE)
@@ -28,3 +37,266 @@ format_lfst <- \(lfst) {
 }
 
 
+
+
+#' get total population by country year and reprting level
+#'
+#' @param LDTg list from format_lfst() with DT and g objects
+#'
+#' @return data.table  with total population by `g[["groups]]`
+#' @export
+#'
+#' @examples
+#' Get total population by country, year, and reporting level
+#'
+#' Computes total population by group using the output of format_lfst().
+#'
+#' @param LDTg List from format_lfst() with DT and g objects.
+#' @return data.table with total population by group.
+#' @export
+#' @examples
+#' # get_total_pop(format_lfst(lfst))
+get_total_pop <- \(LDTg) {
+  list2env(LDTg)
+  add_vars(g[["groups"]],
+           get_vars(DT, c("weight")) |>
+           fsum(g)) |>
+    setnames(old = "weight",new =  "totpop")
+}
+
+
+
+# --- helpers ---------------------------------------------------------------
+#' load refy list
+#'
+#' @param input_list list. output from [create_full_list]
+#' @param path character: directory path
+#'
+#' @return character vector
+#' @keywords internal
+#' Load refy list
+#'
+#' Loads a list of files and returns a named list of data.tables, each with an id column.
+#'
+#' @param input_list Character vector of file paths (output from create_full_list).
+#' @return Named list of data.tables, each with an id column.
+#' @keywords internal
+load_list_refy <- \(input_list){
+
+  id_names <- input_list |>
+    fs::path_file() |>
+    fs::path_ext_remove()
+
+  seq_flex <- if (interactive()) {
+    cli::cli_progress_along
+  } else {
+    base::seq_along
+  }
+
+
+  lfst <- lapply(seq_flex(input_list),
+                 \(i) {
+                   x <- lup_files[i]
+                   idn <- fs::path_file(x) |>
+                     fs::path_ext_remove()
+                   fst::read_fst(x, as.data.table = TRUE) |>
+                     _[, id := idn]
+                 }) |>
+    setNames(id_names)
+
+  lfst
+}
+
+
+# Pull just the 2 id columns as a base R data.frame (robust across classes)
+#' Extract id and level columns as a data.frame
+#'
+#' @param X data.table or data.frame
+#' @param id_col Name of id column
+#' @param level_col Name of level column
+#' @return data.frame with id and level columns
+#' @keywords internal
+.get_pairs_df <- function(X, id_col, level_col) {
+  as.data.frame(X[, c(id_col, level_col), drop = FALSE])
+}
+
+
+# ------------------------------- #
+# 1) Build pair dictionary (DT)   #
+# ------------------------------- #
+
+#' Dictionary for fast joins
+#'
+#' @param lkup lkup object
+#' @param fill_gaps TRUE for lineup years, FALSE for survey years
+#'
+#' @return data.table with dictionary for merges.
+#' @keywords internal
+#' Build dictionary for fast joins
+#'
+#' Creates a data.table dictionary for merging by id and reporting_level.
+#'
+#' @param lkup Lookup object containing refy_lkup and svy_lkup.
+#' @param fill_gaps Logical, TRUE for lineup years, FALSE for survey years.
+#' @return data.table with columns id, reporting_level, and code.
+#' @keywords internal
+build_pair_dict <- function(lkup, fill_gaps = TRUE) {
+
+  FT <- if (fill_gaps) {
+    lkup$refy_lkup[, .(country_code, reporting_year, reporting_level)]
+  } else {
+    lkup$svy_lkup[, .(country_code, reporting_year, reporting_level)]
+  } |>
+    funique()
+
+  FT[, id := paste0(country_code, "_", reporting_year)
+     ][, c("country_code", "reporting_year") := NULL]
+
+  cols <- c("id", "reporting_level")
+  dict <- unique(FT[, ..cols])
+
+  # deterministic code order
+  setorderv(dict, cols, order = 1L)     # radix by default
+  dict[, code := as.integer(.I)]        # fast in DT
+  setkeyv(dict, cols)                   # fast key lookups when needed
+  setindexv(dict, "code")               # index on code
+  dict
+}
+
+
+# -------------------------------------------- #
+# 2) Encode: add integer code via collapse::join
+# -------------------------------------------- #
+# DT: data.table to encode (by reference not guaranteed since join copies x->result)
+# dict: data.table from build_pair_dict()
+# code_col: name of code column to write
+#' Encode pairs with integer code
+#'
+#' Adds an integer code column to a data.table by joining with a dictionary.
+#'
+#' @param DT data.table to encode.
+#' @param dict data.table from build_pair_dict().
+#' @param id_col Name of id column.
+#' @param level_col Name of reporting level column.
+#' @param code_col Name of code column to write.
+#' @param drop_labels Logical, drop id and level columns if TRUE.
+#' @param strict Logical, error if any pairs are missing from dict.
+#' @param verbose Integer, verbosity level.
+#' @return data.table with code column added.
+#' @keywords internal
+encode_pairs <- function(DT, dict,
+                         id_col = "id", level_col = "reporting_level",
+                         code_col = "id_rl",
+                         drop_labels = FALSE,
+                         strict = TRUE,
+                         verbose = 0L) {
+
+  stopifnot(is.data.table(DT), is.data.table(dict))
+  cols <- c(id_col, level_col)
+  stopifnot(all(cols %in% names(DT)), all(c(cols, "code") %in% names(dict)))
+
+
+  out <- join(
+    x = DT,
+    y = dict,
+    on = cols,
+    how = "left",
+    drop.dup.cols = "y",
+    validate = "m:1",
+    verbose = verbose
+  )
+  # Ensure it's a data.table (join usually preserves)
+  if (!is.data.table(out)) setDT(out)
+
+  # Rename 'code' -> code_col if needed
+  if (code_col != "code" && "code" %in% names(out)) {
+    setnames(out, "code", code_col)
+  }
+
+  if (strict) {
+    if (anyNA(out[[code_col]])) {
+      nas <- is.na(out[[code_col]])
+      miss <- unique(out[nas, ..cols])[1:min(10L, sum(nas))]
+      cli::cli_abort(
+        c(
+          "encode_pairs(): {fsum(nas)} unseen (id, reporting_level) pair(s).",
+          "Examples:\n{paste(capture.output(print(miss)), collapse = '\n')}"
+        )
+      )
+    }
+  }
+
+  if (drop_labels) out[, (cols) := NULL]
+  out
+}
+
+# ------------------------------------------------ #
+# 3) Decode: join labels by code via collapse::join #
+# ------------------------------------------------ #
+#' Decode integer code to id and reporting level
+#'
+#' Joins labels by code using a dictionary.
+#'
+#' @param DT data.table to decode.
+#' @param dict data.table from build_pair_dict().
+#' @param code_col Name of code column in DT.
+#' @param id_col Name of id column in dict.
+#' @param level_col Name of reporting level column in dict.
+#' @param keep_code Logical, keep code column if TRUE.
+#' @param verbose Integer, verbosity level.
+#' @return data.table with id and reporting_level columns added.
+#' @keywords internal
+decode_pairs <- function(DT, dict,
+                         code_col = "id_rl",
+                         id_col = "id", level_col = "reporting_level",
+                         keep_code = TRUE,
+                         verbose = 0L) {
+  stopifnot(is.data.table(DT), is.data.table(dict))
+  stopifnot(code_col %in% names(DT), all(c("code", id_col, level_col) %in% names(dict)))
+
+  dict_min <- dict[, .(code, ..id_col = get(id_col), ..level_col = get(level_col))]
+  setnames(dict_min, c("code", id_col, level_col))
+
+  out <- collapse::join(
+    x = DT, y = dict_min,
+    on = setNames("code", code_col),   # map DT[code_col] to dict$code
+    how = "left",
+    drop.dup.cols = "y",
+    validate = "m:1",
+    verbose = verbose
+  )
+  if (!is.data.table(out)) setDT(out)
+  if (!keep_code) out[, (code_col) := NULL]
+  out[]
+}
+
+# ----------------------------------------------------- #
+# 4) Update dict with new pairs (append-only, fast DT)  #
+# ----------------------------------------------------- #
+#' Update dictionary with new pairs
+#'
+#' Appends new (id, reporting_level) pairs to the dictionary if needed.
+#'
+#' @param dict data.table dictionary from build_pair_dict().
+#' @param DT data.table with id and reporting_level columns.
+#' @param id_col Name of id column.
+#' @param level_col Name of reporting level column.
+#' @return Updated data.table dictionary.
+#' @keywords internal
+update_pair_dict <- function(dict, DT,
+                             id_col = "id", level_col = "reporting_level") {
+  stopifnot(is.data.table(dict), is.data.table(DT))
+  cols <- c(id_col, level_col)
+  stopifnot(all(c(cols, "code") %in% names(dict)), all(cols %in% names(DT)))
+
+  new_pairs <- fsetdiff(unique(DT[, ..cols]), dict[, ..cols])
+  if (nrow(new_pairs)) {
+    new_pairs[, code := as.integer(max(dict$code) + seq_len(.N))]
+    setkeyv(new_pairs, cols)
+    setindexv(new_pairs, "code")
+    dict <- rbindlist(list(dict, new_pairs), use.names = TRUE)
+    setkeyv(dict, cols)
+    setindexv(dict, "code")
+  }
+  dict[]
+}
diff --git a/R/utils-pipdata.R b/R/utils-pipdata.R
index dc5b823d..4274521c 100644
--- a/R/utils-pipdata.R
+++ b/R/utils-pipdata.R
@@ -1,36 +1,3 @@
-#' load refy list
-#'
-#' @param input_list list. output from [create_full_list]
-#' @param path character: directory path
-#'
-#' @return character vector
-#' @keywords internal
-load_list_refy <- \(input_list){
-
-  id_names <- input_list |>
-    fs::path_file() |>
-    fs::path_ext_remove()
-
-    seq_flex <- if (interactive()) {
-      cli::cli_progress_along
-    } else {
-      base::seq_along
-    }
-
-
-  lfst <- lapply(seq_flex(input_list),
-                 \(i) {
-                   x <- lup_files[i]
-                   idn <- fs::path_file(x) |>
-                     fs::path_ext_remove()
-                   fst::read_fst(x, as.data.table = TRUE) |>
-                     _[, id := idn]
-                 }) |>
-    setNames(id_names)
-
-  lfst
-}
-
 
 
diff --git a/man/build_pair_dict.Rd b/man/build_pair_dict.Rd
new file mode 100644
index 00000000..6fe82339
--- /dev/null
+++ b/man/build_pair_dict.Rd
@@ -0,0 +1,35 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/fgt_cumsum.R
+\name{build_pair_dict}
+\alias{build_pair_dict}
+\title{Dictionary for fast joins}
+\usage{
+build_pair_dict(lkup, fill_gaps = TRUE)
+}
+\arguments{
+\item{lkup}{Lookup object containing refy_lkup and svy_lkup.}
+
+\item{fill_gaps}{Logical, TRUE for lineup years, FALSE for survey years.}
+}
+\value{
+data.table with dictionary for merges.
+
+data.table with columns id, reporting_level, and code.
+}
+\description{
+Dictionary for fast joins
+}
+\keyword{Build}
+\keyword{Creates}
+\keyword{a}
+\keyword{and}
+\keyword{by}
+\keyword{data.table}
+\keyword{dictionary}
+\keyword{fast}
+\keyword{for}
+\keyword{id}
+\keyword{internal}
+\keyword{joins}
+\keyword{merging}
+\keyword{reporting_level.}
diff --git a/man/decode_pairs.Rd b/man/decode_pairs.Rd
new file mode 100644
index 00000000..96a5537d
--- /dev/null
+++ b/man/decode_pairs.Rd
@@ -0,0 +1,38 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/fgt_cumsum.R
+\name{decode_pairs}
+\alias{decode_pairs}
+\title{Decode integer code to id and reporting level}
+\usage{
+decode_pairs(
+  DT,
+  dict,
+  code_col = "id_rl",
+  id_col = "id",
+  level_col = "reporting_level",
+  keep_code = TRUE,
+  verbose = 0L
+)
+}
+\arguments{
+\item{DT}{data.table to decode.}
+
+\item{dict}{data.table from build_pair_dict().}
+
+\item{code_col}{Name of code column in DT.}
+
+\item{id_col}{Name of id column in dict.}
+
+\item{level_col}{Name of reporting level column in dict.}
+
+\item{keep_code}{Logical, keep code column if TRUE.}
+
+\item{verbose}{Integer, verbosity level.}
+}
+\value{
+data.table with id and reporting_level columns added.
+}
+\description{
+Joins labels by code using a dictionary.
+}
+\keyword{internal}
diff --git a/man/dot-get_pairs_df.Rd b/man/dot-get_pairs_df.Rd
new file mode 100644
index 00000000..4ef63c05
--- /dev/null
+++ b/man/dot-get_pairs_df.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/fgt_cumsum.R
+\name{.get_pairs_df}
+\alias{.get_pairs_df}
+\title{Extract id and level columns as a data.frame}
+\usage{
+.get_pairs_df(X, id_col, level_col)
+}
+\arguments{
+\item{X}{data.table or data.frame}
+
+\item{id_col}{Name of id column}
+
+\item{level_col}{Name of level column}
+}
+\value{
+data.frame with id and level columns
+}
+\description{
+Extract id and level columns as a data.frame
+}
+\keyword{internal}
diff --git a/man/encode_pairs.Rd b/man/encode_pairs.Rd
new file mode 100644
index 00000000..162a095c
--- /dev/null
+++ b/man/encode_pairs.Rd
@@ -0,0 +1,41 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/fgt_cumsum.R
+\name{encode_pairs}
+\alias{encode_pairs}
+\title{Encode pairs with integer code}
+\usage{
+encode_pairs(
+  DT,
+  dict,
+  id_col = "id",
+  level_col = "reporting_level",
+  code_col = "id_rl",
+  drop_labels = FALSE,
+  strict = TRUE,
+  verbose = 0L
+)
+}
+\arguments{
+\item{DT}{data.table to encode.}
+
+\item{dict}{data.table from build_pair_dict().}
+
+\item{id_col}{Name of id column.}
+
+\item{level_col}{Name of reporting level column.}
+
+\item{code_col}{Name of code column to write.}
+
+\item{drop_labels}{Logical, drop id and level columns if TRUE.}
+
+\item{strict}{Logical, error if any pairs are missing from dict.}
+
+\item{verbose}{Integer, verbosity level.}
+}
+\value{
+data.table with code column added.
+}
+\description{
+Adds an integer code column to a data.table by joining with a dictionary.
+}
+\keyword{internal}
diff --git a/man/fgt_watts_cumsum.Rd b/man/fgt_watts_cumsum.Rd
deleted file mode 100644
index 74d7ecdf..00000000
--- a/man/fgt_watts_cumsum.Rd
+++ /dev/null
@@ -1,21 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/compute_fgt_new.R
-\name{fgt_watts_cumsum}
-\alias{fgt_watts_cumsum}
-\title{compute fgt and watts using cumulative welfare rather than means}
-\usage{
-fgt_watts_cumsum(y, w, lines)
-}
-\arguments{
-\item{y}{numeric welfare (sorted ascending within the subgroup)}
-
-\item{w}{numeric weights (same order as y)}
-
-\item{lines}{numeric vector of poverty lines}
-}
-\value{
-Returns a data.table with columns: line, fgt0,fgt1,fgt2,watts
-}
-\description{
-compute fgt and watts using cumulative welfare rather than means
-}
diff --git a/man/format_lfst.Rd b/man/format_lfst.Rd
new file mode 100644
index 00000000..0667de50
--- /dev/null
+++ b/man/format_lfst.Rd
@@ -0,0 +1,38 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/fgt_cumsum.R
+\name{format_lfst}
+\alias{format_lfst}
+\title{format the lfst list to usable data to estimate poverty}
+\usage{
+format_lfst(lfst)
+}
+\arguments{
+\item{lfst}{List from load_list_refy().}
+}
+\value{
+list with DT and g (GRP object)
+
+List with elements: DT (data.table) and g (GRP object).
+}
+\description{
+format the lfst list to usable data to estimate poverty
+}
+\keyword{Format}
+\keyword{GRP}
+\keyword{Takes}
+\keyword{a}
+\keyword{and}
+\keyword{data.table}
+\keyword{estimation}
+\keyword{for}
+\keyword{from}
+\keyword{grouped}
+\keyword{internal}
+\keyword{lfst}
+\keyword{list}
+\keyword{load_list_refy()}
+\keyword{object}
+\keyword{operations.}
+\keyword{poverty}
+\keyword{returns}
+\keyword{with}
diff --git a/man/get_total_pop.Rd b/man/get_total_pop.Rd
new file mode 100644
index 00000000..b73f558c
--- /dev/null
+++ b/man/get_total_pop.Rd
@@ -0,0 +1,26 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/fgt_cumsum.R
+\name{get_total_pop}
+\alias{get_total_pop}
+\title{get total population by country year and reprting level}
+\usage{
+get_total_pop(LDTg)
+}
+\arguments{
+\item{LDTg}{List from format_lfst() with DT and g objects.}
+}
+\value{
+data.table  with total population by \verb{g[["groups]]}
+
+data.table with total population by group.
+}
+\description{
+get total population by country year and reprting level
+}
+\examples{
+Get total population by country, year, and reporting level
+
+Computes total population by group using the output of format_lfst().
+
+# get_total_pop(format_lfst(lfst))
+}
diff --git a/man/load_list_refy.Rd b/man/load_list_refy.Rd
index b1533a4f..75fb33df 100644
--- a/man/load_list_refy.Rd
+++ b/man/load_list_refy.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/utils-pipdata.R
+% Please edit documentation in R/fgt_cumsum.R
 \name{load_list_refy}
 \alias{load_list_refy}
 \title{load refy list}
@@ -7,14 +7,32 @@
 load_list_refy(input_list)
 }
 \arguments{
-\item{input_list}{list. output from \link{create_full_list}}
+\item{input_list}{Character vector of file paths (output from create_full_list).}
 
 \item{path}{character: directory path}
 }
 \value{
 character vector
+
+Named list of data.tables, each with an id column.
 }
 \description{
 load refy list
 }
+\keyword{Load}
+\keyword{Loads}
+\keyword{a}
+\keyword{an}
+\keyword{and}
+\keyword{column.}
+\keyword{data.tables,}
+\keyword{each}
+\keyword{files}
+\keyword{id}
 \keyword{internal}
+\keyword{list}
+\keyword{named}
+\keyword{of}
+\keyword{refy}
+\keyword{returns}
+\keyword{with}
diff --git a/man/transform_input.Rd b/man/transform_input.Rd
index e460cf0f..8415f5ac 100644
--- a/man/transform_input.Rd
+++ b/man/transform_input.Rd
@@ -7,7 +7,7 @@
 transform_input(input_list)
 }
 \arguments{
-\item{input_list}{list. output from \link{create_full_list}}
+\item{input_list}{Character vector of file paths (output from create_full_list).}
 }
 \value{
 formated list
diff --git a/man/update_pair_dict.Rd b/man/update_pair_dict.Rd
new file mode 100644
index 00000000..a142e98d
--- /dev/null
+++ b/man/update_pair_dict.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/fgt_cumsum.R
+\name{update_pair_dict}
+\alias{update_pair_dict}
+\title{Update dictionary with new pairs}
+\usage{
+update_pair_dict(dict, DT, id_col = "id", level_col = "reporting_level")
+}
+\arguments{
+\item{dict}{data.table dictionary from build_pair_dict().}
+
+\item{DT}{data.table with id and reporting_level columns.}
+
+\item{id_col}{Name of id column.}
+
+\item{level_col}{Name of reporting level column.}
+}
+\value{
+Updated data.table dictionary.
+}
+\description{
+Appends new (id, reporting_level) pairs to the dictionary if needed.
+}
+\keyword{internal}

From 01c52cc5a939bcc46d224d59b4957f74c33b5a8e Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Wed, 10 Sep 2025 14:28:35 -0400
Subject: [PATCH 106/203] improve helpers

---
 R/fgt_cumsum.R | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/R/fgt_cumsum.R b/R/fgt_cumsum.R
index dc5b9b7c..0c68c154 100644
--- a/R/fgt_cumsum.R
+++ b/R/fgt_cumsum.R
@@ -248,17 +248,25 @@ encode_pairs <- function(DT, dict,
 #' @keywords internal
 decode_pairs <- function(DT, dict,
                          code_col = "id_rl",
-                         id_col = "id", level_col = "reporting_level",
+                         id_col = "id",
+                         level_col = "reporting_level",
                          keep_code = TRUE,
                          verbose = 0L) {
-  stopifnot(is.data.table(DT), is.data.table(dict))
-  stopifnot(code_col %in% names(DT), all(c("code", id_col, level_col) %in% names(dict)))
+  stopifnot(exprs = {
+    is.data.table(DT)
+    is.data.table(dict)
+    })
+  stopifnot(exprs = {
+    code_col %in% names(DT)
+    all(c("code", id_col, level_col) %in% names(dict))
+    })
+
+  # dict_min <- dict[, .(code, ..id_col = get(id_col), ..level_col = get(level_col))]
+  # setnames(dict_min, c("code", id_col, level_col))
 
-  dict_min <- dict[, .(code, ..id_col = get(id_col), ..level_col = get(level_col))]
-  setnames(dict_min, c("code", id_col, level_col))
-
-  out <- collapse::join(
-    x = DT, y = dict_min,
+  out <- join(
+    x = DT,
+    y = dict,
     on = setNames("code", code_col),   # map DT[code_col] to dict$code
     how = "left",
     drop.dup.cols = "y",
@@ -267,7 +275,7 @@ decode_pairs <- function(DT, dict,
   )
   if (!is.data.table(out)) setDT(out)
   if (!keep_code) out[, (code_col) := NULL]
-  out[]
+  out
 }
 
 # ----------------------------------------------------- #
@@ -298,5 +306,5 @@ update_pair_dict <- function(dict, DT,
     setkeyv(dict, cols)
     setindexv(dict, "code")
   }
-  dict[]
+  dict
 }

From 76c6610465c733b8ddad2921433b9e6ceea7a7b6 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Wed, 10 Sep 2025 18:14:47 -0400
Subject: [PATCH 107/203] first attempt of new cumcum functions

---
 NAMESPACE               |   1 -
 R/compute_fgt_new.R     |   3 +-
 R/fg_pip.R              |  12 +++
 R/fgt_cumsum.R          | 216 ++++++++++++++++++++++++----------------
 man/compute_fgt.Rd      |  22 ++++
 man/decode_pairs.Rd     |   5 +-
 man/dot-get_pairs_df.Rd |  22 ----
 man/format_lfst.Rd      |   2 +-
 man/get_total_pop.Rd    |  16 +--
 9 files changed, 177 insertions(+), 122 deletions(-)
 create mode 100644 man/compute_fgt.Rd
 delete mode 100644 man/dot-get_pairs_df.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 9386e37b..96b78000 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -23,7 +23,6 @@ export(get_md_vars)
 export(get_param_values)
 export(get_pip_version)
 export(get_pipapienv)
-export(get_total_pop)
 export(get_user_alt_gt)
 export(get_user_x_code)
 export(get_valid_aux_long_format_tables)
diff --git a/R/compute_fgt_new.R b/R/compute_fgt_new.R
index 79cf0ecf..c6d52d39 100644
--- a/R/compute_fgt_new.R
+++ b/R/compute_fgt_new.R
@@ -78,7 +78,7 @@ compute_fgt_dt <- function(dt, welfare, weight, povlines, mean_and_med = FALSE)
 
 
-# Efficient FGT calculation for vectors (No data.table)
+#' Efficient FGT calculation for vectors (No data.table)
 #'
 #' @param w character: welfare variable name
 #' @param wt character: weight variable name
@@ -87,7 +87,6 @@ compute_fgt_dt <- function(dt, welfare, weight, povlines, mean_and_med = FALSE)
 #' @return data.table with estimates poverty estimates
 #' @keywords internal
 compute_fgt <- function(w, wt, povlines) {
-  n   <- length(w)
   m   <- length(povlines)
 
   # Pre-allocate result matrix
diff --git a/R/fg_pip.R b/R/fg_pip.R
index da323540..86ce5f5b 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -58,11 +58,23 @@ fg_pip <- function(country,
                 data_in_cache = data_present_in_master))
   }
 
+  dict <- build_pair_dict(lkup = lkup, fill_gaps = TRUE)
+
+
   full_list <- create_full_list(metadata = metadata)
 
   lfst <-
     load_list_refy(input_list = full_list)
 
+  LDTg <- format_lfst(lfst = lfst,
+                      dict = dict)
+  tpop <- get_total_pop(LDTg = LDTg,
+                        dict = dict)
+  fgt <-  fgt_cumsum(LDTg   = LDTg,
+                     tpop   = tpop,
+                     povline = povline) |>
+    decode_pairs(dict = dict)
+
   # lt <- lapply(lt, \(x) {
   #                  add_attributes_as_columns_vectorized(x)
   #              })
diff --git a/R/fgt_cumsum.R b/R/fgt_cumsum.R
index 0c68c154..9dd4fb02 100644
--- a/R/fgt_cumsum.R
+++ b/R/fgt_cumsum.R
@@ -13,113 +13,110 @@
 #' @param lfst List from load_list_refy().
 #' @return List with elements: DT (data.table) and g (GRP object).
 #' @keywords internal
-format_lfst <- \(lfst) {
+format_lfst <- \(lfst, dict) {
 
   DT <- rbindlist(lfst, fill = TRUE)
 
   # Convert to factors (is it faster?)
-  DT[, names(.SD) := lapply(.SD, qF),
-     .SDcols = c("id", "reporting_level")]
+  if (!is.integer(DT$index)) {
+    DT[, index := as.integer(index)]
+  }
 
-  # fix
-  # DT[index == 0,
-  #    names(.SD) := 0,
-  #    .SDcols = is.numeric]
-  #
-  # DT <- DT[!grepl("^CHN_", id)]
+  out <- encode_pairs(DT = DT,
+                      dict = dict,
+                      drop_labels = TRUE)
 
   ## Grouping ----------
+  g <- GRP(out, ~ id_rl, sort = FALSE)
 
-  g <- GRP(DT, ~ id + reporting_level, sort = FALSE)
-
-  list(DT = DT,
+  list(DT = out,
        g = g)
 }
 
 
-#' get total population by country year and reprting level
-#'
-#' @param LDTg list from format_lfst() with DT and g objects
-#'
-#' @return data.table  with total population by `g[["groups]]`
-#' @export
-#'
-#' @examples
-#' Get total population by country, year, and reporting level
-#'
 #' Computes total population by group using the output of format_lfst().
 #'
 #' @param LDTg List from format_lfst() with DT and g objects.
+#' @param dict data dictionary from build_pair_dict()
+#'
 #' @return data.table with total population by group.
-#' @export
-#' @examples
-#' # get_total_pop(format_lfst(lfst))
-get_total_pop <- \(LDTg) {
-  list2env(LDTg)
+#' @keywords internal
+get_total_pop <- \(LDTg, dict) {
+  list2env(LDTg, envir = environment())
+  rm(LDTg)
   add_vars(g[["groups"]],
            get_vars(DT, c("weight")) |>
            fsum(g)) |>
-    setnames(old = "weight",new =  "totpop")
+    setnames(old = "weight",
+             new =  "W") |>
+    encode_pairs(dict, drop_labels = TRUE)
 }
 
 
-
-# --- helpers ---------------------------------------------------------------
-#' load refy list
-#'
-#' @param input_list list. output from [create_full_list]
-#' @param path character: directory path
-#'
-#' @return character vector
-#' @keywords internal
-#' Load refy list
-#'
-#' Loads a list of files and returns a named list of data.tables, each with an id column.
-#'
-#' @param input_list Character vector of file paths (output from create_full_list).
-#' @return Named list of data.tables, each with an id column.
-#' @keywords internal
-load_list_refy <- \(input_list){
-
-  id_names <- input_list |>
-    fs::path_file() |>
-    fs::path_ext_remove()
-
-  seq_flex <- if (interactive()) {
-    cli::cli_progress_along
-  } else {
-    base::seq_along
+fgt_cumsum <- \(LDTg, tpop, povline,
+                drop_vars = TRUE) {
+  list2env(LDTg, envir = environment())
+  rm(LDTg)
+
+  # Temporal values to be added to the data.table
+  tz      <- pmax(povline, 1e-12)
+  tz2     <- pmax(povline^2, 1e-16)
+  tlogz   <- log(tz)
+
+  # 1) Compute cutpoint index for each z, using ONLY non-zero rows for welfare
+  #    -> findInterval(povline, welfare) returns values in 0..N (never N+1)
+  ID <- DT[index > 0L,
+           {
+             idx <- findInterval(povline, welfare, left.open = TRUE)
+             # 2) Attach z, z2, logz in-group (no replication/copies)
+             data.table(index = idx,
+                        z     = tz,
+                        z2    = tz2,
+                        logz  = tlogz)
+           },
+           by = id_rl]
+
+  # 3) Minimal cumulative view (shallow column subset; avoids copying DT)
+  DT_min <- get_vars(DT,
+                     c("id_rl", "index", "cw", "cwy", "cwy2", "cwylog"))
+
+  # 4) join cutpoints to cumulatives (index==0 hits the already-present zero row)
+  CS <- join(
+    x = ID,
+    y = DT_min,
+    on   = c("id_rl","index"),
+    how  = "left",
+    validate = "m:1",          # many cutpoints -> 1 cumulative row
+    drop.dup.cols = "y",
+    verbose = 0) |>
+  # 5) Bring total population W
+    join(tpop,
+         on = "id_rl",
+         how = "left",
+         validate = "m:1",
+         drop.dup.cols = "y") |>
+    setorder(id_rl, index)
+
+
+  # 6) Compute measures (vectorized). Small clamps for numerical safety.
+  CS[, `:=`(
+    headcount        = cw / W,
+    poverty_gap      = (z * cw - cwy) / (z_s * W),
+    poverty_severity = (z2 * cw - 2 * z * cwy + cwy2) / (z2_s * W),
+    watts            = (logz * cw - cwylog) / W
+  )]
+
+  if (!drop_vars) {
+    return(CS)
   }
+  get_vars(CS, c("id_rl", "headcount", "poverty_gap", "poverty_severity", "watts"))
 
-
-  lfst <- lapply(seq_flex(input_list),
-                 \(i) {
-                   x <- lup_files[i]
-                   idn <- fs::path_file(x) |>
-                     fs::path_ext_remove()
-                   fst::read_fst(x, as.data.table = TRUE) |>
-                     _[, id := idn]
-                 }) |>
-    setNames(id_names)
-
-  lfst
 }
 
 
-# Pull just the 2 id columns as a base R data.frame (robust across classes)
-#' Extract id and level columns as a data.frame
-#'
-#' @param X data.table or data.frame
-#' @param id_col Name of id column
-#' @param level_col Name of level column
-#' @return data.frame with id and level columns
-#' @keywords internal
-.get_pairs_df <- function(X, id_col, level_col) {
-  as.data.frame(X[, c(id_col, level_col), drop = FALSE])
-}
-
+# --- helpers ---------------------------------------------------------------
 
 # ------------------------------- #
 # 1) Build pair dictionary (DT)   #
@@ -243,14 +240,17 @@ encode_pairs <- function(DT, dict,
 #' @param id_col Name of id column in dict.
 #' @param level_col Name of reporting level column in dict.
 #' @param keep_code Logical, keep code column if TRUE.
+#' @param add_true_vars logical, add `country_code` and `reporting_year`
 #' @param verbose Integer, verbosity level.
+#'
 #' @return data.table with id and reporting_level columns added.
 #' @keywords internal
 decode_pairs <- function(DT, dict,
                          code_col = "id_rl",
                          id_col = "id",
                          level_col = "reporting_level",
-                         keep_code = TRUE,
+                         keep_code = FALSE,
+                         add_true_vars = TRUE,
                          verbose = 0L) {
   stopifnot(exprs = {
     is.data.table(DT)
@@ -261,9 +261,6 @@ decode_pairs <- function(DT, dict,
     all(c("code", id_col, level_col) %in% names(dict))
     })
 
-  # dict_min <- dict[, .(code, ..id_col = get(id_col), ..level_col = get(level_col))]
-  # setnames(dict_min, c("code", id_col, level_col))
-
   out <- join(
     x = DT,
     y = dict,
@@ -272,8 +269,16 @@ decode_pairs <- function(DT, dict,
     drop.dup.cols = "y",
     validate = "m:1",
     verbose = verbose
-  )
-  if (!is.data.table(out)) setDT(out)
+  ) |>
+    qDT()
+
+  if (add_true_vars) {
+    out[, `:=`(
+        country_code   = gsub("(.+)(_.+)", "\\1", id),
+        reporting_year = as.integer(gsub("(.+_)(.+)", "\\2", id))
+      )]
+  }
+
   if (!keep_code) out[, (code_col) := NULL]
   out
 }
@@ -308,3 +313,46 @@ update_pair_dict <- function(dict, DT,
   }
   dict
 }
+
+
+
+#' load refy list
+#'
+#' @param input_list list. output from [create_full_list]
+#' @param path character: directory path
+#'
+#' @return character vector
+#' @keywords internal
+#' Load refy list
+#'
+#' Loads a list of files and returns a named list of data.tables, each with an id column.
+#'
+#' @param input_list Character vector of file paths (output from create_full_list).
+#' @return Named list of data.tables, each with an id column.
+#' @keywords internal
+load_list_refy <- \(input_list){
+
+  id_names <- input_list |>
+    fs::path_file() |>
+    fs::path_ext_remove()
+
+  seq_flex <- if (interactive()) {
+    cli::cli_progress_along
+  } else {
+    base::seq_along
+  }
+
+
+  lfst <- lapply(seq_flex(input_list),
+                 \(i) {
+                   x <- lup_files[i]
+                   idn <- fs::path_file(x) |>
+                     fs::path_ext_remove()
+                   fst::read_fst(x, as.data.table = TRUE) |>
+                     _[, id := idn]
+                 }) |>
+    setNames(id_names)
+
+  lfst
+}
+
diff --git a/man/compute_fgt.Rd b/man/compute_fgt.Rd
new file mode 100644
index 00000000..0a5b479e
--- /dev/null
+++ b/man/compute_fgt.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/compute_fgt_new.R
+\name{compute_fgt}
+\alias{compute_fgt}
+\title{Efficient FGT calculation for vectors (No data.table)}
+\usage{
+compute_fgt(w, wt, povlines)
+}
+\arguments{
+\item{w}{character: welfare variable name}
+
+\item{wt}{character: weight variable name}
+
+\item{povlines}{double: vector with poverty lines}
+}
+\value{
+data.table with estimates poverty estimates
+}
+\description{
+Efficient FGT calculation for vectors (No data.table)
+}
+\keyword{internal}
diff --git a/man/decode_pairs.Rd b/man/decode_pairs.Rd
index 96a5537d..988829e2 100644
--- a/man/decode_pairs.Rd
+++ b/man/decode_pairs.Rd
@@ -10,7 +10,8 @@ decode_pairs(
   code_col = "id_rl",
   id_col = "id",
   level_col = "reporting_level",
-  keep_code = TRUE,
+  keep_code = FALSE,
+  add_true_vars = TRUE,
   verbose = 0L
 )
 }
@@ -27,6 +28,8 @@ decode_pairs(
 
 \item{keep_code}{Logical, keep code column if TRUE.}
 
+\item{add_true_vars}{logical, add \code{country_code} and \code{reporting_year}}
+
 \item{verbose}{Integer, verbosity level.}
 }
 \value{
diff --git a/man/dot-get_pairs_df.Rd b/man/dot-get_pairs_df.Rd
deleted file mode 100644
index 4ef63c05..00000000
--- a/man/dot-get_pairs_df.Rd
+++ /dev/null
@@ -1,22 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/fgt_cumsum.R
-\name{.get_pairs_df}
-\alias{.get_pairs_df}
-\title{Extract id and level columns as a data.frame}
-\usage{
-.get_pairs_df(X, id_col, level_col)
-}
-\arguments{
-\item{X}{data.table or data.frame}
-
-\item{id_col}{Name of id column}
-
-\item{level_col}{Name of level column}
-}
-\value{
-data.frame with id and level columns
-}
-\description{
-Extract id and level columns as a data.frame
-}
-\keyword{internal}
diff --git a/man/format_lfst.Rd b/man/format_lfst.Rd
index 0667de50..806e6161 100644
--- a/man/format_lfst.Rd
+++ b/man/format_lfst.Rd
@@ -4,7 +4,7 @@
 \alias{format_lfst}
 \title{format the lfst list to usable data to estimate poverty}
 \usage{
-format_lfst(lfst)
+format_lfst(lfst, dict)
 }
 \arguments{
 \item{lfst}{List from load_list_refy().}
diff --git a/man/get_total_pop.Rd b/man/get_total_pop.Rd
index b73f558c..4808e0e7 100644
--- a/man/get_total_pop.Rd
+++ b/man/get_total_pop.Rd
@@ -2,25 +2,19 @@
 % Please edit documentation in R/fgt_cumsum.R
 \name{get_total_pop}
 \alias{get_total_pop}
-\title{get total population by country year and reprting level}
+\title{Computes total population by group using the output of format_lfst().}
 \usage{
-get_total_pop(LDTg)
+get_total_pop(LDTg, dict)
 }
 \arguments{
 \item{LDTg}{List from format_lfst() with DT and g objects.}
+
+\item{dict}{data dictionary from build_pair_dict()}
 }
 \value{
-data.table  with total population by \verb{g[["groups]]}
-
 data.table with total population by group.
 }
 \description{
-get total population by country year and reprting level
-}
-\examples{
-Get total population by country, year, and reporting level
-
 Computes total population by group using the output of format_lfst().
-
-# get_total_pop(format_lfst(lfst))
 }
+\keyword{internal}

From a182f088057872dc14d7cc3bc5b1e973d0e00462 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Wed, 10 Sep 2025 18:20:54 -0400
Subject: [PATCH 108/203] improve speed in get_mean_median

---
 R/fg_pip.R     | 27 ---------------------------
 R/fgt_cumsum.R |  6 ++++--
 R/utils.R      | 14 +++++++-------
 3 files changed, 11 insertions(+), 36 deletions(-)

diff --git a/R/fg_pip.R b/R/fg_pip.R
index 86ce5f5b..f3481c5c 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -75,33 +75,6 @@ fg_pip <- function(country,
                      povline = povline) |>
     decode_pairs(dict = dict)
 
-  # lt <- lapply(lt, \(x) {
-  #                  add_attributes_as_columns_vectorized(x)
-  #              })
-
-  # Extract some attributes
-  # lt_att <- get_lt_attr(lt)
-
-  # get rows indices
-  # l_rl_rows <- get_rl_rows(lt_att)
-
-
-  # DT <- map_lt_to_dt(lt, l_rl_rows)
-  # setorder(DT, country_code, reporting_year, reporting_level, welfare)
-  # g  <- GRP(DT,
-  #           ~ country_code + reporting_year + reporting_level,
-  #           sort = TRUE)
-
-
-
-
-  # ZP Add: do fgt estimations using `res <- lapply(lt, process_dt, povline = povline)`
-  #-------------------------
-  fgt <- map_fgt(lt, l_rl_rows, povline) |>
-    funique() # TO REMOVE
-
-  # convert reporting year to numeric
-  fgt[, reporting_year := as.numeric(reporting_year)]
 
   # Add just mean and median
   res <- get_mean_median(fgt, lkup, fill_gaps = TRUE)
diff --git a/R/fgt_cumsum.R b/R/fgt_cumsum.R
index 9dd4fb02..02666587 100644
--- a/R/fgt_cumsum.R
+++ b/R/fgt_cumsum.R
@@ -240,7 +240,8 @@ encode_pairs <- function(DT, dict,
 #' @param id_col Name of id column in dict.
 #' @param level_col Name of reporting level column in dict.
 #' @param keep_code Logical, keep code column if TRUE.
-#' @param add_true_vars logical, add `country_code` and `reporting_year`
+#' @param add_true_vars logical, add `country_code` and `reporting_year` and
+#' removes var `id`
 #' @param verbose Integer, verbosity level.
 #'
 #' @return data.table with id and reporting_level columns added.
@@ -276,7 +277,8 @@ decode_pairs <- function(DT, dict,
     out[, `:=`(
         country_code   = gsub("(.+)(_.+)", "\\1", id),
         reporting_year = as.integer(gsub("(.+_)(.+)", "\\2", id))
-      )]
+      )][,
+         id := NULL]
   }
 
   if (!keep_code) out[, (code_col) := NULL]
diff --git a/R/utils.R b/R/utils.R
index 0d84c2ac..cf118506 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -1478,14 +1478,14 @@ get_mean_median <- \(fgt, lkup, fill_gaps) {
   if (isFALSE(lkup$use_new_lineup_version)) return(fgt)
 
   if (fill_gaps) {
-    dist <- lkup$lineup_dist_stats[,
-                                   .(country_code, reporting_year,
-                                     reporting_level, mean, median)]
+    dist <- get_vars(lkup$lineup_dist_stats,
+                     c("country_code", "reporting_year",
+                       "reporting_level", "mean", "median"))
   } else {
-    dist <- lkup$dist_stats[,
-                            .(country_code, reporting_year,
-                              reporting_level, mean,
-                              median = survey_median_ppp)]
+    dist <- get_vars(lkup$dist_stats,
+                     c("country_code", "reporting_year",
+                       "reporting_level", "mean", "survey_median_ppp"))
+    setnames(dist, "survey_median_ppp", "median")
 
   }
   joyn::joyn(x = fgt,

From 72f96e29d03f0a7700b75660e84eb1c230cf45e4 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Wed, 10 Sep 2025 18:23:30 -0400
Subject: [PATCH 109/203] add join rather than joyn

---
 R/utils.R           | 13 ++++++-------
 man/decode_pairs.Rd |  3 ++-
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/R/utils.R b/R/utils.R
index cf118506..60fa57ac 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -1488,13 +1488,12 @@ get_mean_median <- \(fgt, lkup, fill_gaps) {
     setnames(dist, "survey_median_ppp", "median")
 
   }
-  joyn::joyn(x = fgt,
-             y = dist,
-             by = c('country_code', "reporting_year", "reporting_level"),
-             match_type = "m:1", # multiple povlines
-             keep = "left",
-             reportvar = FALSE,
-             verbose = FALSE)
+  join(x = fgt,
+      y = dist,
+      on = c('country_code', "reporting_year", "reporting_level"),
+      how = "left",
+      validate = "m:1", # multiple povlines
+      verbose = 0L)
 }
 
 
diff --git a/man/decode_pairs.Rd b/man/decode_pairs.Rd
index 988829e2..c77164bc 100644
--- a/man/decode_pairs.Rd
+++ b/man/decode_pairs.Rd
@@ -28,7 +28,8 @@ decode_pairs(
 
 \item{keep_code}{Logical, keep code column if TRUE.}
 
-\item{add_true_vars}{logical, add \code{country_code} and \code{reporting_year}}
+\item{add_true_vars}{logical, add \code{country_code} and \code{reporting_year} and
+removes var \code{id}}
 
 \item{verbose}{Integer, verbosity level.}
 }

From 85aaf04238f7f47289f14ee17a46962bdd9c5857 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Wed, 10 Sep 2025 18:27:59 -0400
Subject: [PATCH 110/203] improve documentation

---
 R/fgt_cumsum.R          | 85 +++++++++++++++++++----------------------
 man/build_pair_dict.Rd  | 20 ++--------
 man/decode_pairs.Rd     |  8 ++--
 man/encode_pairs.Rd     |  5 ++-
 man/fgt_cumsum.Rd       | 24 ++++++++++++
 man/format_lfst.Rd      | 31 ++++-----------
 man/get_total_pop.Rd    |  9 +++--
 man/load_list_refy.Rd   | 27 ++-----------
 man/transform_input.Rd  |  2 +-
 man/update_pair_dict.Rd |  5 ++-
 10 files changed, 93 insertions(+), 123 deletions(-)
 create mode 100644 man/fgt_cumsum.Rd

diff --git a/R/fgt_cumsum.R b/R/fgt_cumsum.R
index 02666587..687dd177 100644
--- a/R/fgt_cumsum.R
+++ b/R/fgt_cumsum.R
@@ -1,17 +1,13 @@
 # MAIN -------------------------------
 
-#' format the lfst list to usable data to estimate poverty
+#' Format loaded survey list for grouped poverty analysis
 #'
-#' @param lfst list from load_list_refy()
+#' Combines a list of survey data.tables into a single data.table and encodes group identifiers using a dictionary.
+#' Returns a data.table and a GRP object for efficient grouped operations, used as a preprocessing step for FGT and population calculations.
 #'
-#' @return list with DT and g (GRP object)
-#' @keywords internal
-#' Format lfst list for poverty estimation
-#'
-#' Takes a list from load_list_refy() and returns a list with a data.table and a GRP object for grouped operations.
-#'
-#' @param lfst List from load_list_refy().
-#' @return List with elements: DT (data.table) and g (GRP object).
+#' @param lfst Named list of data.tables, as returned by load_list_refy().
+#' @param dict data.table dictionary for id/reporting_level encoding (from build_pair_dict()).
+#' @return List with elements: DT (data.table of all surveys, with id_rl) and g (GRP object for grouping by id_rl).
 #' @keywords internal
 format_lfst <- \(lfst, dict) {
 
@@ -36,12 +32,14 @@ format_lfst <- \(lfst, dict) {
 
 
-#' Computes total population by group using the output of format_lfst().
+#' Compute total population by survey and reporting level
 #'
-#' @param LDTg List from format_lfst() with DT and g objects.
-#' @param dict data dictionary from build_pair_dict()
+#' Sums the weights for each (id, reporting_level) group in the combined survey data.
+#' Used as a denominator for FGT and Watts index calculations.
 #'
-#' @return data.table with total population by group.
+#' @param LDTg List from format_lfst() with DT and g objects.
+#' @param dict data.table dictionary for id/reporting_level encoding (from build_pair_dict()).
+#' @return data.table with total population by group (columns: id_rl, W).
 #' @keywords internal
 get_total_pop <- \(LDTg, dict) {
   list2env(LDTg, envir = environment())
@@ -54,7 +52,16 @@ get_total_pop <- \(LDTg, dict) {
     encode_pairs(dict, drop_labels = TRUE)
 }
 
-
+#' Compute FGT and Watts indices for all groups and poverty lines
+#'
+#' Calculates headcount, poverty gap, poverty severity, and Watts index for each group and poverty line using cumulative sums.
+#'
+#' @param LDTg List from format_lfst() with DT and g objects.
+#' @param tpop data.table with total population by group (from get_total_pop()).
+#' @param povline Numeric vector of poverty lines.
+#' @param drop_vars Logical, if TRUE returns only summary columns.
+#' @return data.table with FGT and Watts measures by group and poverty line.
+#' @keywords internal
 fgt_cumsum <- \(LDTg, tpop, povline,
                 drop_vars = TRUE) {
   list2env(LDTg, envir = environment())
@@ -122,16 +129,10 @@ fgt_cumsum <- \(LDTg, tpop, povline,
 # 1) Build pair dictionary (DT)   #
 # ------------------------------- #
 
-#' Dictionary for fast joins
-#'
-#' @param lkup lkup object
-#' @param fill_gaps TRUE for lineup years, FALSE for survey years
-#'
-#' @return data.table with dictionary for merges.
-#' @keywords internal
-#' Build dictionary for fast joins
+#' Build dictionary for id/reporting_level encoding
 #'
-#' Creates a data.table dictionary for merging by id and reporting_level.
+#' Creates a data.table dictionary for mapping (id, reporting_level) pairs to integer codes for fast joins and decoding.
+#' Used for efficient merging and decoding in the FGT pipeline.
 #'
 #' @param lkup Lookup object containing refy_lkup and svy_lkup.
 #' @param fill_gaps Logical, TRUE for lineup years, FALSE for survey years.
@@ -164,12 +165,10 @@ build_pair_dict <- function(lkup, fill_gaps = TRUE) {
 # -------------------------------------------- #
 # 2) Encode: add integer code via collapse::join
 # -------------------------------------------- #
-# DT: data.table to encode (by reference not guaranteed since join copies x->result)
-# dict: data.table from build_pair_dict()
-# code_col: name of code column to write
-#' Encode pairs with integer code
+#' Encode (id, reporting_level) pairs as integer codes
 #'
-#' Adds an integer code column to a data.table by joining with a dictionary.
+#' Joins a data.table with a dictionary to add an integer code column for each (id, reporting_level) pair.
+#' Used for efficient grouping and decoding in the FGT pipeline.
 #'
 #' @param DT data.table to encode.
 #' @param dict data.table from build_pair_dict().
@@ -230,9 +229,10 @@ encode_pairs <- function(DT, dict,
 # ------------------------------------------------ #
 # 3) Decode: join labels by code via collapse::join #
 # ------------------------------------------------ #
-#' Decode integer code to id and reporting level
+#' Decode integer code to (id, reporting_level) labels
 #'
-#' Joins labels by code using a dictionary.
+#' Joins a data.table with a dictionary to recover id and reporting_level columns from integer codes.
+#' Used after FGT calculations to restore human-readable labels.
 #'
 #' @param DT data.table to decode.
 #' @param dict data.table from build_pair_dict().
@@ -240,10 +240,8 @@ encode_pairs <- function(DT, dict,
 #' @param id_col Name of id column in dict.
 #' @param level_col Name of reporting level column in dict.
 #' @param keep_code Logical, keep code column if TRUE.
-#' @param add_true_vars logical, add `country_code` and `reporting_year` and
-#' removes var `id`
+#' @param add_true_vars Logical, add country_code and reporting_year columns and remove id.
 #' @param verbose Integer, verbosity level.
-#'
 #' @return data.table with id and reporting_level columns added.
 #' @keywords internal
 decode_pairs <- function(DT, dict,
@@ -288,9 +286,10 @@ decode_pairs <- function(DT, dict,
 # ----------------------------------------------------- #
 # 4) Update dict with new pairs (append-only, fast DT)  #
 # ----------------------------------------------------- #
-#' Update dictionary with new pairs
+#' Update dictionary with new (id, reporting_level) pairs
 #'
-#' Appends new (id, reporting_level) pairs to the dictionary if needed.
+#' Appends new (id, reporting_level) pairs to the dictionary if needed, ensuring all groups are encoded.
+#' Used to keep the dictionary in sync with new survey data.
 #'
 #' @param dict data.table dictionary from build_pair_dict().
 #' @param DT data.table with id and reporting_level columns.
@@ -318,18 +317,12 @@ update_pair_dict <- function(dict, DT,
 
 
-#' load refy list
-#'
-#' @param input_list list. output from [create_full_list]
-#' @param path character: directory path
-#'
-#' @return character vector
-#' @keywords internal
-#' Load refy list
+#' Load survey data from file list
 #'
-#' Loads a list of files and returns a named list of data.tables, each with an id column.
+#' Reads a list of survey files (e.g., .fst) and returns a named list of data.tables, each with an id column.
+#' Used as the first step in the pipeline after creating the file list.
 #'
-#' @param input_list Character vector of file paths (output from create_full_list).
+#' @param input_list Character vector of file paths (from create_full_list()).
 #' @return Named list of data.tables, each with an id column.
 #' @keywords internal
 load_list_refy <- \(input_list){
diff --git a/man/build_pair_dict.Rd b/man/build_pair_dict.Rd
index 6fe82339..c0c1d212 100644
--- a/man/build_pair_dict.Rd
+++ b/man/build_pair_dict.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/fgt_cumsum.R
 \name{build_pair_dict}
 \alias{build_pair_dict}
-\title{Dictionary for fast joins}
+\title{Build dictionary for id/reporting_level encoding}
 \usage{
 build_pair_dict(lkup, fill_gaps = TRUE)
 }
@@ -12,24 +12,10 @@ build_pair_dict(lkup, fill_gaps = TRUE)
 \item{fill_gaps}{Logical, TRUE for lineup years, FALSE for survey years.}
 }
 \value{
-data.table with dictionary for merges.
-
 data.table with columns id, reporting_level, and code.
 }
 \description{
-Dictionary for fast joins
+Creates a data.table dictionary for mapping (id, reporting_level) pairs to integer codes for fast joins and decoding.
+Used for efficient merging and decoding in the FGT pipeline.
 }
-\keyword{Build}
-\keyword{Creates}
-\keyword{a}
-\keyword{and}
-\keyword{by}
-\keyword{data.table}
-\keyword{dictionary}
-\keyword{fast}
-\keyword{for}
-\keyword{id}
 \keyword{internal}
-\keyword{joins}
-\keyword{merging}
-\keyword{reporting_level.}
diff --git a/man/decode_pairs.Rd b/man/decode_pairs.Rd
index c77164bc..cefc628b 100644
--- a/man/decode_pairs.Rd
+++ b/man/decode_pairs.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/fgt_cumsum.R
 \name{decode_pairs}
 \alias{decode_pairs}
-\title{Decode integer code to id and reporting level}
+\title{Decode integer code to (id, reporting_level) labels}
 \usage{
 decode_pairs(
   DT,
@@ -28,8 +28,7 @@ decode_pairs(
 
 \item{keep_code}{Logical, keep code column if TRUE.}
 
-\item{add_true_vars}{logical, add \code{country_code} and \code{reporting_year} and
-removes var \code{id}}
+\item{add_true_vars}{Logical, add country_code and reporting_year columns and remove id.}
 
 \item{verbose}{Integer, verbosity level.}
 }
@@ -37,6 +36,7 @@ removes var \code{id}}
 data.table with id and reporting_level columns added.
 }
 \description{
-Joins labels by code using a dictionary.
+Joins a data.table with a dictionary to recover id and reporting_level columns from integer codes.
+Used after FGT calculations to restore human-readable labels.
 }
 \keyword{internal}
diff --git a/man/encode_pairs.Rd b/man/encode_pairs.Rd
index 162a095c..01841436 100644
--- a/man/encode_pairs.Rd
+++ b/man/encode_pairs.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/fgt_cumsum.R
 \name{encode_pairs}
 \alias{encode_pairs}
-\title{Encode pairs with integer code}
+\title{Encode (id, reporting_level) pairs as integer codes}
 \usage{
 encode_pairs(
   DT,
@@ -36,6 +36,7 @@ encode_pairs(
 data.table with code column added.
 }
 \description{
-Adds an integer code column to a data.table by joining with a dictionary.
+Joins a data.table with a dictionary to add an integer code column for each (id, reporting_level) pair.
+Used for efficient grouping and decoding in the FGT pipeline.
 }
 \keyword{internal}
diff --git a/man/fgt_cumsum.Rd b/man/fgt_cumsum.Rd
new file mode 100644
index 00000000..535ff3e9
--- /dev/null
+++ b/man/fgt_cumsum.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/fgt_cumsum.R
+\name{fgt_cumsum}
+\alias{fgt_cumsum}
+\title{Compute FGT and Watts indices for all groups and poverty lines}
+\usage{
+fgt_cumsum(LDTg, tpop, povline, drop_vars = TRUE)
+}
+\arguments{
+\item{LDTg}{List from format_lfst() with DT and g objects.}
+
+\item{tpop}{data.table with total population by group (from get_total_pop()).}
+
+\item{povline}{Numeric vector of poverty lines.}
+
+\item{drop_vars}{Logical, if TRUE returns only summary columns.}
+}
+\value{
+data.table with FGT and Watts measures by group and poverty line.
+}
+\description{
+Calculates headcount, poverty gap, poverty severity, and Watts index for each group and poverty line using cumulative sums.
+}
+\keyword{internal}
diff --git a/man/format_lfst.Rd b/man/format_lfst.Rd
index 806e6161..c1268268 100644
--- a/man/format_lfst.Rd
+++ b/man/format_lfst.Rd
@@ -2,37 +2,20 @@
 % Please edit documentation in R/fgt_cumsum.R
 \name{format_lfst}
 \alias{format_lfst}
-\title{format the lfst list to usable data to estimate poverty}
+\title{Format loaded survey list for grouped poverty analysis}
 \usage{
 format_lfst(lfst, dict)
 }
 \arguments{
-\item{lfst}{List from load_list_refy().}
+\item{lfst}{Named list of data.tables, as returned by load_list_refy().}
+
+\item{dict}{data.table dictionary for id/reporting_level encoding (from build_pair_dict()).}
 }
 \value{
-list with DT and g (GRP object)
-
-List with elements: DT (data.table) and g (GRP object).
+List with elements: DT (data.table of all surveys, with id_rl) and g (GRP object for grouping by id_rl).
 }
 \description{
-format the lfst list to usable data to estimate poverty
+Combines a list of survey data.tables into a single data.table and encodes group identifiers using a dictionary.
+Returns a data.table and a GRP object for efficient grouped operations, used as a preprocessing step for FGT and population calculations.
 }
-\keyword{Format}
-\keyword{GRP}
-\keyword{Takes}
-\keyword{a}
-\keyword{and}
-\keyword{data.table}
-\keyword{estimation}
-\keyword{for}
-\keyword{from}
-\keyword{grouped}
 \keyword{internal}
-\keyword{lfst}
-\keyword{list}
-\keyword{load_list_refy()}
-\keyword{object}
-\keyword{operations.}
-\keyword{poverty}
-\keyword{returns}
-\keyword{with}
diff --git a/man/get_total_pop.Rd b/man/get_total_pop.Rd
index 4808e0e7..a83525bd 100644
--- a/man/get_total_pop.Rd
+++ b/man/get_total_pop.Rd
@@ -2,19 +2,20 @@
 % Please edit documentation in R/fgt_cumsum.R
 \name{get_total_pop}
 \alias{get_total_pop}
-\title{Computes total population by group using the output of format_lfst().}
+\title{Compute total population by survey and reporting level}
 \usage{
 get_total_pop(LDTg, dict)
 }
 \arguments{
 \item{LDTg}{List from format_lfst() with DT and g objects.}
 
-\item{dict}{data dictionary from build_pair_dict()}
+\item{dict}{data.table dictionary for id/reporting_level encoding (from build_pair_dict()).}
 }
 \value{
-data.table with total population by group.
+data.table with total population by group (columns: id_rl, W).
 }
 \description{
-Computes total population by group using the output of format_lfst().
+Sums the weights for each (id, reporting_level) group in the combined survey data.
+Used as a denominator for FGT and Watts index calculations.
 }
 \keyword{internal}
diff --git a/man/load_list_refy.Rd b/man/load_list_refy.Rd
index 75fb33df..b91f4d82 100644
--- a/man/load_list_refy.Rd
+++ b/man/load_list_refy.Rd
@@ -2,37 +2,18 @@
 % Please edit documentation in R/fgt_cumsum.R
 \name{load_list_refy}
 \alias{load_list_refy}
-\title{load refy list}
+\title{Load survey data from file list}
 \usage{
 load_list_refy(input_list)
 }
 \arguments{
-\item{input_list}{Character vector of file paths (output from create_full_list).}
-
-\item{path}{character: directory path}
+\item{input_list}{Character vector of file paths (from create_full_list()).}
 }
 \value{
-character vector
-
 Named list of data.tables, each with an id column.
 }
 \description{
-load refy list
+Reads a list of survey files (e.g., .fst) and returns a named list of data.tables, each with an id column.
+Used as the first step in the pipeline after creating the file list.
 }
-\keyword{Load}
-\keyword{Loads}
-\keyword{a}
-\keyword{an}
-\keyword{and}
-\keyword{column.}
-\keyword{data.tables,}
-\keyword{each}
-\keyword{files}
-\keyword{id}
 \keyword{internal}
-\keyword{list}
-\keyword{named}
-\keyword{of}
-\keyword{refy}
-\keyword{returns}
-\keyword{with}
diff --git a/man/transform_input.Rd b/man/transform_input.Rd
index 8415f5ac..14664b98 100644
--- a/man/transform_input.Rd
+++ b/man/transform_input.Rd
@@ -7,7 +7,7 @@
 transform_input(input_list)
 }
 \arguments{
-\item{input_list}{Character vector of file paths (output from create_full_list).}
+\item{input_list}{Character vector of file paths (from create_full_list()).}
 }
 \value{
 formated list
diff --git a/man/update_pair_dict.Rd b/man/update_pair_dict.Rd
index a142e98d..4a41a16c 100644
--- a/man/update_pair_dict.Rd
+++ b/man/update_pair_dict.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/fgt_cumsum.R
 \name{update_pair_dict}
 \alias{update_pair_dict}
-\title{Update dictionary with new pairs}
+\title{Update dictionary with new (id, reporting_level) pairs}
 \usage{
 update_pair_dict(dict, DT, id_col = "id", level_col = "reporting_level")
 }
@@ -19,6 +19,7 @@ update_pair_dict(dict, DT, id_col = "id", level_col = "reporting_level")
 Updated data.table dictionary.
 }
 \description{
-Appends new (id, reporting_level) pairs to the dictionary if needed.
+Appends new (id, reporting_level) pairs to the dictionary if needed, ensuring all groups are encoded.
+Used to keep the dictionary in sync with new survey data.
 }
 \keyword{internal}

From 37a86ba782ce57204558f03c242d0fb56c1ae9db Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Wed, 10 Sep 2025 21:03:45 -0400
Subject: [PATCH 111/203] change lkup to read from fst

---
 R/create_lkups.R |  4 +++-
 R/fg_pip.R       | 28 ++++++++++++++++++----------
 R/fgt_cumsum.R   | 12 +++++-------
 3 files changed, 26 insertions(+), 18 deletions(-)

diff --git a/R/create_lkups.R b/R/create_lkups.R
index 9f2a847e..6b42381d 100644
--- a/R/create_lkups.R
+++ b/R/create_lkups.R
@@ -186,6 +186,8 @@ create_lkups <- function(data_dir, versions) {
 
 
   # ZP ADD - CREATE OBJECT: refy_lkup
+
+  # CREATE OBJECT: refy_lkup -------------
   #___________________________________________________________________________
   if (use_new_lineup_version) {
       refy_lkup_path <- fs::path(data_dir,
@@ -279,7 +281,7 @@ create_lkups <- function(data_dir, versions) {
                             paste0(country_code,
                                    "_",
                                    reporting_year),
-                            ext = "qs") |>
+                            ext = "fst") |>
                      as.character()
                  }
       ]
diff --git a/R/fg_pip.R b/R/fg_pip.R
index f3481c5c..81a6bd11 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -58,23 +58,31 @@ fg_pip <- function(country,
                 data_in_cache = data_present_in_master))
   }
 
-  dict <- build_pair_dict(lkup = lkup, fill_gaps = TRUE)
-
+  # Build a dictionary for encoding (id, reporting_level) pairs as integer codes.
+  dict <- build_pair_dict(lkup = lkup,
+                          fill_gaps = TRUE)
 
+  # Create a list of file paths for all surveys to be loaded, based on the filtered metadata.
   full_list <- create_full_list(metadata = metadata)
 
-  lfst <-
-    load_list_refy(input_list = full_list)
+  # Load all survey data files into a named list of data.tables, each with an id column.
+  lfst <- load_list_refy(input_list = full_list)
 
+  # Combine all loaded surveys into a single data.table, encode group identifiers,
+  # and create a GRP object for efficient grouping.
   LDTg <- format_lfst(lfst = lfst,
                       dict = dict)
-  tpop <- get_total_pop(LDTg = LDTg,
-                        dict = dict)
-  fgt <-  fgt_cumsum(LDTg   = LDTg,
-                     tpop   = tpop,
-                     povline = povline) |>
-    decode_pairs(dict = dict)
 
+  # Compute the total population (sum of weights) for each group (id_rl) in
+  # the combined survey data.
+  tpop <- get_total_pop(LDTg = LDTg)
+
+  # Compute FGT and Watts indices for all groups and poverty lines, then decode
+  # integer codes back to (country_code, reporting_year, reporting_level).
+  fgt <- fgt_cumsum(LDTg = LDTg,
+                    tpop = tpop,
+                    povline = povline) |>
+    decode_pairs(dict = dict)
 
   # Add just mean and median
   res <- get_mean_median(fgt, lkup, fill_gaps = TRUE)
diff --git a/R/fgt_cumsum.R b/R/fgt_cumsum.R
index 687dd177..cb356415 100644
--- a/R/fgt_cumsum.R
+++ b/R/fgt_cumsum.R
@@ -38,18 +38,16 @@ format_lfst <- \(lfst, dict) {
 #' Used as a denominator for FGT and Watts index calculations.
 #'
 #' @param LDTg List from format_lfst() with DT and g objects.
-#' @param dict data.table dictionary for id/reporting_level encoding (from build_pair_dict()).
 #' @return data.table with total population by group (columns: id_rl, W).
 #' @keywords internal
-get_total_pop <- \(LDTg, dict) {
+get_total_pop <- \(LDTg) {
   list2env(LDTg, envir = environment())
   rm(LDTg)
   add_vars(g[["groups"]],
            get_vars(DT, c("weight")) |>
            fsum(g)) |>
     setnames(old = "weight",
-             new =  "W") |>
-    encode_pairs(dict, drop_labels = TRUE)
+             new =  "W")
 }
 
 #' Compute FGT and Watts indices for all groups and poverty lines
@@ -110,8 +108,8 @@ fgt_cumsum <- \(LDTg, tpop, povline,
   # 6) Compute measures (vectorized). Small clamps for numerical safety.
   CS[, `:=`(
     headcount        = cw / W,
-    poverty_gap      = (z * cw - cwy) / (z_s * W),
-    poverty_severity = (z2 * cw - 2 * z * cwy + cwy2) / (z2_s * W),
+    poverty_gap      = (z * cw - cwy) / (z * W),
+    poverty_severity = (z2 * cw - 2 * z * cwy + cwy2) / (z2 * W),
     watts            = (logz * cw - cwylog) / W
   )]
 
@@ -340,7 +338,7 @@ load_list_refy <- \(input_list){
 
   lfst <- lapply(seq_flex(input_list),
                  \(i) {
-                   x <- lup_files[i]
+                   x <- input_list[i]
                    idn <- fs::path_file(x) |>
                      fs::path_ext_remove()
                    fst::read_fst(x, as.data.table = TRUE) |>

From b02fbef1fa03ad6ca5c9c6bd10603eaba3bd6d3c Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Wed, 10 Sep 2025 21:18:06 -0400
Subject: [PATCH 112/203] pip(fill_gaps = TRUE) fully working

---
 R/fg_pip.R           | 4 +---
 R/fgt_cumsum.R       | 4 +++-
 man/get_total_pop.Rd | 4 +---
 3 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/R/fg_pip.R b/R/fg_pip.R
index 81a6bd11..d4323ef1 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -127,9 +127,7 @@ fg_pip <- function(country,
               verbose       = 0,
               overid        = 2)
 
-  setnames(out,
-           "povline",
-           "poverty_line")
+  setnames(out, "povline", "poverty_line")
 
   # Ensure that out does not have duplicates
   out <- fg_remove_duplicates(out,
diff --git a/R/fgt_cumsum.R b/R/fgt_cumsum.R
index cb356415..d34c3ecd 100644
--- a/R/fgt_cumsum.R
+++ b/R/fgt_cumsum.R
@@ -113,10 +113,12 @@ fgt_cumsum <- \(LDTg, tpop, povline,
     watts            = (logz * cw - cwylog) / W
   )]
 
+  setnames(CS, "z", "povline")
   if (!drop_vars) {
     return(CS)
   }
-  get_vars(CS, c("id_rl", "headcount", "poverty_gap", "poverty_severity", "watts"))
+  get_vars(CS, c("id_rl", "povline", "headcount",
+                 "poverty_gap", "poverty_severity", "watts"))
 
 }
 
diff --git a/man/get_total_pop.Rd b/man/get_total_pop.Rd
index a83525bd..97b36860 100644
--- a/man/get_total_pop.Rd
+++ b/man/get_total_pop.Rd
@@ -4,12 +4,10 @@
 \alias{get_total_pop}
 \title{Compute total population by survey and reporting level}
 \usage{
-get_total_pop(LDTg, dict)
+get_total_pop(LDTg)
 }
 \arguments{
 \item{LDTg}{List from format_lfst() with DT and g objects.}
-
-\item{dict}{data.table dictionary for id/reporting_level encoding (from build_pair_dict()).}
 }
 \value{
 data.table with total population by group (columns: id_rl, W).

From 47434242f79952f26f9d0584ae7fbbdfa1b29dfb Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Wed, 10 Sep 2025 21:18:26 -0400
Subject: [PATCH 113/203] Increment version number to 1.3.19.9003

---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 0f625a0e..b646fcde 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: pipapi
 Title: API for the Poverty and Inequality Platform
-Version: 1.3.19.9002
+Version: 1.3.19.9003
 Authors@R: 
     c(person(given = "Tony",
             family = "Fujs",

From f35bfed10783aa509f8424d16b96892701cee730 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Thu, 11 Sep 2025 10:42:09 -0400
Subject: [PATCH 114/203] fix issue with censoring

---
 R/pip_agg.R                 | 2 +-
 inst/plumber/v1/endpoints.R | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/R/pip_agg.R b/R/pip_agg.R
index 52b18bcd..0d5485fb 100644
--- a/R/pip_agg.R
+++ b/R/pip_agg.R
@@ -14,7 +14,7 @@ pip_agg <- function(country         = "ALL",
                     welfare_type    = c("all", "consumption", "income"),
                     reporting_level = c("all", "national"),
                     lkup,
-                    censor          = TRUE,
+                    censor          = FALSE,
                     lkup_hash       = lkup$cache_data_id$hash_pip_grp,
                     additional_ind  = FALSE) {
 
diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R
index 77c83c07..5e42b7c7 100644
--- a/inst/plumber/v1/endpoints.R
+++ b/inst/plumber/v1/endpoints.R
@@ -281,6 +281,7 @@ function(req, res) {
   res$serializer <- pipapi::assign_serializer(format = params$format)
   params$format  <- NULL
   params$version <- NULL
+  params$censor  <- TRUE
 
   out <- do.call(pipapi::pip_agg, params)
   out

From cebe90c4889aaf417bd6fb9264339abc7b1e9814 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Thu, 11 Sep 2025 10:46:53 -0400
Subject: [PATCH 115/203] fix bug

---
 R/pip_grp_new.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/pip_grp_new.R b/R/pip_grp_new.R
index e4880abb..86f68b6d 100644
--- a/R/pip_grp_new.R
+++ b/R/pip_grp_new.R
@@ -82,7 +82,7 @@ pip_grp_new <- \(country         = "ALL",
       out <- censor_rows(out, lkup[["censored"]], type = "regions")
     }
 
-
+    out
 }
 
 #' Subset country_code values based on matches in *_code columns and country_code

From a010a8cd63488a9af72a6882161dd92b168fd767 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Thu, 11 Sep 2025 14:22:38 -0400
Subject: [PATCH 116/203] update ui_hp_stacked to account for use_new

---
 R/pip_grp_new.R  |  2 +-
 R/ui_home_page.R | 49 +++++++++++++++++++++++++++++++++++-------------
 2 files changed, 37 insertions(+), 14 deletions(-)

diff --git a/R/pip_grp_new.R b/R/pip_grp_new.R
index 86f68b6d..ecd7ce22 100644
--- a/R/pip_grp_new.R
+++ b/R/pip_grp_new.R
@@ -16,7 +16,7 @@ pip_grp_new <- \(country         = "ALL",
   # subgroups aggregations only supported for "all" countries
   country <- toupper(country)
   year    <- toupper(year)
-  reporting_level <- "all"
+
   if (!all(country %in% c("ALL", lkup$query_controls$region$values))) {
     country <- "ALL"
   }
diff --git a/R/ui_home_page.R b/R/ui_home_page.R
index 04105c59..58230063 100644
--- a/R/ui_home_page.R
+++ b/R/ui_home_page.R
@@ -15,20 +15,43 @@ ui_hp_stacked <- function(povline = 1.9,
   ref_years <- sort(unique(lkup$ref_lkup$reporting_year))
   ref_years <- ref_years[!ref_years %in% c(1981:1989)]
 
-  out <- pip_grp(
-    country = "all",
-    year = ref_years,
-    povline = povline,
-    group_by = "wb",
-    reporting_level = "national",
-    censor = FALSE,
-    lkup = lkup
-  )
+  use_new <- lkup$use_new_lineup_version
+
+  # Run correct function
+  #-------------------------------------
+  out <- if (use_new) {
+   x <-  pip_grp_new(country         = "ALL",
+                year            = ref_years,
+                povline         = povline,
+                welfare_type    = "all",
+                reporting_level = "national",
+                lkup            = lkup,
+                censor          = FALSE)
+
+   regs <- lkup$aux_files$country_list[, region_code] |>
+     funique() |>
+     c("WLD")
+
+   x |>
+     fsubset(region_code %in% regs)
+
+  } else {
+    pip_grp(
+      country = "all",
+      year = ref_years,
+      povline = povline,
+      group_by = "wb",
+      reporting_level = "national",
+      censor = FALSE,
+      lkup = lkup
+    )
+
+  }
+
+  out <- get_vars(out,
+                  c("region_code", "reporting_year",
+                    "poverty_line", "pop_in_poverty"))
 
-  out <- out[, c(
-    "region_code", "reporting_year",
-    "poverty_line", "pop_in_poverty"
-  )]
 
   return(out)
 }

From 48cf146107d8f3b7c13a9c67a1a87ad57e80747d Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Thu, 11 Sep 2025 14:26:16 -0400
Subject: [PATCH 117/203] improve ui_hp_countries

---
 R/ui_home_page.R | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/R/ui_home_page.R b/R/ui_home_page.R
index 58230063..6b230aea 100644
--- a/R/ui_home_page.R
+++ b/R/ui_home_page.R
@@ -80,13 +80,16 @@ ui_hp_countries <- function(country = c("IDN", "CIV"),
   )
 
   # Add pop_in_poverty and scale according to pop_units
-  out$pop_in_poverty <- out$reporting_pop * out$headcount / pop_units
-  out$reporting_pop <- out$reporting_pop / pop_units
+  out[,
+      `:=`(
+        pop_in_poverty = reporting_pop * headcount / pop_units,
+        reporting_pop  = reporting_pop / pop_units
+        )]
 
-  out <- out[, c(
+  out <- get_vars(out, c(
     "region_code", "country_code", "reporting_year",
     "poverty_line", "reporting_pop", "pop_in_poverty"
-  )]
+  ))
 
   return(out)
 }

From ae6818c9e3e7577bfc4657475007fd4de30ad2d2 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Thu, 11 Sep 2025 14:54:56 -0400
Subject: [PATCH 118/203] improve ui_pc_charts and ui_pc_regional

---
 R/pip.R                     |  2 +-
 R/ui_poverty_indicators.R   | 33 +++++++++++++++++++++------------
 inst/plumber/v1/endpoints.R |  3 ++-
 3 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/R/pip.R b/R/pip.R
index 0689ef26..44a4d556 100644
--- a/R/pip.R
+++ b/R/pip.R
@@ -52,7 +52,7 @@ pip <- function(country         = "ALL",
                 reporting_level = c("all", "national", "rural", "urban"),
                 ppp             = NULL,
                 lkup,
-                censor          = TRUE,
+                censor          = FALSE,
                 lkup_hash       = lkup$cache_data_id$hash_pip,
                 additional_ind  = FALSE) {
 
diff --git a/R/ui_poverty_indicators.R b/R/ui_poverty_indicators.R
index f2758a13..b4e2b26d 100644
--- a/R/ui_poverty_indicators.R
+++ b/R/ui_poverty_indicators.R
@@ -14,6 +14,7 @@ ui_pc_charts <- function(country = c("AGO"),
                          welfare_type = c("all", "consumption", "income"),
                          reporting_level = c("all", "national", "rural", "urban"),
                          pop_units = 1e6,
+                         censor  = TRUE,
                          lkup) {
   # Set returned columns
   return_cols           <- lkup$return_cols$ui_pc_charts$cols
@@ -30,12 +31,17 @@ ui_pc_charts <- function(country = c("AGO"),
     fill_gaps       = fill_gaps,
     group_by        = group_by,
     reporting_level = reporting_level,
-    lkup            = lkup
+    lkup            = lkup,
+    censor          = censor
   )
 
   # Add pop_in_poverty and scale according to pop_units
-  out$pop_in_poverty <- out$reporting_pop * out$headcount / pop_units
-  out$reporting_pop <- out$reporting_pop / pop_units
+  out[,
+      `:=`(
+        pop_in_poverty = reporting_pop * headcount / pop_units,
+        reporting_pop  = reporting_pop / pop_units
+      )]
+
 
   # handle different responses when fill_gaps = TRUE / FALSE
   # Return all columns when survey years are requested
@@ -45,7 +51,7 @@ ui_pc_charts <- function(country = c("AGO"),
     out <- out[, .SD, .SDcols = return_cols]
 
   } else {
-    out <- out[, .SD, .SDcols = return_cols]
+    out <- get_vars(out, return_cols)
     # Set non-interpolated variables to NA if line-up years are requested
     out[, (inequality_indicators) := NA]
     out[, survey_comparability := NA]   # remove manually survey_comparability
@@ -76,16 +82,19 @@ ui_pc_regional <- function(country   = "ALL",
   }
 
   out <- pip_agg(country         = country,
-                       year            = year,
-                       group_by        = "wb",
-                       reporting_level = "national",
-                       povline         = povline,
-                       lkup            = lkup,
-                       censor          = TRUE)
+                 year            = year,
+                 group_by        = "wb",
+                 reporting_level = "national",
+                 povline         = povline,
+                 lkup            = lkup,
+                 censor          = TRUE)
 
   # Add pop_in_poverty and scale according to pop_units
-  out$pop_in_poverty <- out$reporting_pop * out$headcount / pop_units
-  out$reporting_pop <- out$reporting_pop / pop_units
+  out[,
+      `:=`(
+        pop_in_poverty = reporting_pop * headcount / pop_units,
+        reporting_pop  = reporting_pop / pop_units
+      )]
 
   out <- out[estimate_type == "actual"]
 
diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R
index 5e42b7c7..102a7dc6 100644
--- a/inst/plumber/v1/endpoints.R
+++ b/inst/plumber/v1/endpoints.R
@@ -281,7 +281,6 @@ function(req, res) {
   res$serializer <- pipapi::assign_serializer(format = params$format)
   params$format  <- NULL
   params$version <- NULL
-  params$censor  <- TRUE
 
   out <- do.call(pipapi::pip_agg, params)
   out
@@ -676,6 +675,7 @@ function(req) {
   params <- req$argsQuery
   params$lkup <- lkups$versions_paths[[req$argsQuery$version]]
   params$version <- NULL
+  params$censor  <- TRUE
 
   out <- do.call(pipapi::ui_pc_charts, params)
   return(out)
@@ -700,6 +700,7 @@ function(req) {
   params$lkup <- lkups$versions_paths[[req$argsQuery$version]]
   params$pop_units <- 1
   params$version <- NULL
+  params$censor  <- TRUE
 
   do.call(pipapi::ui_pc_charts, params)
 

From 5be1cdc7005b26277dfb6d8846a8cb5115fdedc4 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Thu, 11 Sep 2025 15:23:05 -0400
Subject: [PATCH 119/203] fix get_mean_median to account for welfare_type in
 survey years

---
 R/ui_country_profile.R | 14 ++++----------
 R/utils.R              | 10 ++++++++--
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/R/ui_country_profile.R b/R/ui_country_profile.R
index ad64f538..f730c1f8 100644
--- a/R/ui_country_profile.R
+++ b/R/ui_country_profile.R
@@ -52,14 +52,6 @@ ui_cp_ki_headcount <- function(country,
                  reporting_level = "all",
                  lkup            = lkup)
 
-  # Select max year and country
-  # res <-
-  #   res_all[res_all[,
-  #                 .I[which.max(reporting_year)],
-  #                 by = country_code]$V1
-  #         ][
-  #           country_code == country
-  #           ]
   res <-
     res_all[country_code == country
             ][,
@@ -83,8 +75,10 @@ ui_cp_ki_headcount <- function(country,
   ### TEMP FIX END
 
   out <- data.table::data.table(
-    country_code = country, reporting_year = res$reporting_year,
-    poverty_line = povline, headcount = res$headcount
+    country_code   = country,
+    reporting_year = res$reporting_year,
+    poverty_line   = povline,
+    headcount      = res$headcount
   )
   return(out)
 }
diff --git a/R/utils.R b/R/utils.R
index 60fa57ac..f4eef7da 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -1481,16 +1481,22 @@ get_mean_median <- \(fgt, lkup, fill_gaps) {
     dist <- get_vars(lkup$lineup_dist_stats,
                      c("country_code", "reporting_year",
                        "reporting_level", "mean", "median"))
+    by_var <- c('country_code', "reporting_year", "reporting_level")
   } else {
     dist <- get_vars(lkup$dist_stats,
                      c("country_code", "reporting_year",
-                       "reporting_level", "mean", "survey_median_ppp"))
+                       "reporting_level", "mean",
+                       "survey_median_ppp", "welfare_type"))
     setnames(dist, "survey_median_ppp", "median")
 
+    by_var <- c('country_code',
+                "reporting_year",
+                "reporting_level",
+                "welfare_type")
   }
   join(x = fgt,
       y = dist,
-      on = c('country_code', "reporting_year", "reporting_level"),
+      on = by_var,
       how = "left",
       validate = "m:1", # multiple povlines
       verbose = 0L)

From 02c2ca021c777968fa839602ed5e79213b565076 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Thu, 11 Sep 2025 16:22:54 -0400
Subject: [PATCH 120/203] improve country_profile

---
 R/ui_country_profile.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/ui_country_profile.R b/R/ui_country_profile.R
index f730c1f8..20fe7dac 100644
--- a/R/ui_country_profile.R
+++ b/R/ui_country_profile.R
@@ -311,7 +311,7 @@ ui_cp_download <- function(country   = "AGO",
   hc <- lapply(country, \(.) {
     ui_cp_ki_headcount(., year, povline, lkup)
   }) |>
-    data.table::rbindlist(use.names = TRUE)
+    rbindlist(use.names = TRUE)
 
   df <- lkup[["cp_lkups"]]$flat$flat_cp
   df <- df[country_code %chin% country]

From 391c78195913090391e3a3fc19603f0898abf4be Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Sat, 13 Sep 2025 20:54:36 -0400
Subject: [PATCH 121/203] Increment version number to 1.3.19.9004

---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index b646fcde..2e8242b1 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: pipapi
 Title: API for the Poverty and Inequality Platform
-Version: 1.3.19.9003
+Version: 1.3.19.9004
 Authors@R: 
     c(person(given = "Tony",
             family = "Fujs",

From 79ff74226ff49f7132a35026aaa9e0bb0d82d969 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Mon, 15 Sep 2025 17:12:44 -0400
Subject: [PATCH 122/203] add popshare to subset_lkup

---
 R/rg_pip.R | 3 ++-
 R/utils.R  | 9 ++++++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/R/rg_pip.R b/R/rg_pip.R
index 769878ae..279f4d49 100644
--- a/R/rg_pip.R
+++ b/R/rg_pip.R
@@ -30,7 +30,8 @@ rg_pip <- function(country,
     data_dir        = data_dir,
     povline         = povline,
     cache_file_path = cache_file_path,
-    fill_gaps       = FALSE
+    fill_gaps       = FALSE,
+    popshare        = popshare
   )
 
   data_present_in_master <- metadata$data_present_in_master
diff --git a/R/utils.R b/R/utils.R
index f4eef7da..88011ff4 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -16,9 +16,16 @@ subset_lkup <- function(country,
                         data_dir = NULL,
                         povline,
                         cache_file_path,
-                        fill_gaps
+                        fill_gaps,
+                        popshare = NULL
                         ) {
 
+  if (!is.null(popshare)) {
+    return(list(data_present_in_master = NULL,
+                lkup                   = lkup,
+                povline                = povline))
+  }
+
   # STEP 1 - Keep every row by default
   keep <- rep(TRUE, nrow(lkup))
   # STEP 2 - Select countries

From 3d499358e03d5bbcddf7984cf573636852ca3213 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Mon, 15 Sep 2025 18:02:38 -0400
Subject: [PATCH 123/203] update round to 2 decimals

---
 inst/plumber/v1/endpoints.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R
index 102a7dc6..e7345ecb 100644
--- a/inst/plumber/v1/endpoints.R
+++ b/inst/plumber/v1/endpoints.R
@@ -151,7 +151,7 @@ function(req, res) {
     # Round poverty line
     # This is to prevent users to abuse the API by passing too many decimals
     if (!is.null(req$argsQuery$povline)) {
-      req$argsQuery$povline <- round(req$argsQuery$povline, digits = 3)
+      req$argsQuery$povline <- round(req$argsQuery$povline, digits = 2)
     }
   }
   plumber::forward()

From ec932256e9249bc04c48210e5c69fc9d78f38d4a Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Mon, 15 Sep 2025 18:05:38 -0400
Subject: [PATCH 124/203] In case of error, make sure you log the endpoint
 closes #432

---
 inst/plumber/v1/plumber.R | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/inst/plumber/v1/plumber.R b/inst/plumber/v1/plumber.R
index 9df24ad4..c5441da4 100644
--- a/inst/plumber/v1/plumber.R
+++ b/inst/plumber/v1/plumber.R
@@ -28,6 +28,12 @@ plumber::pr(endpoints_path) |>
   plumber::pr_hook("exit", function() {
     # log_info('Bye bye: {proc.time()[["elapsed"]]}')
   }) |>
+  plumber::pr_set_error(function(req, res, err) {
+    # In case of error, make sure you log the endpoint for #432
+    method <- req$REQUEST_METHOD
+    path <- req$PATH_INFO
+    cat(sprintf("ERROR at %s %s: %s\n", method, path, err$message))
+  }) |>
   # Set API spec
   plumber::pr_set_api_spec(api = function(spec) {
     spec$info$version <- utils::packageVersion("pipapi") |>

From 2b40da8aeee31a901f0c94ab7fa0c591edaab0d0 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Mon, 15 Sep 2025 18:07:41 -0400
Subject: [PATCH 125/203] include povline in aggregates list

---
 R/add_agg_stats.R | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/R/add_agg_stats.R b/R/add_agg_stats.R
index 5b93067d..bbc81ed9 100644
--- a/R/add_agg_stats.R
+++ b/R/add_agg_stats.R
@@ -16,7 +16,8 @@ add_agg_stats <- function(df,
     aggregated_list <- split(aggregated,
       interaction(
         aggregated$country_code,
-        aggregated$reporting_year
+        aggregated$reporting_year,
+        aggregated$poverty_line
       ),
       drop = TRUE
     )

From 33b685d3482513387967b9e31d75deaf2dcd33fc Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Mon, 15 Sep 2025 18:08:48 -0400
Subject: [PATCH 126/203] add gc() each time it disconnects

---
 R/duckdb_func.R | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/R/duckdb_func.R b/R/duckdb_func.R
index 5071e1ca..bc9c89be 100644
--- a/R/duckdb_func.R
+++ b/R/duckdb_func.R
@@ -265,6 +265,7 @@ update_master_file <- function(dat,
   "))
 
   duckdb::dbDisconnect(write_con)
+  invisible(gc())
   if (nr > 0 && verbose)  message(glue("{target_file} is updated."))
 
   return(nr)
@@ -335,6 +336,7 @@ reset_cache <- function(pass = Sys.getenv('PIP_CACHE_LOCAL_KEY'),
     DBI::dbExecute(write_con, "DELETE from fg_master_file")
   }
   duckdb::dbDisconnect(write_con)
+  invisible(gc())
 }
 
 create_duckdb_file <- function(cache_file_path) {
@@ -360,6 +362,7 @@ create_duckdb_file <- function(cache_file_path) {
                  watts     DOUBLE
   )")
   DBI::dbDisconnect(con)
+  invisible(gc())
 }
 
 
@@ -394,6 +397,7 @@ load_inter_cache <- function(lkup = NULL,
   # connection object if it is not closed More details here
   # https://app.clickup.com/t/868cdpe3q
   duckdb::dbDisconnect(con)
+  invisible(gc())
 
   setDT(master_file)
 }

From 6ae50037c1da428037234f6a9e56aa9b5e0b2823 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Mon, 15 Sep 2025 18:10:11 -0400
Subject: [PATCH 127/203] set pov line to NULL if popshare is defined in fg

---
 R/fg_pip.R     | 1 +
 R/fg_pip_old.R | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/R/fg_pip.R b/R/fg_pip.R
index d4323ef1..b0761839 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -19,6 +19,7 @@ fg_pip <- function(country,
   interpolation_list  <- lkup$interpolation_list
   data_dir            <- lkup$data_root
   refy_lkup           <- lkup$refy_lkup # cleaned refy table, unique by country-years but some columns removed in order to do that
+  if (!is.null(popshare)) povline <- NULL
 
   cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb")
   # fg_pip is called from multiple places like pip, pip_grp_logic. We have connection object created
diff --git a/R/fg_pip_old.R b/R/fg_pip_old.R
index 2da40213..444df522 100644
--- a/R/fg_pip_old.R
+++ b/R/fg_pip_old.R
@@ -19,6 +19,8 @@ fg_pip_old <- function(country,
   data_dir            <- lkup$data_root
   ref_lkup            <- lkup$ref_lkup
 
+  if (!is.null(popshare)) povline <- NULL
+
   cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb")
   # fg_pip is called from multiple places like pip, pip_grp_logic. We have connection object created
   # when calling from `pip`. For other functions we create it here.

From 0a24c57fe3f2c76f2e87d6f7519669a52738d7d7 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Mon, 15 Sep 2025 18:13:53 -0400
Subject: [PATCH 128/203] check pip_grp_logic with changes made to old_DEV
 branch last commit

---
 R/pip_grp_logic.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/pip_grp_logic.R b/R/pip_grp_logic.R
index e6f54e9b..cc23d253 100644
--- a/R/pip_grp_logic.R
+++ b/R/pip_grp_logic.R
@@ -73,7 +73,7 @@ pip_grp_logic <- function(country         = "ALL",
     ## This will then be re-used in various part of the function
     ## This is to avoid re-computing and re-loading the same data over and over
     fg_pip_master <- fg_pip_old(
-      country         = c(lcv$md_off_reg, lcv$user_off_reg),
+      country         = c(lcv$md_off_reg, lcv$user_alt_agg, lcv$user_off_reg),
       year            = year,
       povline         = povline,
       popshare        = NULL,

From 3dc0dc38026c363e98ad687c0a3d843093662899 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Mon, 15 Sep 2025 18:17:12 -0400
Subject: [PATCH 129/203] limit set of poverty lines to store

---
 R/pip_new_lineups.R | 10 +++++++++-
 R/pip_old.R         | 10 +++++++++-
 R/pip_old_lineups.R | 12 +++++++++++-
 3 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/R/pip_new_lineups.R b/R/pip_new_lineups.R
index 5153f037..84a2c141 100644
--- a/R/pip_new_lineups.R
+++ b/R/pip_new_lineups.R
@@ -315,7 +315,15 @@ treat_cache_and_main <- \(out, cache_file_path,
         rowbind(cached_data)
     }
 
-    update_master_file(main_data, cache_file_path, fill_gaps)
+    pl <- c(seq(from = 0.01, to = 5, by = 0.01),
+            seq(from = 5.1, to = 20, by = 0.1),
+            seq(from = 21, to = 100, by = 1),
+            seq(from = 105, to = 900, by = 5))
+    # Only update master file if poverty line is part of this pl list
+    # Using round to avoid precision error with decimals
+    if (all(round(povline, 2) %in% round(pl, 2))) {
+      update_master_file(main_data, cache_file_path, fill_gaps)
+    }
 
   } else {
     out <- cached_data
diff --git a/R/pip_old.R b/R/pip_old.R
index ae62bb22..5efc61ae 100644
--- a/R/pip_old.R
+++ b/R/pip_old.R
@@ -147,7 +147,15 @@ pip_old <- function(country         = "ALL",
     # This will be used only for development purpose and we don't have any intention to use it in production.
     if(!is.null(cached_data)) {
       # Update cache with data
-      update_master_file(main_data, cache_file_path, fill_gaps)
+      pl <- c(seq(from = 0.01, to = 5, by = 0.01),
+              seq(from = 5.1, to = 20, by = 0.1),
+              seq(from = 21, to = 100, by = 1),
+              seq(from = 105, to = 900, by = 5))
+      # Only update master file if poverty line is part of this pl list
+      # Using round to avoid precision error with decimals
+      if (all(round(povline, 2) %in% round(pl, 2))) {
+        update_master_file(main_data, cache_file_path, fill_gaps)
+      }
     }
   } else {
     out <- cached_data
diff --git a/R/pip_old_lineups.R b/R/pip_old_lineups.R
index fc01620b..be67050c 100644
--- a/R/pip_old_lineups.R
+++ b/R/pip_old_lineups.R
@@ -145,7 +145,17 @@ pip_old_lineups <- function(country         = "ALL",
     out <- main_data |>
       rowbind(cached_data)
 
-    update_master_file(main_data, cache_file_path, fill_gaps)
+
+    pl <- c(seq(from = 0.01, to = 5, by = 0.01),
+            seq(from = 5.1, to = 20, by = 0.1),
+            seq(from = 21, to = 100, by = 1),
+            seq(from = 105, to = 900, by = 5))
+    # Only update master file if poverty line is part of this pl list
+    # Using round to avoid precision error with decimals
+    if (all(round(povline, 2) %in% round(pl, 2))) {
+      update_master_file(main_data, cache_file_path, fill_gaps)
+    }
+
 
   } else {
     out <- cached_data

From a099a386b7649a64a5ad96b28790b636097d70a3 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Mon, 15 Sep 2025 18:20:34 -0400
Subject: [PATCH 130/203] store vars in env defined in zzz

---
 R/pip_new_lineups.R | 5 +----
 R/pip_old.R         | 5 +----
 R/pip_old_lineups.R | 5 +----
 R/zzz.R             | 8 ++++++++
 4 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/R/pip_new_lineups.R b/R/pip_new_lineups.R
index 84a2c141..7838e3a1 100644
--- a/R/pip_new_lineups.R
+++ b/R/pip_new_lineups.R
@@ -315,10 +315,7 @@ treat_cache_and_main <- \(out, cache_file_path,
         rowbind(cached_data)
     }
 
-    pl <- c(seq(from = 0.01, to = 5, by = 0.01),
-            seq(from = 5.1, to = 20, by = 0.1),
-            seq(from = 21, to = 100, by = 1),
-            seq(from = 105, to = 900, by = 5))
+    pl <- get_from_pipapienv("pl_to_store")
     # Only update master file if poverty line is part of this pl list
     # Using round to avoid precision error with decimals
     if (all(round(povline, 2) %in% round(pl, 2))) {
diff --git a/R/pip_old.R b/R/pip_old.R
index 5efc61ae..34fc5371 100644
--- a/R/pip_old.R
+++ b/R/pip_old.R
@@ -147,10 +147,7 @@ pip_old <- function(country         = "ALL",
     # This will be used only for development purpose and we don't have any intention to use it in production.
     if(!is.null(cached_data)) {
       # Update cache with data
-      pl <- c(seq(from = 0.01, to = 5, by = 0.01),
-              seq(from = 5.1, to = 20, by = 0.1),
-              seq(from = 21, to = 100, by = 1),
-              seq(from = 105, to = 900, by = 5))
+      pl <- get_from_pipapienv("pl_to_store")
       # Only update master file if poverty line is part of this pl list
       # Using round to avoid precision error with decimals
       if (all(round(povline, 2) %in% round(pl, 2))) {
diff --git a/R/pip_old_lineups.R b/R/pip_old_lineups.R
index be67050c..f7000674 100644
--- a/R/pip_old_lineups.R
+++ b/R/pip_old_lineups.R
@@ -146,10 +146,7 @@ pip_old_lineups <- function(country         = "ALL",
       rowbind(cached_data)
 
 
-    pl <- c(seq(from = 0.01, to = 5, by = 0.01),
-            seq(from = 5.1, to = 20, by = 0.1),
-            seq(from = 21, to = 100, by = 1),
-            seq(from = 105, to = 900, by = 5))
+    pl <- get_from_pipapienv("pl_to_store")
     # Only update master file if poverty line is part of this pl list
     # Using round to avoid precision error with decimals
     if (all(round(povline, 2) %in% round(pl, 2))) {
diff --git a/R/zzz.R b/R/zzz.R
index d3c42be7..302841c5 100644
--- a/R/zzz.R
+++ b/R/zzz.R
@@ -43,6 +43,14 @@ pipapi_default_options <- list(
     min(8)
   set_in_pipapienv("cores_to_use", cores_to_use)
 
+
+  # pov lines to store
+  pl <- c(seq(from = 0.01, to = 5, by = 0.01),
+          seq(from = 5.1, to = 20, by = 0.1),
+          seq(from = 21, to = 100, by = 1),
+          seq(from = 105, to = 900, by = 5))
+  set_in_pipapienv("pl_to_store", pl)
+
   invisible()
 
 }

From 64eedb585e109ce450be92b449489d1edb7967bb Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Mon, 15 Sep 2025 18:23:18 -0400
Subject: [PATCH 131/203] update data documentation

---
 R/pipapi-package.R           | 11 ++++++
 data-raw/data.R              | 72 +++++++++---------------------------
 man/empty_response_fg_add.Rd | 16 ++++++++
 man/pip.Rd                   |  2 +-
 man/pip_agg.Rd               |  2 +-
 man/subset_lkup.Rd           |  6 ++-
 man/ui_pc_charts.Rd          |  3 ++
 7 files changed, 54 insertions(+), 58 deletions(-)
 create mode 100644 man/empty_response_fg_add.Rd

diff --git a/R/pipapi-package.R b/R/pipapi-package.R
index 7a52927d..c45284df 100644
--- a/R/pipapi-package.R
+++ b/R/pipapi-package.R
@@ -59,6 +59,17 @@ NULL
 #' @format Data frame with 0 rows and 53 columns
 NULL
 
+#' Dataframe for fill gaps empty response with additional variables
+#'
+#' @docType data
+#' @keywords datasets
+#' @name empty_response_fg_add
+#' @usage data(empty_response_fg_add)
+#' @format Data frame with 0 rows and 56 columns
+NULL
+
+
+
 utils::globalVariables(
   c(
     ".",
diff --git a/data-raw/data.R b/data-raw/data.R
index b6d075be..a0a6a16f 100644
--- a/data-raw/data.R
+++ b/data-raw/data.R
@@ -35,61 +35,22 @@ empty_response_grp <- pip_grp("all", year, lkup = lkup, group_by = "wb")
 empty_response_grp <- empty_response_grp[-c(1:nrow(empty_response_grp))]
 
 
-empty_response_fg <- data.table::data.table(
-  country_code = character(0),
-  survey_id = character(0),
-  cache_id = character(0),
-  wb_region_code = character(0),
-  reporting_year = numeric(0),
-  surveyid_year = character(0),
-  survey_year = numeric(0),
-  survey_time = character(0),
-  survey_acronym = character(0),
-  survey_coverage = character(0),
-  survey_comparability = numeric(0),
-  comparable_spell = character(0),
-  welfare_type = character(0),
-  reporting_level = character(0),
-  survey_mean_lcu = numeric(0),
-  survey_mean_ppp = numeric(0),
-  survey_median_ppp = numeric(0),
-  survey_median_lcu = numeric(0),
-  predicted_mean_ppp = numeric(0),
-  ppp = numeric(0),
-  cpi = numeric(0),
-  reporting_pop = numeric(0),
-  reporting_gdp = numeric(0),
-  reporting_pce = numeric(0),
-  pop_data_level = character(0),
-  gdp_data_level = character(0),
-  pce_data_level = character(0),
-  cpi_data_level = character(0),
-  ppp_data_level = character(0),
-  distribution_type = character(0),
-  gd_type = character(0),
-  is_interpolated = logical(0),
-  is_used_for_line_up = logical(0),
-  is_used_for_aggregation = logical(0),
-  estimation_type = character(0),
-  interpolation_id = character(0),
-  display_cp = numeric(0),
-  country_name = character(0),
-  africa_split = character(0),
-  africa_split_code = character(0),
-  region_name = character(0),
-  region_code = character(0),
-  world = character(0),
-  world_code = character(0),
-  path = character(0),
-  data_interpolation_id = character(0),
-  poverty_line = numeric(0),
-  mean = numeric(0),
-  median = numeric(0),
-  headcount = numeric(0),
-  poverty_gap = numeric(0),
-  poverty_severity = numeric(0),
-  watts = numeric(0)
-)
+fg <- fg_pip(
+  ctr,
+  year = year,
+  povline = 3,
+  welfare_type = "all",
+  reporting_level = "all",
+  popshare = NULL,
+  lkup = lkup
+) |>
+  rbindlist()
+empty_response_fg <- fg[-1]
+
+
+add_vars_out_of_pipeline(fg, fill_gaps = TRUE, lkup = lkup)
+
+empty_response_fg_add <- fg[-1]
 
 usethis::use_data(
   empty_response,
@@ -97,5 +58,6 @@ usethis::use_data(
   reporting_level_list,
   empty_response_grp,
   empty_response_fg,
+  empty_response_fg_add,
   overwrite = TRUE
 )
diff --git a/man/empty_response_fg_add.Rd b/man/empty_response_fg_add.Rd
new file mode 100644
index 00000000..7bb34bb1
--- /dev/null
+++ b/man/empty_response_fg_add.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/pipapi-package.R
+\docType{data}
+\name{empty_response_fg_add}
+\alias{empty_response_fg_add}
+\title{Dataframe for fill gaps empty response with additional variables}
+\format{
+Data frame with 0 rows and 56 columns
+}
+\usage{
+data(empty_response_fg_add)
+}
+\description{
+Dataframe for fill gaps empty response with additional variables
+}
+\keyword{datasets}
diff --git a/man/pip.Rd b/man/pip.Rd
index 0161eeef..f128449b 100644
--- a/man/pip.Rd
+++ b/man/pip.Rd
@@ -15,7 +15,7 @@ pip(
   reporting_level = c("all", "national", "rural", "urban"),
   ppp = NULL,
   lkup,
-  censor = TRUE,
+  censor = FALSE,
   lkup_hash = lkup$cache_data_id$hash_pip,
   additional_ind = FALSE
 )
diff --git a/man/pip_agg.Rd b/man/pip_agg.Rd
index 3eb27292..f47dbba8 100644
--- a/man/pip_agg.Rd
+++ b/man/pip_agg.Rd
@@ -14,7 +14,7 @@ pip_agg(
   welfare_type = c("all", "consumption", "income"),
   reporting_level = c("all", "national"),
   lkup,
-  censor = TRUE,
+  censor = FALSE,
   lkup_hash = lkup$cache_data_id$hash_pip_grp,
   additional_ind = FALSE
 )
diff --git a/man/subset_lkup.Rd b/man/subset_lkup.Rd
index d09baf8a..ee5dd4cf 100644
--- a/man/subset_lkup.Rd
+++ b/man/subset_lkup.Rd
@@ -14,7 +14,8 @@ subset_lkup(
   data_dir = NULL,
   povline,
   cache_file_path,
-  fill_gaps
+  fill_gaps,
+  popshare = NULL
 )
 }
 \arguments{
@@ -39,6 +40,9 @@ for region selection}
 
 \item{fill_gaps}{logical: If set to TRUE, will interpolate / extrapolate
 values for missing years}
+
+\item{popshare}{numeric: Proportion of the population living below the
+poverty line}
 }
 \value{
 data.frame
diff --git a/man/ui_pc_charts.Rd b/man/ui_pc_charts.Rd
index 49715b48..9801ecfe 100644
--- a/man/ui_pc_charts.Rd
+++ b/man/ui_pc_charts.Rd
@@ -13,6 +13,7 @@ ui_pc_charts(
   welfare_type = c("all", "consumption", "income"),
   reporting_level = c("all", "national", "rural", "urban"),
   pop_units = 1e+06,
+  censor = TRUE,
   lkup
 )
 }
@@ -36,6 +37,8 @@ sub-groups}
 \item{pop_units}{numeric: Units used to express population numbers (default
 to million)}
 
+\item{censor}{logical: Triggers censoring of country/year statistics}
+
 \item{lkup}{list: A list of lkup tables}
 }
 \value{

From d057467c15d130e491607ef0d3fa45671f58da94 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Tue, 16 Sep 2025 10:20:33 -0400
Subject: [PATCH 132/203] first attempt of infer_poverty_line

---
 R/infer_poverty_line.R | 68 ++++++++++++++++++++++++++++++++++++++++++
 R/pip_old.R            |  3 ++
 R/rg_pip.R             |  8 +++++
 3 files changed, 79 insertions(+)
 create mode 100644 R/infer_poverty_line.R

diff --git a/R/infer_poverty_line.R b/R/infer_poverty_line.R
new file mode 100644
index 00000000..39dd517e
--- /dev/null
+++ b/R/infer_poverty_line.R
@@ -0,0 +1,68 @@
+infer_poverty_line <- function(welfare, weight, popshare = 0.5,
+                               include = FALSE,
+                               method = c("nearest","interp"),
+                               assume_sorted = FALSE) {
+  method <- match.arg(method)
+
+  # basic checks
+  if (length(welfare) != length(weight)) stop("welfare and weight must have the same length")
+  if (anyNA(welfare) || anyNA(weight))   stop("welfare and weight cannot contain NA")
+  if (any(weight < 0))                   stop("weights must be non-negative")
+  if (!length(popshare)) return(numeric(0))
+
+  # clamp probs
+  p <- pmin(pmax(as.numeric(popshare), 0), 1)
+
+  # fast sort (or not)
+  if (!assume_sorted) {
+    o <- if (requireNamespace("data.table", quietly = TRUE)) {
+      data.table::forder(welfare, na.last = FALSE)
+    } else {
+      order(welfare, na.last = NA, method = "radix")
+    }
+    y <- welfare[o]; w <- weight[o]
+  } else {
+    y <- welfare;   w <- weight
+  }
+
+  if (method == "interp") {
+    # collapse::fquantile: weighted linear interpolation
+    # - 'sorted' tells fquantile that 'y' is already sorted
+    # - 'w' supplies weights
+    # 'include' is not used here: interpolation doesn't have that discrete toggle
+    return(collapse::fquantile(y, probs = p, w = w, sorted = TRUE))
+  }
+
+  # ---- method == "nearest" (matches your function) ----
+  # cumulative weight fractions
+  W  <- collapse::fsum(w)
+  if (W <= 0) stop("sum(weight) must be > 0")
+  cw   <- collapse::fcumsum(w)
+  prob <- cw / W
+  n    <- length(y)
+
+  # for each p, find the nearest cumulative location (ties -> lower index)
+  j <- findInterval(p, prob, left.open = FALSE)        # j ∈ {0..n}
+  j[j < 0L] <- 0L; j[j > n] <- n
+
+  prev_idx <- pmax.int(j, 1L)                          # 1..n
+  next_idx <- pmin.int(j + 1L, n)                      # 1..n
+  d_prev   <- p - prob[prev_idx]
+  d_next   <- prob[next_idx] - p
+  use_next <- (d_next < d_prev) & (j < n)
+  idx      <- ifelse(use_next, next_idx, prev_idx)     # final index
+
+  if (!include) {
+    # take the discrete value at the nearest location
+    return(y[idx])
+  } else {
+    # average the two neighbors using their weights (your original rule)
+    idx2 <- pmin.int(idx + 1L, n)
+    wi   <- w[idx]
+    wi2  <- w[idx2]
+    s    <- wi + wi2
+    num  <- wi * y[idx] + wi2 * y[idx2]
+    out  <- ifelse((idx == n) | (s <= 0), y[idx], num / s)
+    return(out)
+  }
+}
diff --git a/R/pip_old.R b/R/pip_old.R
index 34fc5371..3378876d 100644
--- a/R/pip_old.R
+++ b/R/pip_old.R
@@ -307,6 +307,9 @@ rg_pip_old <- function(country,
   valid_regions <- lkup$query_controls$region$values
   svy_lkup      <- lkup$svy_lkup
   data_dir      <- lkup$data_root
+  # povline is set to NULL if popshare is given
+  if (!is.null(popshare)) povline <- NULL
+
 
   cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb")
 
diff --git a/R/rg_pip.R b/R/rg_pip.R
index 279f4d49..4cdf0181 100644
--- a/R/rg_pip.R
+++ b/R/rg_pip.R
@@ -17,6 +17,8 @@ rg_pip <- function(country,
   valid_regions <- lkup$query_controls$region$values
   svy_lkup      <- lkup$svy_lkup
   data_dir      <- lkup$data_root
+  # povline is set to NULL if popshare is given
+  if (!is.null(popshare)) povline <- NULL
 
   cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb")
 
@@ -52,6 +54,12 @@ rg_pip <- function(country,
 
   # load data
   lt <- load_data_list(metadata)
+  # Calculate and update poverty line if popshare is passed
+  if (!is.null(popshare)) {
+    povline <- lapply(lt, \(x) {
+      wbpip:::md_infer_poverty_line(x$welfare, x$weight, popshare)
+      })
+  }
 
   # parallelization
   # res <- get_pov_estimates(lt, povline = povline)

From 819caaae295e9e286cdf688518d8625fffa1b225 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Tue, 16 Sep 2025 10:27:39 -0400
Subject: [PATCH 133/203] document

---
 R/infer_poverty_line.R    | 28 +++++++++++++++++++++++++
 man/infer_poverty_line.Rd | 44 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 72 insertions(+)
 create mode 100644 man/infer_poverty_line.Rd

diff --git a/R/infer_poverty_line.R b/R/infer_poverty_line.R
index 39dd517e..4ee9cb53 100644
--- a/R/infer_poverty_line.R
+++ b/R/infer_poverty_line.R
@@ -1,3 +1,31 @@
+#' Infer the poverty line for a given population share
+#'
+#' Computes the welfare value (poverty line) corresponding to a given population
+#' share, using either nearest or interpolated weighted quantile methods.
+#' Supports both discrete (nearest) and linear interpolation approaches, and can
+#' optionally average neighbors for ties.
+#'
+#' @param welfare Numeric vector of welfare values (e.g., income or
+#'   consumption).
+#' @param weight Numeric vector of sampling weights (must be non-negative, same
+#'   length as welfare).
+#' @param popshare Numeric vector of population shares (probabilities in [0,1]);
+#'   default is 0.5 (median).
+#' @param include Logical; if TRUE, averages neighbors for ties (only for method
+#'   = "nearest").
+#' @param method Character; either "nearest" (default, discrete quantile) or
+#'   "interp" (weighted linear interpolation).
+#' @param assume_sorted Logical; if TRUE, assumes welfare and weight are already
+#'   sorted by welfare.
+#'
+#' @return Numeric vector of poverty line(s) corresponding to the requested
+#'   population share(s).
+#' @details
+#' - If method = "nearest", returns the welfare value at the closest cumulative weight fraction to each popshare.
+#' - If method = "interp", uses collapse::fquantile for weighted linear interpolation.
+#' - If include = TRUE (and method = "nearest"), averages the two closest neighbors using their weights.
+#' - Returns numeric(0) if popshare is empty.
+#' @keywords internal
 infer_poverty_line <- function(welfare, weight, popshare = 0.5,
                                include = FALSE,
                                method = c("nearest","interp"),
diff --git a/man/infer_poverty_line.Rd b/man/infer_poverty_line.Rd
new file mode 100644
index 00000000..3344c543
--- /dev/null
+++ b/man/infer_poverty_line.Rd
@@ -0,0 +1,44 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/infer_poverty_line.R
+\name{infer_poverty_line}
+\alias{infer_poverty_line}
+\title{Infer the poverty line for a given population share}
+\usage{
+infer_poverty_line(
+  welfare,
+  weight,
+  popshare = 0.5,
+  include = FALSE,
+  method = c("nearest", "interp"),
+  assume_sorted = FALSE
+)
+}
+\arguments{
+\item{welfare}{Numeric vector of welfare values (e.g., income or consumption).}
+
+\item{weight}{Numeric vector of sampling weights (must be non-negative, same length as welfare).}
+
+\item{popshare}{Numeric vector of population shares (probabilities in \link{0,1}); default is 0.5 (median).}
+
+\item{include}{Logical; if TRUE, averages neighbors for ties (only for method = "nearest").}
+
+\item{method}{Character; either "nearest" (default, discrete quantile) or "interp" (weighted linear interpolation).}
+
+\item{assume_sorted}{Logical; if TRUE, assumes welfare and weight are already sorted by welfare.}
+}
+\value{
+Numeric vector of poverty line(s) corresponding to the requested population share(s).
+}
+\description{
+Computes the welfare value (poverty line) corresponding to a given population share, using either nearest or interpolated weighted quantile methods.
+Supports both discrete (nearest) and linear interpolation approaches, and can optionally average neighbors for ties.
+}
+\details{
+\itemize{
+\item If method = "nearest", returns the welfare value at the closest cumulative weight fraction to each popshare.
+\item If method = "interp", uses collapse::fquantile for weighted linear interpolation.
+\item If include = TRUE (and method = "nearest"), averages the two closest neighbors using their weights.
+\item Returns numeric(0) if popshare is empty.
+}
+}
+\keyword{internal}

From bd23ba13a56fc75d1343502b92d7656ae6788d36 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Tue, 16 Sep 2025 11:19:26 -0400
Subject: [PATCH 134/203] multiple popshare for svy years.

---
 R/infer_poverty_line.R | 44 ++++++++++++++++++++++--------------------
 R/rg_pip.R             | 24 ++++++++++++++++-------
 R/utils.R              | 12 ++++++------
 3 files changed, 46 insertions(+), 34 deletions(-)

diff --git a/R/infer_poverty_line.R b/R/infer_poverty_line.R
index 4ee9cb53..9b3a53fe 100644
--- a/R/infer_poverty_line.R
+++ b/R/infer_poverty_line.R
@@ -29,13 +29,13 @@
 infer_poverty_line <- function(welfare, weight, popshare = 0.5,
                                include = FALSE,
                                method = c("nearest","interp"),
-                               assume_sorted = FALSE) {
+                               assume_sorted = TRUE) {
   method <- match.arg(method)
 
-  # basic checks
-  if (length(welfare) != length(weight)) stop("welfare and weight must have the same length")
-  if (anyNA(welfare) || anyNA(weight))   stop("welfare and weight cannot contain NA")
-  if (any(weight < 0))                   stop("weights must be non-negative")
+  # defenses
+  if (length(welfare) != length(weight)) cli::cli_abort("welfare and weight must have the same length")
+  if (anyNA(welfare) || anyNA(weight))   cli::cli_abort("welfare and weight cannot contain NA")
+  if (any(weight < 0))                   cli::cli_abort("weights must be non-negative")
   if (!length(popshare)) return(numeric(0))
 
   # clamp probs
@@ -43,54 +43,56 @@ infer_poverty_line <- function(welfare, weight, popshare = 0.5,
 
   # fast sort (or not)
   if (!assume_sorted) {
-    o <- if (requireNamespace("data.table", quietly = TRUE)) {
-      data.table::forder(welfare, na.last = FALSE)
-    } else {
-      order(welfare, na.last = NA, method = "radix")
-    }
-    y <- welfare[o]; w <- weight[o]
+    o <- data.table:::forder(welfare)
+    y <- welfare[o]
+    w <- weight[o]
   } else {
-    y <- welfare;   w <- weight
+    y <- welfare
+    w <- weight
+    o <- seq_along(welfare)
   }
 
   if (method == "interp") {
     # collapse::fquantile: weighted linear interpolation
-    # - 'sorted' tells fquantile that 'y' is already sorted
-    # - 'w' supplies weights
     # 'include' is not used here: interpolation doesn't have that discrete toggle
-    return(collapse::fquantile(y, probs = p, w = w, sorted = TRUE))
+    return(fquantile(y,
+                     probs = p,
+                     w = w,
+                     o = o,
+                     names = FALSE))
   }
 
   # ---- method == "nearest" (matches your function) ----
   # cumulative weight fractions
-  W  <- collapse::fsum(w)
+  W  <- fsum(w)
   if (W <= 0) stop("sum(weight) must be > 0")
-  cw   <- collapse::fcumsum(w)
+  cw   <- fcumsum(w)
   prob <- cw / W
   n    <- length(y)
 
   # for each p, find the nearest cumulative location (ties -> lower index)
   j <- findInterval(p, prob, left.open = FALSE)        # j ∈ {0..n}
-  j[j < 0L] <- 0L; j[j > n] <- n
+  j[j < 0L] <- 0L # fit into boundries of vector
+  j[j > n]  <- n
 
   prev_idx <- pmax.int(j, 1L)                          # 1..n
   next_idx <- pmin.int(j + 1L, n)                      # 1..n
   d_prev   <- p - prob[prev_idx]
   d_next   <- prob[next_idx] - p
   use_next <- (d_next < d_prev) & (j < n)
-  idx      <- ifelse(use_next, next_idx, prev_idx)     # final index
+  idx      <- fifelse(use_next, next_idx, prev_idx)     # final index
 
   if (!include) {
     # take the discrete value at the nearest location
     return(y[idx])
   } else {
-    # average the two neighbors using their weights (your original rule)
+    # average the two neighbors using their weights (the original rule in wbpip)
     idx2 <- pmin.int(idx + 1L, n)
     wi   <- w[idx]
     wi2  <- w[idx2]
     s    <- wi + wi2
     num  <- wi * y[idx] + wi2 * y[idx2]
-    out  <- ifelse((idx == n) | (s <= 0), y[idx], num / s)
+    out  <- fifelse((idx == n) | (s <= 0), y[idx], num / s)
     return(out)
   }
 }
diff --git a/R/rg_pip.R b/R/rg_pip.R
index 4cdf0181..905677b0 100644
--- a/R/rg_pip.R
+++ b/R/rg_pip.R
@@ -19,6 +19,7 @@ rg_pip <- function(country,
   data_dir      <- lkup$data_root
   # povline is set to NULL if popshare is given
   if (!is.null(popshare)) povline <- NULL
+  if (is.list(povline))   povline <- unlist(povline)
 
   cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb")
 
@@ -57,19 +58,28 @@ rg_pip <- function(country,
   # Calculate and update poverty line if popshare is passed
   if (!is.null(popshare)) {
     povline <- lapply(lt, \(x) {
-      wbpip:::md_infer_poverty_line(x$welfare, x$weight, popshare)
+      # wbpip:::md_infer_poverty_line(x$welfare, x$weight, popshare)
+      infer_poverty_line(welfare = x$welfare,
+                         weight = x$weight,
+                         popshare = popshare,
+                         include = FALSE,
+                         method = "nearest",
+                         assume_sorted = TRUE)
       })
   }
 
-  # parallelization
-  # res <- get_pov_estimates(lt, povline = povline)
-
-  # Regular lapply
-  res <- lapply(lt, process_dt, povline = povline)
+  # if popshare is not null, povline will be list
+  if (is.list(povline)) {
+    # If povline is list, it comes from infer_poverty_line when popshare
+    # is not null. Then, we have one set of lines per survey.
+    res <- Map(process_dt, lt, povline)
+  } else {
+    # if povline is vector, it should be applied to all surveys in lt
+    res <- lapply(lt, process_dt, povline = povline)
+  }
 
   res <- rbindlist(res, fill = TRUE)
 
-
   # clean data
   metadata[, file := basename(path)]
 
diff --git a/R/utils.R b/R/utils.R
index 88011ff4..9c09d1a2 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -20,12 +20,6 @@ subset_lkup <- function(country,
                         popshare = NULL
                         ) {
 
-  if (!is.null(popshare)) {
-    return(list(data_present_in_master = NULL,
-                lkup                   = lkup,
-                povline                = povline))
-  }
-
   # STEP 1 - Keep every row by default
   keep <- rep(TRUE, nrow(lkup))
   # STEP 2 - Select countries
@@ -53,6 +47,12 @@ subset_lkup <- function(country,
 
   lkup <- lkup[keep, ]
 
+  if (!is.null(popshare)) {
+    return(list(data_present_in_master = NULL,
+                lkup                   = lkup,
+                povline                = NULL))
+  }
+
   # Return with grace
   return_if_exists(slkup           = lkup,
                    povline         = povline,

From 90deb51beec635181c41cc51bce1bc9abeb2b36b Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Tue, 16 Sep 2025 11:29:14 -0400
Subject: [PATCH 135/203] add  prosperity gapto ui_cp_charts

---
 R/ui_country_profile.R | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/R/ui_country_profile.R b/R/ui_country_profile.R
index 20fe7dac..75fa6102 100644
--- a/R/ui_country_profile.R
+++ b/R/ui_country_profile.R
@@ -118,8 +118,16 @@ ui_cp_charts <- function(country   = "AGO",
     x[country_code == country]
   })
 
-  dl <- list(append(dl, dl2))
-  names(dl) <- country
+  # Add prosperity gap
+  pg <- get_aux_table(data_dir = lkup$data_root,
+                      "pg_svy")
+
+  pg <- pg[country_code == country]
+  dl2[["pg"]] <- pg
+
+
+  dl <- list(append(dl, dl2)) |>
+    setNames(country)
 
   return(dl)
 }

From 4bf15af783a3f4d70a6b5320c6a904522c8476b3 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Tue, 16 Sep 2025 11:40:39 -0400
Subject: [PATCH 136/203] allow to select regions from any variable.     this
 must be changed once we allow many other aggregations

---
 R/utils.R | 80 +++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 57 insertions(+), 23 deletions(-)

diff --git a/R/utils.R b/R/utils.R
index 9c09d1a2..7c42de11 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -20,46 +20,68 @@ subset_lkup <- function(country,
                         popshare = NULL
                         ) {
 
+
+  lkup <- lkup_filter(lkup,
+                      country,
+                      year,
+                      valid_regions,
+                      reporting_level,
+                      welfare_type,
+                      data_dir)
+  # If povline is NULL, this happens when popshare is passed
+  # i.e popshare is not NULL
+  if (is.null(povline)) {
+    return(list(
+      data_present_in_master = NULL,
+      lkup = lkup,
+      povline = NULL
+    ))
+  }
+  # Return with grace
+  return_if_exists(
+    slkup = lkup,
+    povline = povline,
+    cache_file_path = cache_file_path,
+    fill_gaps = fill_gaps
+  )
+}
+
+
+#' @keywords internal
+lkup_filter <- function(lkup,
+                        country,
+                        year,
+                        valid_regions,
+                        reporting_level,
+                        welfare_type,
+                        data_dir) {
   # STEP 1 - Keep every row by default
   keep <- rep(TRUE, nrow(lkup))
   # STEP 2 - Select countries
   keep <- select_country(lkup, keep, country, valid_regions)
   # STEP 3 - Select years
-  keep <- select_years(lkup          = lkup,
-                       keep          = keep,
-                       year          = year,
-                       country       = country,
-                       data_dir      =  data_dir,
+  keep <- select_years(lkup = lkup,
+                       keep = keep,
+                       year = year,
+                       country = country,
+                       data_dir =  data_dir,
                        valid_regions = valid_regions)
 
-  # # step 4. Select MRV
-  # keep <- select_MRV(lkup, keep, year, country, valid_regions, data_dir)
-
   # STEP 4 - Select welfare_type
   if (welfare_type[1] != "all") {
     keep <- keep & lkup$welfare_type == welfare_type
   }
   # STEP 5 - Select reporting_level
-  keep <- select_reporting_level(lkup            = lkup,
-                                 keep            = keep,
+  keep <- select_reporting_level(lkup = lkup,
+                                 keep = keep,
                                  reporting_level = reporting_level[1])
 
 
   lkup <- lkup[keep, ]
-
-  if (!is.null(popshare)) {
-    return(list(data_present_in_master = NULL,
-                lkup                   = lkup,
-                povline                = NULL))
-  }
-
-  # Return with grace
-  return_if_exists(slkup           = lkup,
-                   povline         = povline,
-                   cache_file_path = cache_file_path,
-                   fill_gaps       = fill_gaps)
+  return(lkup)
 }
 
+
 #' select_country
 #' Helper function for subset_lkup()
 #' @inheritParams subset_lkup
@@ -72,7 +94,19 @@ select_country <- function(lkup, keep, country, valid_regions) {
     # Select regions
     if (any(country %in% valid_regions)) {
       selected_regions <- country[country %in% valid_regions]
-      keep_regions <- lkup$region_code %in% selected_regions
+      # Find all columns ending with _code
+      code_cols <- grep("_code$", names(lkup), value = TRUE)
+      code_cols <- code_cols[!code_cols %in% "wb_region_code"] # Temporary solution
+      # For each code column, check if any value matches selected_regions
+      keep_regions_list <- lapply(code_cols, \(col) {
+        lkup[[col]] %in% selected_regions
+      })
+      # Combine with logical OR across all code columns
+      if (length(keep_regions_list) > 0) {
+        keep_regions <- Reduce(`|`, keep_regions_list)
+      } else {
+        keep_regions <- rep(FALSE, nrow(lkup))
+      }
     } else {
       keep_regions <- rep(FALSE, length(lkup$country_code))
     }

From 916780cc944a45980e535b637dc6b9f8e7add4b7 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Tue, 16 Sep 2025 11:43:44 -0400
Subject: [PATCH 137/203] fix bug

---
 R/pip_grp_new.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/pip_grp_new.R b/R/pip_grp_new.R
index ecd7ce22..efa25bb8 100644
--- a/R/pip_grp_new.R
+++ b/R/pip_grp_new.R
@@ -23,7 +23,7 @@ pip_grp_new <- \(country         = "ALL",
 
   # Select countries to estimate poverty
   cts <- copy(lkup$aux_files$country_list)
-  country_code  <-  if (country != "ALL") {
+  country_code  <-  if (!"ALL" %in% country) {
     get_country_code_subset(dt = cts, country = country)
   } else {
     "ALL"

From bd65c2ee84ae221a5310235cd9dc04fe70051e30 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Tue, 16 Sep 2025 12:01:38 -0400
Subject: [PATCH 138/203] fix some tests

---
 tests/testthat/test-fg_pip-local.R | 27 +++++++++++----------------
 tests/testthat/test-pip-local.R    | 10 ++++++++++
 tests/testthat/test-utils.R        | 15 ++++++++++-----
 3 files changed, 31 insertions(+), 21 deletions(-)

diff --git a/tests/testthat/test-fg_pip-local.R b/tests/testthat/test-fg_pip-local.R
index 13a56c84..b73460ca 100644
--- a/tests/testthat/test-fg_pip-local.R
+++ b/tests/testthat/test-fg_pip-local.R
@@ -30,8 +30,7 @@ test_that("Imputation is working for extrapolated aggregated distribution", {
     welfare_type    = "all",
     reporting_level = "all",
     ppp             = NULL,
-    lkup            = lkup,
-    con = con
+    lkup            = lkup
   )
 
   expect_equal(nrow(tmp$main_data), 0)
@@ -44,8 +43,7 @@ test_that("Imputation is working for extrapolated aggregated distribution", {
     welfare_type    = "all",
     reporting_level = "national",
     ppp             = NULL,
-    lkup            = lkup,
-    con             = con
+    lkup            = lkup
   )
 
   expect_equal(nrow(tmp$main_data), 0)
@@ -61,8 +59,7 @@ test_that("Imputation is working for interpolated mixed distribution", {
     welfare_type    = "all",
     reporting_level = "all",
     ppp             = NULL,
-    lkup            = lkup,
-    con             = con
+    lkup            = lkup
   )
 
   expect_equal(nrow(tmp$main_data), 0)
@@ -75,8 +72,7 @@ test_that("Imputation is working for interpolated mixed distribution", {
     welfare_type    = "all",
     reporting_level = "national",
     ppp             = NULL,
-    lkup            = lkup,
-    con             = con
+    lkup            = lkup
   )
 
   expect_equal(nrow(tmp$main_data), 0)
@@ -91,11 +87,11 @@ test_that("Imputation is working for interpolated aggregate distribution", {
     welfare_type    = "all",
     reporting_level = "all",
     ppp             = NULL,
-    lkup            = lkup,
-    con             = con
+    lkup            = lkup
   )
 
-  expect_equal(nrow(tmp$main_data), 2)
+  expect_equal(nrow(tmp$main_data), 0)
+  expect_equal(nrow(tmp$data_in_cache), 2)
 
   tmp <- fg_pip(
     country         = "CHN",
@@ -105,11 +101,11 @@ test_that("Imputation is working for interpolated aggregate distribution", {
     welfare_type    = "all",
     reporting_level = "national",
     ppp             = NULL,
-    lkup            = lkup,
-    con             = con
+    lkup            = lkup
   )
 
-  expect_equal(nrow(tmp$main_data), 2)
+  expect_equal(nrow(tmp$main_data), 0)
+  expect_equal(nrow(tmp$data_in_cache), 2)
 })
 
 
@@ -158,8 +154,7 @@ tmp <- fg_pip(
   welfare_type    = "all",
   reporting_level = "all",
   ppp             = NULL,
-  lkup            = lkup,
-  con             = con
+  lkup            = lkup
 )
 tmp <- tmp$data_in_cache |> as.data.table()
 # dt <- pip(country =  "ALL",
diff --git a/tests/testthat/test-pip-local.R b/tests/testthat/test-pip-local.R
index 59e8ac89..43cf92ce 100644
--- a/tests/testthat/test-pip-local.R
+++ b/tests/testthat/test-pip-local.R
@@ -841,3 +841,13 @@ test_that("SPL is the same by reporting level", {
 })
 
 
+
+test_that("make sure popshare bug no which was reported no longer exists", {
+  out <- pip(country = "USA", year = 2022,
+             popshare = .5, lkup = lkup)
+  # Ensure poverty line is not the default one
+  expect_false(out$poverty_line %in% c(1.9, 3))
+  # Ensure headcount is closer to 0.5
+  expect_equal(out$headcount, 0.5, tolerance = .05)
+})
+
diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R
index 7b2b53a4..da735a34 100644
--- a/tests/testthat/test-utils.R
+++ b/tests/testthat/test-utils.R
@@ -66,7 +66,8 @@ test_that("subset_lkup correctly selects all countries", {
                      reporting_level = "all",
                      lkup            = ref_lkup,
                      valid_regions = valid_regions,
-                     data_dir      = data_dir)
+                     data_dir      = data_dir,
+                     povline = NULL)
 
   expect_equal(nrow(tmp$lkup), nrow(ref_lkup))
 })
@@ -79,7 +80,8 @@ test_that("subset_lkup correctly selects countries", {
                      reporting_level = "all",
                      lkup            = ref_lkup,
                      valid_regions = valid_regions,
-                     data_dir      = data_dir)
+                     data_dir      = data_dir,
+                     povline = NULL)
 
   expect_equal(sort(unique(tmp$lkup$country_code)), sort(selection))
 })
@@ -92,7 +94,8 @@ test_that("subset_lkup correctly selects single regions", {
                      reporting_level = "all",
                      lkup            = ref_lkup,
                      valid_regions = valid_regions,
-                     data_dir      = data_dir)
+                     data_dir      = data_dir,
+                     povline = NULL)
 
   expect_equal(sort(unique(tmp$lkup$region_code)), sort(selection))
 })
@@ -105,7 +108,8 @@ test_that("subset_lkup correctly selects multiple regions", {
                      reporting_level = "all",
                      lkup            = ref_lkup,
                      valid_regions = valid_regions,
-                     data_dir      = data_dir)
+                     data_dir      = data_dir,
+                     povline = NULL)
 
   expect_equal(sort(unique(tmp$lkup$region_code)), sort(selection))
 })
@@ -122,7 +126,8 @@ test_that("subset_lkup correctly selects countries and regions", {
                      reporting_level = "all",
                      lkup            = ref_lkup,
                      valid_regions = valid_regions,
-                     data_dir      = data_dir)
+                     data_dir      = data_dir,
+                     povline = NULL)
 
   # Regions are selected
   expect_true(all(region_selection %in% (unique(tmp$lkup$region_code))))

From 449cf1592c079e58932ceeed086f5c8c39cfc59d Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Tue, 16 Sep 2025 12:09:56 -0400
Subject: [PATCH 139/203] add documentation

---
 DESCRIPTION |  2 +-
 NEWS.md     | 23 +++++++++++++++++++++++
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 2e8242b1..69cf9b62 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: pipapi
 Title: API for the Poverty and Inequality Platform
-Version: 1.3.19.9004
+Version: 1.3.24
 Authors@R: 
     c(person(given = "Tony",
             family = "Fujs",
diff --git a/NEWS.md b/NEWS.md
index 50441fb5..d13ef03c 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,28 @@
 # pipapi (development version)
 
+# pipapi 1.3.24
+* Incorporate Lineup distribution and Countries with Missing Data (CMD) distributions
+* New way to estimate poverty using cumulative sums
+* allow multiple `popshare`
+
+
+# pipapi 1.3.23
+* Add Venn diagram information to Country profiles
+* Add prosperity gap to country profiles chart.
+
+
+# pipapi 1.3.22
+
+* Fix issue with popshare in fill gaps calls.
+
+# pipapi 1.3.21
+* Fix issue with popshare in survey year calls
+* Fix problem with alternative aggregates like AFW and AFE
+* Make sure all tests pass
+* add logs in docker container
+
+# pipapi 1.3.20
+
 # pipapi 1.3.19
 * fix issue with comparability
 

From ed575a6689b22acb8a264b54962f36ecef0509d7 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Tue, 16 Sep 2025 12:11:01 -0400
Subject: [PATCH 140/203] Increment version number to 1.3.24.9000

---
 DESCRIPTION | 2 +-
 NEWS.md     | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 69cf9b62..0c170952 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: pipapi
 Title: API for the Poverty and Inequality Platform
-Version: 1.3.24
+Version: 1.3.24.9000
 Authors@R: 
     c(person(given = "Tony",
             family = "Fujs",
diff --git a/NEWS.md b/NEWS.md
index d13ef03c..4808fc19 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,4 +1,6 @@
 # pipapi (development version)
+* add new way to infer poverty lines
+
 
 # pipapi 1.3.24
 * Incorporate Lineup distribution and Countries with Missing Data (CMD) distributions

From 549152b9e06779419912d691b394d94569e951e2 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Tue, 16 Sep 2025 12:29:43 -0400
Subject: [PATCH 141/203] remove filter_Lkup from fg_pip()

---
 R/fg_pip.R | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/R/fg_pip.R b/R/fg_pip.R
index b0761839..ad28bf60 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -18,16 +18,13 @@ fg_pip <- function(country,
   valid_regions       <- lkup$query_controls$region$values
   interpolation_list  <- lkup$interpolation_list
   data_dir            <- lkup$data_root
-  refy_lkup           <- lkup$refy_lkup # cleaned refy table, unique by country-years but some columns removed in order to do that
+  refy_lkup           <- lkup$refy_lkup # cleaned refy table, unique by country-
+
+  #povline is set to NULL if popshare is given
   if (!is.null(popshare)) povline <- NULL
+  if (is.list(povline))   povline <- unlist(povline)
 
   cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb")
-  # fg_pip is called from multiple places like pip, pip_grp_logic. We have connection object created
-  # when calling from `pip`. For other functions we create it here.
-  # if (is.null(con)) {
-  #   cache_file_path <- fs::path(lkup$data_root, 'cache', ext = "duckdb")
-  #   con <- duckdb::dbConnect(duckdb::duckdb(), dbdir = cache_file_path, read_only = TRUE)
-  # }
 
   # Handle interpolation
   metadata <- subset_lkup(
@@ -46,10 +43,7 @@ fg_pip <- function(country,
   data_present_in_master <- metadata$data_present_in_master
   povline  <- metadata$povline
   metadata <- metadata$lkup
-  # Remove aggregate distribution if popshare is specified
-  # TEMPORARY FIX UNTIL popshare is supported for aggregate distributions
-  metadata <- filter_lkup(metadata = metadata,
-                          popshare = popshare)
+
   setDT(metadata)
 
   # Return empty dataframe if no metadata is found (i.e. all in cache)

From a0b8eb3f7fa09dfe2b4218c6dec5c4c7d59afec1 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Tue, 16 Sep 2025 12:51:19 -0400
Subject: [PATCH 142/203] add multiple popshare for fillgaps

---
 R/compute_fgt_new.R |  7 +++--
 R/fg_pip.R          | 62 ++++++++++++++++++++++++++++++++-------------
 2 files changed, 49 insertions(+), 20 deletions(-)

diff --git a/R/compute_fgt_new.R b/R/compute_fgt_new.R
index c6d52d39..6c68b139 100644
--- a/R/compute_fgt_new.R
+++ b/R/compute_fgt_new.R
@@ -210,9 +210,12 @@ map_fgt <- \(lt, l_rl_rows, povline) {
     rbindlist(fill = TRUE)
 }
 
-process_dt <- function(dt, povline, mean_and_med = FALSE) {
+process_dt <- function(dt, povline,
+                       mean_and_med = FALSE,
+                       id_var = "file") {
+  byvars <- c(id_var, "reporting_level")
   dt[, compute_fgt_dt(.SD, "welfare", "weight", povline, mean_and_med),
-     by = .(file, reporting_level)]
+     by = byvars]
 }
 
 #' load survey year files and store them in a list
diff --git a/R/fg_pip.R b/R/fg_pip.R
index ad28bf60..ccebf59a 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -42,9 +42,8 @@ fg_pip <- function(country,
 
   data_present_in_master <- metadata$data_present_in_master
   povline  <- metadata$povline
-  metadata <- metadata$lkup
-
-  setDT(metadata)
+  metadata <- metadata$lkup |>
+    setDT()
 
   # Return empty dataframe if no metadata is found (i.e. all in cache)
   if (nrow(metadata) == 0) {
@@ -62,22 +61,49 @@ fg_pip <- function(country,
 
   # Load all survey data files into a named list of data.tables, each with an id column.
   lfst <- load_list_refy(input_list = full_list)
+  # Calculate and update poverty line if popshare is passed
+  # YES. this is INEFFICIENT because welfare cumsum is already created in
+  # the data, but we don't have time... FIX for the next release
+  if (!is.null(popshare)) {
+    povline <- lapply(lfst, \(x) {
+      # wbpip:::md_infer_poverty_line(x$welfare, x$weight, popshare)
+      infer_poverty_line(welfare = x$welfare,
+                         weight = x$weight,
+                         popshare = popshare,
+                         include = FALSE,
+                         method = "nearest",
+                         assume_sorted = TRUE)
+    })
+
+    fgt <- Map(process_dt, lfst, povline, id_var = "id") |>
+      rbindlist(fill = TRUE)
+
+    fgt[, `:=`(
+      country_code   = gsub("(.+)(_.+)", "\\1", id),
+      reporting_year = as.integer(gsub("(.+_)(.+)", "\\2", id))
+    )][,
+       id := NULL]
+
+
+  } else {
+    # Combine all loaded surveys into a single data.table, encode group identifiers,
+    # and create a GRP object for efficient grouping.
+    LDTg <- format_lfst(lfst = lfst,
+                        dict = dict)
+
+    # Compute the total population (sum of weights) for each group (id_rl) in
+    # the combined survey data.
+    tpop <- get_total_pop(LDTg = LDTg)
+
+    # Compute FGT and Watts indices for all groups and poverty lines, then decode
+    # integer codes back to (country_code, reporting_year, reporting_level).
+    fgt <- fgt_cumsum(LDTg = LDTg,
+                      tpop = tpop,
+                      povline = povline) |>
+      decode_pairs(dict = dict)
+
+  }
 
-  # Combine all loaded surveys into a single data.table, encode group identifiers,
-  # and create a GRP object for efficient grouping.
-  LDTg <- format_lfst(lfst = lfst,
-                      dict = dict)
-
-  # Compute the total population (sum of weights) for each group (id_rl) in
-  # the combined survey data.
-  tpop <- get_total_pop(LDTg = LDTg)
-
-  # Compute FGT and Watts indices for all groups and poverty lines, then decode
-  # integer codes back to (country_code, reporting_year, reporting_level).
-  fgt <- fgt_cumsum(LDTg = LDTg,
-                    tpop = tpop,
-                    povline = povline) |>
-    decode_pairs(dict = dict)
 
   # Add just mean and median
   res <- get_mean_median(fgt, lkup, fill_gaps = TRUE)

From 57c642d12d128dc239b34b144819145346ff769f Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Tue, 16 Sep 2025 16:51:10 -0400
Subject: [PATCH 143/203] update empty data

---
 data-raw/data.R                |   2 +-
 data/empty_response_fg.rda     | Bin 670 -> 735 bytes
 data/empty_response_fg_add.rda | Bin 0 -> 754 bytes
 3 files changed, 1 insertion(+), 1 deletion(-)
 create mode 100644 data/empty_response_fg_add.rda

diff --git a/data-raw/data.R b/data-raw/data.R
index a0a6a16f..2a87ea42 100644
--- a/data-raw/data.R
+++ b/data-raw/data.R
@@ -44,7 +44,7 @@ fg <- fg_pip(
   popshare = NULL,
   lkup = lkup
 ) |>
-  rbindlist()
+  rbindlist(fill = TRUE)
 empty_response_fg <- fg[-1]
 
 
diff --git a/data/empty_response_fg.rda b/data/empty_response_fg.rda
index 6785534c663fcb70b60f3c44a565efdf30383c72..31da4426fd98a9165dd7372c46428379c7e6115a 100644
GIT binary patch
literal 735
zcmV<50wDcDT4*^jL0KkKSuA~}L;wR7f589$|9}7iIsl9SP(Z)u-=IJM00B?~z5oCK
z0}TxT00E!?000a$GynhwfB*mhFwoEd02%-Q006^7KmY(}00000sU*mNjWh~+GM)(&
zX)-+<O-~fZ2Gw%Pq*SFPP^wiTsw$TE#*uq)#ga+LN~EfZ9+Zghv*RRB<L75)#xML^
zT#$b>GI-e{9BN2CAN_@tUGTHfPwDRaGF?pSb!K}qJ&JWUu=QCzS+Y#j*s+?+C-}SF
z%DY>B=4*VHG;C<*UB-A&o-qQVB&v#vf+cv{o}|43Dt<oML^2|QNfl63kyMVa-{YGv
z$KUU>yIlJE{ye{LUgv|Y;PZB@;=4D3?3pKW$C8%NyXeVgWu7f;iWpN3s%WIbm@J;m
zrly6LX1dC1F1tlGsYLfSEl-9|hPR`=n(fx4!-ABhDYb=@f&ZQSi7AsckE_LHIpbn|
z+8c|Xu14}Mr}Et9GCxQ5*4o~1ELle~%1&lU+V*Tlr&$^dI+~70O*xctQg6D-b*!@M
zD9v}v5BkyA*2!`6Pn$xSCUQ>VcCoq1gkaK2O&NN&w-Z}5j%=Nd=Tv-Otodw48lI~=
zS(is!tiqgB^mLtx(3H%6PA)GN!q3C8(3eL}w~CoK6jF*i9}ib4tl*=dojpDY(%xuu
z%M(dP<_{&^OEyI{zqX|P)M0Z=8<{CW#VIJ=gQNA>k%;@PB^GAXM)ah?7Rg6>=2;j^
zr7V`B%^Eq|m@Y26Cz-*>?|7W!&A6oAm76y@J)g7lZ%30v(Z%k=hDpSuYo^RC-rm@!
z(Y5C8QD!`vjjhFPbT=m3rV&Zlk*K#tS@TcJ2X^x6GviF0uZI@(A48z%Z)!Fq-A*gX
zX9pwXjFW<wVCH1jW^_|f)^m$F);1($c(7oYQj;`_@N~OYeO!i(6ml|IN*OIIZAx;o
R9=~J#7ji{7P>?Krr9^fgVY~nU

literal 670
zcmV;P0%83^T4*^jL0KkKS+{~~%m4zVe}Mo0|9}7iIsgm+P(Z)u-=F{iPy=2C0009?
z6GjtEj7&@klT1xC#zJ9$0B8UVng9R|0e}E#01TP{01W|v0B8UVng9R|0926#&?X@X
z<vmZ-)Y0l5lO}12sgCXZH!PM=sZ^CoYFet2saWS1vkD?^NmUT?q8--utBK>;XTwHV
zWo=O;`O$x$#_<D<LQf+C?`57!eO6(4%LCDK(5FjAFT%&+lIfC^V%D+d!Rp+>qjuIy
zT*;#+qX$Ips6S{VsSzroqNI@(#bJ7o6+Zt<$S)B@ltoYsK!J=1BF&mi@8Y_p540X?
zkr0$ZB?*ZFK_F2N0nJNfVDD2S2YcMPLlrS=Q#B?kVp%;Gnwgeh$%4viFksEGC#kJ!
zS?bjLJD9L+wI(iwDN0jCCqd%w+>)1KvEb&ioTEhlnU{Tclf|*bx$=D(wYHQ}jteO|
z7j`__BRAz4j9%uWr>)R(Q+5j}?^$LElLu#uH|WuJq1v6S%3U&%>`vA@Tk%FFa#Ci$
zn`t#MNaJ@@dt0pixT6+NHJyx%6RpteejO*hIVClZbG@kGQaIw~rQ1p=M_bwE*l2B0
zz?^vZCl=!~gt0S}U~^r`W|2?3B>xm()@NeKqLh?t!0P@qNsp4!QD*dRN-zs*QQCY<
zBN0i37G&hwz|n&R+|CW|*CEZYq}r8>7bJST9nGllW|_Y&{F8}B$5omw&AF{l%bJa>
zJ}k{`g><{@mqjPRBO={ZXJUM@JGK}o%atzN_Hb@d;Bq-PH5$@wrxp@f!M)gvrv)%#
zab2O2*G)r1r7Y1Rj`vY8Qj;W#_`2V!&+WX;nvLtMr3<YrYf5x3KF9tp<ce^iA#ViN
Em|W^NBme*a

diff --git a/data/empty_response_fg_add.rda b/data/empty_response_fg_add.rda
new file mode 100644
index 0000000000000000000000000000000000000000..3acc0ca37fdbe6c63804c26965d62dfaf76c87ea
GIT binary patch
literal 754
zcmV<O0uB8_T4*^jL0KkKSsS;=)BpoEf589$|9}7iIshC1P(Z)u-=IJM00B?~-U5IC
z05ufEVr0pt695UL089X3p`ZW&Gynhq0fvTv007Vc000IW8UO$TKmY&$1xXM=qGZ!k
zOqw-4H8eD2X^kezW50%rEU8qAl%%Q^N~l#OQr>vy7+Z6OSAe)xB~(cJ(j&glOQL+e
zmUe7ohtAr`50w)CKYhf9DoGC~ljzB1KB_W$Eb>!p&GlV^nN;=UepfUp*3pYalkmxO
z$xVwHEV6&h9&UyWZQ3$+urq9C;?;|$ev~JKh$<pVsHmVxUKZt2VMz$7<@CHl@<jrY
zDxjo-l!Oe^$CES16kkA;X`G+5d`%ekCK3n%N1$lzBL;&7)4gNGOJ-o$y3C6`?Q4`V
zQx?@TQevhhlk&XO%(Dhe7E@7!25N0miRNrto{PO2)%SO@U8Sipa8i_|HnCYaAKSlr
zN?nr2#o)4>qhfuTniwCo&BS1*&7+BP;QF<;w;T;uE~ARdPAjK#^4g7atc=DlQ&G=p
z#Yak$b_*%ivdk7ylLu}ulQ&~p><6(ujLKayl6y(q$46Eu#Lh}g*ZelLCbmf(H+DU)
zxb(x;bUSvVCb!{db29{Wy3ADMr`@#dPNb$o*6DP-8w)(0&ZNN{^}H0jfkh~zvFQ9d
z4QB;7snkw<{1X=AGnBD2lxW2FuzHqkifesZllf7L13MZnr3)0KqfAb(+hRsB_gYFU
z$*7IVNr@J^j`Q^_j8g=qlGIt7Hztfs1`HQ?oEzQm1Ca53n~M$US+jK`^7y;k(c;ZB
za%*|Aai$5#qerUD7X2IBRQNQ0uX2kq@oF}-D{HBuZNV``Cu&Bb+-lC`?8NtOIk8yI
z?p9#n^m1=e@Hw2@+Kq`fQ<DiS;^=mw>BUSKoh!AO9O<ZQI)$8;HY8m4u`wnpQe=@{
k4sPY2(|MaU8`oJ%7g|`_l;~W0zK8xU<ce^iAUAK3sKT6K+yDRo

literal 0
HcmV?d00001


From 3c153cf62d8d605b3224e8568fa2edf505536d3d Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Tue, 16 Sep 2025 16:51:29 -0400
Subject: [PATCH 144/203] document

---
 man/infer_poverty_line.Rd | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/man/infer_poverty_line.Rd b/man/infer_poverty_line.Rd
index 3344c543..e00f00ef 100644
--- a/man/infer_poverty_line.Rd
+++ b/man/infer_poverty_line.Rd
@@ -10,28 +10,37 @@ infer_poverty_line(
   popshare = 0.5,
   include = FALSE,
   method = c("nearest", "interp"),
-  assume_sorted = FALSE
+  assume_sorted = TRUE
 )
 }
 \arguments{
-\item{welfare}{Numeric vector of welfare values (e.g., income or consumption).}
+\item{welfare}{Numeric vector of welfare values (e.g., income or
+consumption).}
 
-\item{weight}{Numeric vector of sampling weights (must be non-negative, same length as welfare).}
+\item{weight}{Numeric vector of sampling weights (must be non-negative, same
+length as welfare).}
 
-\item{popshare}{Numeric vector of population shares (probabilities in \link{0,1}); default is 0.5 (median).}
+\item{popshare}{Numeric vector of population shares (probabilities in \link{0,1});
+default is 0.5 (median).}
 
-\item{include}{Logical; if TRUE, averages neighbors for ties (only for method = "nearest").}
+\item{include}{Logical; if TRUE, averages neighbors for ties (only for method
+= "nearest").}
 
-\item{method}{Character; either "nearest" (default, discrete quantile) or "interp" (weighted linear interpolation).}
+\item{method}{Character; either "nearest" (default, discrete quantile) or
+"interp" (weighted linear interpolation).}
 
-\item{assume_sorted}{Logical; if TRUE, assumes welfare and weight are already sorted by welfare.}
+\item{assume_sorted}{Logical; if TRUE, assumes welfare and weight are already
+sorted by welfare.}
 }
 \value{
-Numeric vector of poverty line(s) corresponding to the requested population share(s).
+Numeric vector of poverty line(s) corresponding to the requested
+population share(s).
 }
 \description{
-Computes the welfare value (poverty line) corresponding to a given population share, using either nearest or interpolated weighted quantile methods.
-Supports both discrete (nearest) and linear interpolation approaches, and can optionally average neighbors for ties.
+Computes the welfare value (poverty line) corresponding to a given population
+share, using either nearest or interpolated weighted quantile methods.
+Supports both discrete (nearest) and linear interpolation approaches, and can
+optionally average neighbors for ties.
 }
 \details{
 \itemize{

From 27deea46b4fb80617c46621a7f8e75cb4b84864d Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Tue, 16 Sep 2025 17:16:40 -0400
Subject: [PATCH 145/203] fix bug

---
 R/pip_new_lineups.R | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/R/pip_new_lineups.R b/R/pip_new_lineups.R
index 7838e3a1..f9610cae 100644
--- a/R/pip_new_lineups.R
+++ b/R/pip_new_lineups.R
@@ -315,7 +315,9 @@ treat_cache_and_main <- \(out, cache_file_path,
         rowbind(cached_data)
     }
 
-    pl <- get_from_pipapienv("pl_to_store")
+    pl      <- get_from_pipapienv("pl_to_store")
+    povline <- main_data[, poverty_line] |>
+      unique()
     # Only update master file if poverty line is part of this pl list
     # Using round to avoid precision error with decimals
     if (all(round(povline, 2) %in% round(pl, 2))) {

From 81691c6d8a833610db0cf058fa416976ed71972e Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Tue, 16 Sep 2025 17:16:48 -0400
Subject: [PATCH 146/203] Increment version number to 1.3.24.9001

---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 0c170952..1c7b2be4 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: pipapi
 Title: API for the Poverty and Inequality Platform
-Version: 1.3.24.9000
+Version: 1.3.24.9001
 Authors@R: 
     c(person(given = "Tony",
             family = "Fujs",

From 0f59379cba13ca4abcacb972111f16e520dba233 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Tue, 16 Sep 2025 21:44:11 -0400
Subject: [PATCH 147/203] sort for surveys with more than 1 reporting_level

---
 R/fg_pip.R | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/R/fg_pip.R b/R/fg_pip.R
index ccebf59a..bcb049df 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -67,12 +67,18 @@ fg_pip <- function(country,
   if (!is.null(popshare)) {
     povline <- lapply(lfst, \(x) {
       # wbpip:::md_infer_poverty_line(x$welfare, x$weight, popshare)
+      uni_rl <- funique(x$reporting_level)
+      if (length(uni_rl) > 1) {
+        assume_sorted <- FALSE
+      } else {
+        assume_sorted <- TRUE
+      }
       infer_poverty_line(welfare = x$welfare,
                          weight = x$weight,
                          popshare = popshare,
                          include = FALSE,
                          method = "nearest",
-                         assume_sorted = TRUE)
+                         assume_sorted = assume_sorted)
     })
 
     fgt <- Map(process_dt, lfst, povline, id_var = "id") |>

From c3da472a31dec11a7cc8b793d2e5096cefb848e8 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Tue, 16 Sep 2025 21:44:22 -0400
Subject: [PATCH 148/203] Increment version number to 1.3.24.9002

---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 1c7b2be4..757b624c 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: pipapi
 Title: API for the Poverty and Inequality Platform
-Version: 1.3.24.9001
+Version: 1.3.24.9002
 Authors@R: 
     c(person(given = "Tony",
             family = "Fujs",

From 3985f5a26995766d4811e7896b463d49cb480849 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Tue, 16 Sep 2025 22:15:41 -0400
Subject: [PATCH 149/203] remove verbose in join

---
 R/fgt_cumsum.R | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/R/fgt_cumsum.R b/R/fgt_cumsum.R
index d34c3ecd..8128b64f 100644
--- a/R/fgt_cumsum.R
+++ b/R/fgt_cumsum.R
@@ -101,7 +101,8 @@ fgt_cumsum <- \(LDTg, tpop, povline,
          on = "id_rl",
          how = "left",
          validate = "m:1",
-         drop.dup.cols = "y") |>
+         drop.dup.cols = "y",
+         verbose = 0) |>
     setorder(id_rl, index)
 
 
From 2503a9379fae549771a106bc8501ab2da60915e7 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Tue, 16 Sep 2025 22:15:50 -0400
Subject: [PATCH 150/203] Increment version number to 1.3.24.9003

---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 757b624c..3683c1c2 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: pipapi
 Title: API for the Poverty and Inequality Platform
-Version: 1.3.24.9002
+Version: 1.3.24.9003
 Authors@R: 
     c(person(given = "Tony",
             family = "Fujs",

From 247caa5f086e2d043d24dac26706a17ee1a245ad Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Wed, 17 Sep 2025 16:39:17 -0400
Subject: [PATCH 151/203] make use of missing data file instead of recreate i

---
 R/create_lkups.R | 48 ++++++++++++++----------------------------------
 1 file changed, 14 insertions(+), 34 deletions(-)

diff --git a/R/create_lkups.R b/R/create_lkups.R
index 6b42381d..9d9c9aa7 100644
--- a/R/create_lkups.R
+++ b/R/create_lkups.R
@@ -214,42 +214,19 @@ create_lkups <- function(data_dir, versions) {
       ncountries <- nrow(country_list)
       ly <- lineup_years$lineup_years
 
+      cmd <- fs::path(data_dir,
+                     "_aux/missing_data.fst") |>
+        fst::read_fst(as.data.table = TRUE) |>
+        fselect(country_code,
+                reporting_year = year,
+                welfare_type)
 
-      # Add reporting year
-      cmd <- CJ(country_list$country_code,
-                 ly, c("national", "urban", "rural"))
-
-      setnames(cmd, new = c('country_code',
-                            'reporting_year',
-                            'reporting_level'))
-
-      cmd <- joyn::joyn(x          = cmd,
-                         y          = refy_lkup,
-                         by         = c('country_code',
-                                        'reporting_year',
-                                        'reporting_level'),
-                         keep       = "anti",
-                         # reportvar  = FALSE,
-                         match_type = "1:1") |>
-        setDT()
-
-      # get number or reporing levels left after anti join
-      # if less than three, it should NOT be CMD (e.g., ARG or CHN)
-      cmd[, n := .N,
-          by = c("country_code", "reporting_year")]
-
-      # Delete unnecessary reporting levels
-      cmd <- cmd[(reporting_level == "national" &
-                     .joyn == "x" & n == 3)
-                 ]
 
       # build some variables
       cmd[,
            `:=`(
-               cache_id = paste(country_code, reporting_year,"NOSVY_D1_CON_CMD",
-                                sep = "_"),
                survey_coverage = "national",
-               welfare_type = "consumption",
+               reporting_level = "national",
                distribution_type = "CMD distribution",
                is_interpolated = FALSE,
                is_used_for_line_up = TRUE,
@@ -261,10 +238,13 @@ create_lkups <- function(data_dir, versions) {
                relative_distance = 1,
                lineup_approach = "CMD",
                mult_factor = 1,
-               .joyn = NULL,
-               n     = NULL
-
-             )]
+               wt_code  = toupper(substr(welfare_type, 1, 3))
+             )][,
+                cache_id := paste(country_code,
+                                  reporting_year,
+                                  paste0("NOSVY_D1_", wt_code,"_CMD"),
+                                 sep = "_")
+                ][, wt_code := NULL]
 
       # Append lineup and CMD info
 

From 7e3cdb8b74969b24e5584a5477421bad07c63ce8 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Thu, 18 Sep 2025 13:01:24 -0400
Subject: [PATCH 152/203] Increment version number to 1.3.24.9004

---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 3683c1c2..b35e67d7 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: pipapi
 Title: API for the Poverty and Inequality Platform
-Version: 1.3.24.9003
+Version: 1.3.24.9004
 Authors@R: 
     c(person(given = "Tony",
             family = "Fujs",

From f04612b93ff7b0843cd6d4c9caa7f3b4ece5412b Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Thu, 18 Sep 2025 17:00:14 -0400
Subject: [PATCH 153/203] remove heavy objects and apply gc()

---
 R/fg_pip.R          | 3 +++
 R/pip_new_lineups.R | 3 ++-
 R/rg_pip.R          | 2 ++
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/R/fg_pip.R b/R/fg_pip.R
index bcb049df..f58d4890 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -108,7 +108,10 @@ fg_pip <- function(country,
                       povline = povline) |>
       decode_pairs(dict = dict)
 
+  rm(LDTg)
   }
+  rm(lfst)
+  invisible(gc())
 
 
   # Add just mean and median
diff --git a/R/pip_new_lineups.R b/R/pip_new_lineups.R
index f9610cae..22fea671 100644
--- a/R/pip_new_lineups.R
+++ b/R/pip_new_lineups.R
@@ -309,7 +309,7 @@ treat_cache_and_main <- \(out, cache_file_path,
 
   if (nrow(main_data) > 0) {
     if (is.null(cached_data)) {
-      out <- main_data
+      out <- copy(main_data)
     } else {
       out <- main_data |>
         rowbind(cached_data)
@@ -323,6 +323,7 @@ treat_cache_and_main <- \(out, cache_file_path,
     if (all(round(povline, 2) %in% round(pl, 2))) {
       update_master_file(main_data, cache_file_path, fill_gaps)
     }
+    rm(main_data)
 
   } else {
     out <- cached_data
diff --git a/R/rg_pip.R b/R/rg_pip.R
index 905677b0..0b216e2b 100644
--- a/R/rg_pip.R
+++ b/R/rg_pip.R
@@ -77,6 +77,8 @@ rg_pip <- function(country,
     # if povline is vector, it should be applied to all surveys in lt
     res <- lapply(lt, process_dt, povline = povline)
   }
+  rm(lt)
+  invisible(gc())
 
   res <- rbindlist(res, fill = TRUE)
 

From 1b5447b4ef3f41819d0fafcc66d2179ef4765b28 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Thu, 18 Sep 2025 17:46:36 -0400
Subject: [PATCH 154/203] filter properly proverty lines to update master
 (inter cache) file

---
 R/duckdb_func.R     | 28 +++++++++++++++++++++++++++-
 R/pip_new_lineups.R |  9 +--------
 R/pip_old.R         | 14 ++++----------
 R/pip_old_lineups.R |  7 -------
 4 files changed, 32 insertions(+), 26 deletions(-)

diff --git a/R/duckdb_func.R b/R/duckdb_func.R
index bc9c89be..2d8647c6 100644
--- a/R/duckdb_func.R
+++ b/R/duckdb_func.R
@@ -206,9 +206,35 @@ return_if_exists <- function(slkup,
 update_master_file <- function(dat,
                                cache_file_path,
                                fill_gaps,
-                               verbose = getOption("pipapi.verbose")
+                               verbose = getOption("pipapi.verbose"),
+                               decimal = 2
                                ) {
 
+  # select the right lines
+  pl      <- get_from_pipapienv("pl_to_store")
+
+
+  # Keep only rows with <= 2 decimal places
+  to_keep <- get_vars(dat, "poverty_line") |>
+    reg_elem() |> # extract vectos
+    as.character() |>
+    sub("^[^.]*\\.?","", x = _) |> # get only the decimal part
+    (\(x) which(nchar(x) <= decimal))()
+
+  dat <- dat[to_keep]
+
+  povline <- dat[, poverty_line] |>
+    unique()
+
+  # Keep only those that belong to the list
+  wpl <- povline[povline %in% round(pl, decimal)]
+
+  if (length(wpl) == 0) return(invisible(FALSE))
+
+  dat <- dat[poverty_line %in% wpl]
+
+  if (nrow(dat) == 0) return(invisible(FALSE))
+
   write_con <- connect_with_retry(cache_file_path, read_only = FALSE)
 
   if (fill_gaps) {
diff --git a/R/pip_new_lineups.R b/R/pip_new_lineups.R
index 22fea671..a7c51b3e 100644
--- a/R/pip_new_lineups.R
+++ b/R/pip_new_lineups.R
@@ -315,14 +315,7 @@ treat_cache_and_main <- \(out, cache_file_path,
         rowbind(cached_data)
     }
 
-    pl      <- get_from_pipapienv("pl_to_store")
-    povline <- main_data[, poverty_line] |>
-      unique()
-    # Only update master file if poverty line is part of this pl list
-    # Using round to avoid precision error with decimals
-    if (all(round(povline, 2) %in% round(pl, 2))) {
-      update_master_file(main_data, cache_file_path, fill_gaps)
-    }
+    update_master_file(main_data, cache_file_path, fill_gaps)
     rm(main_data)
 
   } else {
diff --git a/R/pip_old.R b/R/pip_old.R
index 3378876d..e0194c20 100644
--- a/R/pip_old.R
+++ b/R/pip_old.R
@@ -141,18 +141,12 @@ pip_old <- function(country         = "ALL",
 
   if (nrow(main_data) > 0) {
     out <- main_data |>
-      collapse::fmutate(path = as.character(path)) |>
-      collapse::rowbind(cached_data)
+      fmutate(path = as.character(path)) |>
+      rowbind(cached_data)
     # cached_data is NULL when we are querying live data in which case we don't update cache
     # This will be used only for development purpose and we don't have any intention to use it in production.
-    if(!is.null(cached_data)) {
-      # Update cache with data
-      pl <- get_from_pipapienv("pl_to_store")
-      # Only update master file if poverty line is part of this pl list
-      # Using round to avoid precision error with decimals
-      if (all(round(povline, 2) %in% round(pl, 2))) {
-        update_master_file(main_data, cache_file_path, fill_gaps)
-      }
+    if (!is.null(cached_data)) {
+      update_master_file(main_data, cache_file_path, fill_gaps)
     }
   } else {
     out <- cached_data
diff --git a/R/pip_old_lineups.R b/R/pip_old_lineups.R
index f7000674..1bb86dea 100644
--- a/R/pip_old_lineups.R
+++ b/R/pip_old_lineups.R
@@ -145,14 +145,7 @@ pip_old_lineups <- function(country         = "ALL",
     out <- main_data |>
       rowbind(cached_data)
 
-
-    pl <- get_from_pipapienv("pl_to_store")
-    # Only update master file if poverty line is part of this pl list
-    # Using round to avoid precision error with decimals
-    if (all(round(povline, 2) %in% round(pl, 2))) {
       update_master_file(main_data, cache_file_path, fill_gaps)
-    }
-
 
   } else {
     out <- cached_data

From c76bbd394b86a9c7f03973e7b27af5df30257c46 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Thu, 18 Sep 2025 18:30:27 -0400
Subject: [PATCH 155/203] Increment version number to 1.3.24.9005

---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index b35e67d7..a4b955d8 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: pipapi
 Title: API for the Poverty and Inequality Platform
-Version: 1.3.24.9004
+Version: 1.3.24.9005
 Authors@R: 
     c(person(given = "Tony",
             family = "Fujs",

From 84ecf2877c5e44e7c6dd59631dc0821dc8b15544 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Fri, 19 Sep 2025 08:10:44 -0400
Subject: [PATCH 156/203] remove gc()

---
 R/duckdb_func.R | 8 ++++----
 R/fg_pip.R      | 2 +-
 R/rg_pip.R      | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/R/duckdb_func.R b/R/duckdb_func.R
index 2d8647c6..12b47e80 100644
--- a/R/duckdb_func.R
+++ b/R/duckdb_func.R
@@ -291,7 +291,7 @@ update_master_file <- function(dat,
   "))
 
   duckdb::dbDisconnect(write_con)
-  invisible(gc())
+  
   if (nr > 0 && verbose)  message(glue("{target_file} is updated."))
 
   return(nr)
@@ -362,7 +362,7 @@ reset_cache <- function(pass = Sys.getenv('PIP_CACHE_LOCAL_KEY'),
     DBI::dbExecute(write_con, "DELETE from fg_master_file")
   }
   duckdb::dbDisconnect(write_con)
-  invisible(gc())
+  
 }
 
 create_duckdb_file <- function(cache_file_path) {
@@ -388,7 +388,7 @@ create_duckdb_file <- function(cache_file_path) {
                  watts     DOUBLE
   )")
   DBI::dbDisconnect(con)
-  invisible(gc())
+  
 }
 
 
@@ -423,7 +423,7 @@ load_inter_cache <- function(lkup = NULL,
   # connection object if it is not closed More details here
   # https://app.clickup.com/t/868cdpe3q
   duckdb::dbDisconnect(con)
-  invisible(gc())
+  
 
   setDT(master_file)
 }
diff --git a/R/fg_pip.R b/R/fg_pip.R
index f58d4890..726a0779 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -111,7 +111,7 @@ fg_pip <- function(country,
   rm(LDTg)
   }
   rm(lfst)
-  invisible(gc())
+  
 
 
   # Add just mean and median
diff --git a/R/rg_pip.R b/R/rg_pip.R
index 0b216e2b..a1cce53c 100644
--- a/R/rg_pip.R
+++ b/R/rg_pip.R
@@ -78,7 +78,7 @@ rg_pip <- function(country,
     res <- lapply(lt, process_dt, povline = povline)
   }
   rm(lt)
-  invisible(gc())
+  
 
   res <- rbindlist(res, fill = TRUE)
 

From db6fcd871d297d840791c09792b4c93aaf0c7f33 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Fri, 19 Sep 2025 08:29:22 -0400
Subject: [PATCH 157/203] Increment version number to 1.3.24.9006

---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index a4b955d8..f552ad3f 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: pipapi
 Title: API for the Poverty and Inequality Platform
-Version: 1.3.24.9005
+Version: 1.3.24.9006
 Authors@R: 
     c(person(given = "Tony",
             family = "Fujs",

From 847fb20dc5d1e2bdae0896de81d467d46f5730d7 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Fri, 19 Sep 2025 08:56:14 -0400
Subject: [PATCH 158/203] fix bug req$args to req$argsQuery

---
 inst/TMP/TMP_duckdb_cache.R | 59 ++++++++++++++++++++++++++-----------
 inst/plumber/v1/endpoints.R |  6 ++--
 2 files changed, 44 insertions(+), 21 deletions(-)

diff --git a/inst/TMP/TMP_duckdb_cache.R b/inst/TMP/TMP_duckdb_cache.R
index e37562d9..daae5baf 100644
--- a/inst/TMP/TMP_duckdb_cache.R
+++ b/inst/TMP/TMP_duckdb_cache.R
@@ -1,38 +1,61 @@
+# SETUP  -------------
+
 devtools::load_all(".")
+library(fastverse)
 force <- FALSE
-if (!"lkups" %in% ls() || isTRUE(force)) {
-  data_dir <- Sys.getenv("PIPAPI_DATA_ROOT_FOLDER_LOCAL") |>
-    fs::path()
-  fs::dir_ls(data_dir, recurse = FALSE)
-}
 
+root_dir <- Sys.getenv("PIPAPI_DATA_ROOT_FOLDER_LOCAL") |>
+  fs::path()
+fs::dir_ls(root_dir, recurse = FALSE)
 
+#
 latest_version <-
-  pipapi:::available_versions(data_dir) |>
+  available_versions(root_dir) |>
   max()
 
-latest_version <- NULL
-latest_version <- "20240627_2017_01_02_PROD"
-lkups <- create_versioned_lkups(data_dir,
+# lkups <- create_versioned_lkups(root_dir,
+#                                 vintage_pattern = "^202509.+2017.+(PROD)$")
+lkups <- create_versioned_lkups(root_dir,
                                 vintage_pattern = latest_version)
 
-lkup <- lkups$versions_paths[[lkups$latest_release]]
+# lkup <- lkups$versions_paths[[lkups$versions[[2]]]]
+ver_to_use <- lkups$latest_release # this is important. You need this object below
 
+lkup <- lkups$versions_paths[[ver_to_use]]
 
-reset_cache(lkup = lkup)
 
 
+# DEGUB -------------
 
-# 1.
-pip(country = "all", year = 2000, lkup = lkup)
 
-# 2.
-pip(country = "AGO", year = 2000, lkup = lkup)
+options(pipapi.query_live_data = FALSE)
+getOption("pipapi.query_live_data")
 
+reset_cache(lkup = lkup)
 
-pip(country = "all", year = "all", lkup = lkup)
+tictoc::tic()
+sv <- pip(country = "ALL",
+          year = "ALL",
+          povline = lkup$pl_lkup$poverty_line,
+          lkup = lkup,
+          fill_gaps = FALSE)
+tictoc::toc()
 
 
-pip(country = "IND", year = 2018, lkup = lkup)
+tictoc::tic()
+fg <- pip(country = "ALL",
+          year = "ALL",
+          povline = lkup$pl_lkup$poverty_line,
+          lkup = lkup,
+          fill_gaps = TRUE)
+tictoc::toc()
 
-pip(country = "IND", year = "all", lkup = lkup)
+
+# copy cache to TFS folder
+ori_cache <- fs::path(lkup$data_root, "cache.duckdb")
+dest_cache <- Sys.getenv("PIPAPI_DATA_ROOT_FOLDER_SERVER") |>
+  fs::path(ver_to_use, "cache.duckdb")
+
+if (fs::file_exists(ori_cache)) {
+  fs::file_copy(ori_cache, dest_cache, overwrite = TRUE)
+}
diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R
index e7345ecb..575c9a08 100644
--- a/inst/plumber/v1/endpoints.R
+++ b/inst/plumber/v1/endpoints.R
@@ -299,7 +299,7 @@ function(req, res) {
   params <- req$argsQuery
   res$serializer <- pipapi::assign_serializer(format = params$format)
 
-  if (is.null(req$args$table)) {
+  if (is.null(req$argsQuery$table)) {
     # return all available tables if none selected
     list_of_tables <- lkups$versions_paths[[params$version]]$aux_tables
     out <- data.frame(tables = list_of_tables)
@@ -796,7 +796,7 @@ function(req, res) {
   params <- req$argsQuery
   exclude <- req$argsQuery$exclude
   res$serializer <- pipapi::assign_serializer(format = params$format)
-  if (is.null(req$args$table)) {
+  if (is.null(req$argsQuery$table)) {
     # return all available tables if none selected
     list_of_tables <- lkups$versions_paths[[params$version]]$aux_tables
     out <- data.frame(tables = list_of_tables)
@@ -808,7 +808,7 @@ function(req, res) {
     params$exclude <- NULL
     out <- do.call(pipapi::get_aux_table_ui, params)
 
-    if (req$args$table == "countries" && exclude == TRUE) {
+    if (req$argsQuery$table == "countries" && exclude == TRUE) {
       # hardcoded
       to_remove <- c("MDG", "UKR")
       out <- out[!(country_code %in% to_remove)]

From 3b433c50e302a8134b8dabb3f0bb64fa1bf780e6 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Fri, 19 Sep 2025 08:56:24 -0400
Subject: [PATCH 159/203] Increment version number to 1.3.24.9007

---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index f552ad3f..9d167253 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: pipapi
 Title: API for the Poverty and Inequality Platform
-Version: 1.3.24.9006
+Version: 1.3.24.9007
 Authors@R: 
     c(person(given = "Tony",
             family = "Fujs",

From 5afa3d51da9c39cf9a4b775b407edd04716763d7 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Fri, 19 Sep 2025 10:11:01 -0400
Subject: [PATCH 160/203] handle errors

---
 R/zzz.R                     |  8 ++++----
 inst/TMP/TMP_duckdb_cache.R |  7 +++----
 inst/plumber/v1/plumber.R   | 14 ++++++++++++--
 3 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/R/zzz.R b/R/zzz.R
index 302841c5..0f481c3d 100644
--- a/R/zzz.R
+++ b/R/zzz.R
@@ -37,10 +37,10 @@ pipapi_default_options <- list(
 
 
   # set multi threats
-  available_cores <- parallel::detectCores() - 1
-
-  cores_to_use <- max(available_cores, 1) |>
-    min(8)
+  # available_cores <- parallel::detectCores() - 1
+  #
+  # cores_to_use <- max(available_cores, 1) |>
+  #   min(8)
   set_in_pipapienv("cores_to_use", cores_to_use)
 
 
diff --git a/inst/TMP/TMP_duckdb_cache.R b/inst/TMP/TMP_duckdb_cache.R
index daae5baf..b1e7137a 100644
--- a/inst/TMP/TMP_duckdb_cache.R
+++ b/inst/TMP/TMP_duckdb_cache.R
@@ -2,7 +2,6 @@
 
 devtools::load_all(".")
 library(fastverse)
-force <- FALSE
 
 root_dir <- Sys.getenv("PIPAPI_DATA_ROOT_FOLDER_LOCAL") |>
   fs::path()
@@ -13,10 +12,10 @@ latest_version <-
   available_versions(root_dir) |>
   max()
 
-# lkups <- create_versioned_lkups(root_dir,
-#                                 vintage_pattern = "^202509.+2017.+(PROD)$")
 lkups <- create_versioned_lkups(root_dir,
-                                vintage_pattern = latest_version)
+                                vintage_pattern = "^202509.+2017.+(PROD)$")
+# lkups <- create_versioned_lkups(root_dir,
+#                                 vintage_pattern = latest_version)
 
 # lkup <- lkups$versions_paths[[lkups$versions[[2]]]]
 ver_to_use <- lkups$latest_release # this is important. You need this object below
diff --git a/inst/plumber/v1/plumber.R b/inst/plumber/v1/plumber.R
index c5441da4..32056e79 100644
--- a/inst/plumber/v1/plumber.R
+++ b/inst/plumber/v1/plumber.R
@@ -29,10 +29,20 @@ plumber::pr(endpoints_path) |>
     # log_info('Bye bye: {proc.time()[["elapsed"]]}')
   }) |>
   plumber::pr_set_error(function(req, res, err) {
-    # In case of error, make sure you log the endpoint for #432
+    # Log error with request info
     method <- req$REQUEST_METHOD
     path <- req$PATH_INFO
-    cat(sprintf("ERROR at %s %s: %s\n", method, path, err$message))
+    msg <- sprintf("ERROR at %s %s: %s\n", method, path, err$message)
+    cat(msg)
+    # Always return a JSON error response
+    res$status <- 500
+    res$body <- jsonlite::toJSON(list(
+      error = "Internal Server Error",
+      message = err$message,
+      path = path,
+      method = method
+    ), auto_unbox = TRUE)
+    res
   }) |>
   # Set API spec
   plumber::pr_set_api_spec(api = function(spec) {

From 86a66da541797d049fda580c287756b5490ba24d Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Fri, 19 Sep 2025 10:16:14 -0400
Subject: [PATCH 161/203] add some tryCatch in some endpoints

---
 inst/plumber/v1/endpoints.R | 42 +++++++++++++++++++++++--------------
 1 file changed, 26 insertions(+), 16 deletions(-)

diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R
index 575c9a08..00e5f112 100644
--- a/inst/plumber/v1/endpoints.R
+++ b/inst/plumber/v1/endpoints.R
@@ -248,15 +248,20 @@ function(req, res) {
 #* Default is FALSE
 
 function(req, res) {
-  # Process request
-  params         <- req$argsQuery
-  params$lkup    <- lkups$versions_paths[[params$version]]
-  res$serializer <- pipapi::assign_serializer(format = params$format)
-  params$format  <- NULL
-  params$version <- NULL
+  # Defensive error handling for pip endpoint
+  tryCatch({
+    params         <- req$argsQuery
+    params$lkup    <- lkups$versions_paths[[params$version]]
+    res$serializer <- pipapi::assign_serializer(format = params$format)
+    params$format  <- NULL
+    params$version <- NULL
 
-  out <- do.call(pipapi::pip, params)
-  out
+    out <- do.call(pipapi::pip, params)
+    out
+  }, error = function(e) {
+    res$status <- 500
+    list(error = "Error in /api/v1/pip", message = e$message)
+  })
 }
 
 ### pip-grp ----------
@@ -275,15 +280,20 @@ function(req, res) {
 #* @param additional_ind:[bool] Additional indicators based on standard PIP output.
 #* Default is FALSE
 function(req, res) {
-  # Process request
-  params         <- req$argsQuery
-  params$lkup    <- lkups$versions_paths[[params$version]]
-  res$serializer <- pipapi::assign_serializer(format = params$format)
-  params$format  <- NULL
-  params$version <- NULL
+  # Defensive error handling for pip-grp endpoint
+  tryCatch({
+    params         <- req$argsQuery
+    params$lkup    <- lkups$versions_paths[[params$version]]
+    res$serializer <- pipapi::assign_serializer(format = params$format)
+    params$format  <- NULL
+    params$version <- NULL
 
-  out <- do.call(pipapi::pip_agg, params)
-  out
+    out <- do.call(pipapi::pip_agg, params)
+    out
+  }, error = function(e) {
+    res$status <- 500
+    list(error = "Error in /api/v1/pip-grp", message = e$message)
+  })
 }
 
 ###  aux ------------------

From 71afe001eccbe418a5b24c726b03cae51322987e Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Fri, 19 Sep 2025 11:02:00 -0400
Subject: [PATCH 162/203] Add a tiny request-ID + timings filter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

    every request prints one access line; you also get a serialize line. If you later see “request started” (client side) but no access log, the hang is inside handler/filters. If access prints but no serialize, the hang is in serialization
---
 R/zzz.R                     |  2 +-
 inst/plumber/v1/endpoints.R | 42 +++++++++++++++++++++++++++++++++++++
 man/update_master_file.Rd   |  3 ++-
 3 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/R/zzz.R b/R/zzz.R
index 0f481c3d..094f9dda 100644
--- a/R/zzz.R
+++ b/R/zzz.R
@@ -41,7 +41,7 @@ pipapi_default_options <- list(
   #
   # cores_to_use <- max(available_cores, 1) |>
   #   min(8)
-  set_in_pipapienv("cores_to_use", cores_to_use)
+  # set_in_pipapienv("cores_to_use", cores_to_use)
 
 
   # pov lines to store
diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R
index 00e5f112..a8bafbb5 100644
--- a/inst/plumber/v1/endpoints.R
+++ b/inst/plumber/v1/endpoints.R
@@ -3,6 +3,48 @@
 #* pip.worldbank.org
 
 library(pipapi)
+# ---- tiny telemetry (ID + timings) --------------------------------------
+`%||%` <- function(a, b) if (is.null(a)) b else a
+.req_id <- function() paste0(as.integer((Sys.time() |> as.POSIXct()) * 1e3), "-", sample.int(1e9,1))
+.now <- function() proc.time()[["elapsed"]]
+
+#* Always-on request context (one access line per request)
+#* @filter ctx
+function(req, res) {
+  req$.id    <- .req_id()
+  req$.start <- .now()
+  req$.path  <- req$PATH_INFO %||% ""
+  req$.meth  <- req$REQUEST_METHOD %||% ""
+  res$setHeader("X-Request-ID", req$.id)
+
+  # mark serialization time
+  req$.ser0 <- NA_real_
+  on.exit({
+    total <- .now() - req$.start
+    cat(sprintf(
+      '{"type":"access","id":"%s","method":"%s","path":"%s","status":%s,"dur_s":%.6f}\n',
+      req$.id, req$.meth, req$.path, as.character(res$status %||% NA_integer_), total
+    ), file = stderr())
+  }, add = TRUE)
+
+  forward()
+}
+
+# plumber-wide preserialize/postserialize hooks (stay in this file)
+#* @plumber
+function(pr) {
+  pr |>
+    pr_hook("preserialize", function(req, res) req$.ser0 <- .now()) |>
+    pr_hook("postserialize", function(req, res) {
+      if (!is.na(req$.ser0)) {
+        ser <- .now() - req$.ser0
+        cat(sprintf(
+          '{"type":"serialize","id":"%s","path":"%s","dur_s":%.6f}\n',
+          req$.id, req$PATH_INFO, ser
+        ), file = stderr())
+      }
+    })
+}
 
 # API filters -------------------------------------------------------------
 ## Validate version parameter ----
diff --git a/man/update_master_file.Rd b/man/update_master_file.Rd
index 39b1d3b9..968d3de4 100644
--- a/man/update_master_file.Rd
+++ b/man/update_master_file.Rd
@@ -8,7 +8,8 @@ update_master_file(
   dat,
   cache_file_path,
   fill_gaps,
-  verbose = getOption("pipapi.verbose")
+  verbose = getOption("pipapi.verbose"),
+  decimal = 2
 )
 }
 \arguments{

From fd91f928218d87cbdc3df1f99c8e55cc53069fe0 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Fri, 19 Sep 2025 12:07:32 -0400
Subject: [PATCH 163/203] add Request timed out to pip and pip-grp endpoints

---
 inst/plumber/v1/endpoints.R | 60 ++++++++++++++++++++++++++++++++-----
 1 file changed, 52 insertions(+), 8 deletions(-)

diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R
index a8bafbb5..ae32d142 100644
--- a/inst/plumber/v1/endpoints.R
+++ b/inst/plumber/v1/endpoints.R
@@ -21,10 +21,17 @@ function(req, res) {
   req$.ser0 <- NA_real_
   on.exit({
     total <- .now() - req$.start
-    cat(sprintf(
-      '{"type":"access","id":"%s","method":"%s","path":"%s","status":%s,"dur_s":%.6f}\n',
-      req$.id, req$.meth, req$.path, as.character(res$status %||% NA_integer_), total
-    ), file = stderr())
+    cat(
+      sprintf(
+        '{"type":"access","id":"%s","method":"%s","path":"%s","status":%s,"dur_s":%.6f}\n',
+        req$.id,
+        req$.meth,
+        req$.path,
+        as.character(res$status %||% NA_integer_),
+        total
+      ),
+      file = stderr()
+    )
   }, add = TRUE)
 
   forward()
@@ -46,6 +53,17 @@ function(pr) {
     })
 }
 
+
+# ---- kill runaway requests ----------------------------------------------
+with_req_timeout <- function(expr,
+                             secs = as.numeric(Sys.getenv("PLUMBER_REQ_TIMEOUT","30"))) {
+  if (!is.finite(secs) || secs <= 0) return(force(expr))
+  R.utils::withTimeout(force(expr), timeout = secs, onTimeout = "silent")  # uses setTimeLimit()
+}
+
+
+
+
 # API filters -------------------------------------------------------------
 ## Validate version parameter ----
 
@@ -298,11 +316,24 @@ function(req, res) {
     params$format  <- NULL
     params$version <- NULL
 
-    out <- do.call(pipapi::pip, params)
+    out <- do.call(pipapi::pip, params) |>
+      with_req_timeout()
+    if (is.null(out)) {
+      res$status <- 503
+      return(list(
+        error       = "Request timed out",
+        request_id  = req$.id,   # added in the ctx filter
+        endpoint    = "/api/v1/pip"
+      ))
+    }
+
     out
   }, error = function(e) {
     res$status <- 500
-    list(error = "Error in /api/v1/pip", message = e$message)
+    list(error = "Error in /api/v1/pip",
+         message = e$message,
+         request_id = tryCatch(req$.id, error = function(.) NA)
+         )
   })
 }
 
@@ -330,11 +361,24 @@ function(req, res) {
     params$format  <- NULL
     params$version <- NULL
 
-    out <- do.call(pipapi::pip_agg, params)
+    out <- do.call(pipapi::pip_agg, params) |>
+      with_req_timeout()
+    if (is.null(out)) {
+      res$status <- 503
+      return(list(
+        error       = "Request timed out",
+        request_id  = req$.id,   # added in the ctx filter
+        endpoint    = "/api/v1/pip-grp"
+      ))
+    }
+
     out
   }, error = function(e) {
     res$status <- 500
-    list(error = "Error in /api/v1/pip-grp", message = e$message)
+    list(error = "Error in /api/v1/pip-grp",
+         message = e$message,
+         request_id = tryCatch(req$.id, error = function(.) NA)
+    )
   })
 }
 

From c495e0771bf5e0d5c7daaeac5e808cd22048748c Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Fri, 19 Sep 2025 12:12:05 -0400
Subject: [PATCH 164/203] implement timeout wrapper to more endpoint

---
 inst/plumber/v1/endpoints.R | 171 ++++++++++++++++++++++++++++--------
 1 file changed, 133 insertions(+), 38 deletions(-)

diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R
index ae32d142..8208a819 100644
--- a/inst/plumber/v1/endpoints.R
+++ b/inst/plumber/v1/endpoints.R
@@ -726,13 +726,28 @@ function(req) {
 #* @param ppp_version:[chr] ppp year to be used
 #* @param version:[chr] Data version. Defaults to most recent version. See api/v1/versions
 #* @serializer json
-function(req) {
-  params <- req$argsQuery
-  params$lkup <- lkups$versions_paths[[req$argsQuery$version]]
-  params$version <- NULL
-
-  do.call(pipapi::ui_hp_stacked, params)
-
+function(req, res) {
+  tryCatch({
+    params <- req$argsQuery
+    params$lkup <- lkups$versions_paths[[req$argsQuery$version]]
+    params$version <- NULL
+    out <- do.call(pipapi::ui_hp_stacked, params) |> 
+    with_req_timeout()
+    if (is.null(out)) {
+      res$status <- 503
+      return(list(
+        error = "Request timed out",
+        request_id = req$.id,
+        endpoint = "/api/v1/hp-stacked"
+      ))
+    }
+    out
+  }, error = function(e) {
+    res$status <- 500
+    list(error = "Error in /api/v1/hp-stacked",
+         message = e$message,
+         request_id = tryCatch(req$.id, error = function(.) NA))
+  })
 }
 
 ### hp-countries -------------
@@ -743,11 +758,28 @@ function(req) {
 #* @param ppp_version:[chr] ppp year to be used
 #* @param version:[chr] Data version. Defaults to most recent version. See api/v1/versions
 #* @serializer json
-function(req) {
-  params <- req$argsQuery
-  params$lkup <- lkups$versions_paths[[req$argsQuery$version]]
-  params$version <- NULL
-  do.call(pipapi::ui_hp_countries, params)
+function(req, res) {
+  tryCatch({
+    params <- req$argsQuery
+    params$lkup <- lkups$versions_paths[[req$argsQuery$version]]
+    params$version <- NULL
+    out <- do.call(pipapi::ui_hp_countries, params) |> 
+    with_req_timeout()
+    if (is.null(out)) {
+      res$status <- 503
+      return(list(
+        error = "Request timed out",
+        request_id = req$.id,
+        endpoint = "/api/v1/hp-countries"
+      ))
+    }
+    out
+  }, error = function(e) {
+    res$status <- 500
+    list(error = "Error in /api/v1/hp-countries",
+         message = e$message,
+         request_id = tryCatch(req$.id, error = function(.) NA))
+  })
 }
 
 
@@ -767,14 +799,29 @@ function(req) {
 #* @param ppp_version:[chr] ppp year to be used
 #* @param version:[chr] Data version. Defaults to most recent version. See api/v1/versions
 #* @serializer json list(na = "null")
-function(req) {
-  params <- req$argsQuery
-  params$lkup <- lkups$versions_paths[[req$argsQuery$version]]
-  params$version <- NULL
-  params$censor  <- TRUE
-
-  out <- do.call(pipapi::ui_pc_charts, params)
-  return(out)
+function(req, res) {
+  tryCatch({
+    params <- req$argsQuery
+    params$lkup <- lkups$versions_paths[[req$argsQuery$version]]
+    params$version <- NULL
+    params$censor  <- TRUE
+    out <- do.call(pipapi::ui_pc_charts, params) |>
+     with_req_timeout()
+    if (is.null(out)) {
+      res$status <- 503
+      return(list(
+        error = "Request timed out",
+        request_id = req$.id,
+        endpoint = "/api/v1/pc-charts"
+      ))
+    }
+    out
+  }, error = function(e) {
+    res$status <- 500
+    list(error = "Error in /api/v1/pc-charts",
+         message = e$message,
+         request_id = tryCatch(req$.id, error = function(.) NA))
+  })
 }
 
 ### pc-download -----------
@@ -812,13 +859,28 @@ function(req) {
 #* @param ppp_version:[chr] ppp year to be used
 #* @param version:[chr] Data version. Defaults to most recent version. See api/v1/versions
 #* @serializer json
-function(req) {
-  params <- req$argsQuery
-  params$lkup <- lkups$versions_paths[[req$argsQuery$version]]
-  params$version <- NULL
-
-  do.call(pipapi::ui_pc_regional, params)
-
+function(req, res) {
+  tryCatch({
+    params <- req$argsQuery
+    params$lkup <- lkups$versions_paths[[req$argsQuery$version]]
+    params$version <- NULL
+    out <- do.call(pipapi::ui_pc_regional, params) |> 
+    with_req_timeout()
+    if (is.null(out)) {
+      res$status <- 503
+      return(list(
+        error = "Request timed out",
+        request_id = req$.id,
+        endpoint = "/api/v1/pc-regional-aggregates"
+      ))
+    }
+    out
+  }, error = function(e) {
+    res$status <- 500
+    list(error = "Error in /api/v1/pc-regional-aggregates",
+         message = e$message,
+         request_id = tryCatch(req$.id, error = function(.) NA))
+  })
 }
 
 
@@ -833,11 +895,28 @@ function(req) {
 #* @param ppp_version:[chr] ppp year to be used
 #* @param version:[chr] Data version. Defaults to most recent version. See api/v1/versions
 #* @serializer json list(na="null")
-function(req) {
-  params <- req$argsQuery
-  params$lkup <- lkups$versions_paths[[req$argsQuery$version]]
-  params$version <- NULL
-  do.call(pipapi::ui_cp_key_indicators, params)
+function(req, res) {
+  tryCatch({
+    params <- req$argsQuery
+    params$lkup <- lkups$versions_paths[[req$argsQuery$version]]
+    params$version <- NULL
+    out <- do.call(pipapi::ui_cp_key_indicators, params) |> 
+    with_req_timeout()
+    if (is.null(out)) {
+      res$status <- 503
+      return(list(
+        error = "Request timed out",
+        request_id = req$.id,
+        endpoint = "/api/v1/cp-key-indicators"
+      ))
+    }
+    out
+  }, error = function(e) {
+    res$status <- 500
+    list(error = "Error in /api/v1/cp-key-indicators",
+         message = e$message,
+         request_id = tryCatch(req$.id, error = function(.) NA))
+  })
 }
 
 
@@ -850,12 +929,28 @@ function(req) {
 #* @param ppp_version:[chr] ppp year to be used
 #* @param version:[chr] Data version. Defaults to most recent version. See api/v1/versions
 #* @serializer json
-function(req) {
-  params <- req$argsQuery
-  params$lkup <- lkups$versions_paths[[req$argsQuery$version]]
-
-  params$version <- NULL
-    do.call(pipapi::ui_cp_charts, params)
+function(req, res) {
+  tryCatch({
+    params <- req$argsQuery
+    params$lkup <- lkups$versions_paths[[req$argsQuery$version]]
+    params$version <- NULL
+    out <- do.call(pipapi::ui_cp_charts, params) |> 
+    with_req_timeout()
+    if (is.null(out)) {
+      res$status <- 503
+      return(list(
+        error = "Request timed out",
+        request_id = req$.id,
+        endpoint = "/api/v1/cp-charts"
+      ))
+    }
+    out
+  }, error = function(e) {
+    res$status <- 500
+    list(error = "Error in /api/v1/cp-charts",
+         message = e$message,
+         request_id = tryCatch(req$.id, error = function(.) NA))
+  })
 }
 
 ### cp-download -----------

From 6fa1450bf5a6621deb7a0ed0fe37a1ad1707866f Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Fri, 19 Sep 2025 12:46:16 -0400
Subject: [PATCH 165/203] revamp plumber

---
 inst/plumber/v1/endpoints.R | 62 ++++++++++++++++++++++------
 inst/plumber/v1/plumber.R   | 80 ++++++++++++++++++-------------------
 2 files changed, 89 insertions(+), 53 deletions(-)

diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R
index 8208a819..a780c921 100644
--- a/inst/plumber/v1/endpoints.R
+++ b/inst/plumber/v1/endpoints.R
@@ -4,9 +4,39 @@
 
 library(pipapi)
 # ---- tiny telemetry (ID + timings) --------------------------------------
-`%||%` <- function(a, b) if (is.null(a)) b else a
-.req_id <- function() paste0(as.integer((Sys.time() |> as.POSIXct()) * 1e3), "-", sample.int(1e9,1))
-.now <- function() proc.time()[["elapsed"]]
+`%||%` <- function(a, b) {
+  if (is.null(a))
+    b
+  else
+    a
+}
+
+# Generate a unique request ID
+.req_id <- function() {
+  # current time in milliseconds as a character string
+  ts_ms <- format(as.numeric(Sys.time()) * 1000, scientific = FALSE)
+
+  # add random component
+  rnd <- sample.int(1e9, 1)
+
+  # combine
+  id <- paste0(ts_ms, "-", rnd)
+
+  return(id)
+}
+
+# Capture current elapsed CPU time (since R started)
+.now <- function() {
+  pt <- proc.time()
+
+  # Extract elapsed time (in seconds)
+  elapsed <- pt[["elapsed"]]
+
+  return(elapsed)
+}
+
+
+
 
 #* Always-on request context (one access line per request)
 #* @filter ctx
@@ -55,10 +85,18 @@ function(pr) {
 
 
 # ---- kill runaway requests ----------------------------------------------
-with_req_timeout <- function(expr,
-                             secs = as.numeric(Sys.getenv("PLUMBER_REQ_TIMEOUT","30"))) {
-  if (!is.finite(secs) || secs <= 0) return(force(expr))
-  R.utils::withTimeout(force(expr), timeout = secs, onTimeout = "silent")  # uses setTimeLimit()
+with_req_timeout <-
+  function(expr,
+           secs = Sys.getenv("PLUMBER_REQ_TIMEOUT", "150") |>
+             as.numeric()
+           ){
+
+  if (!is.finite(secs) || secs <= 0)
+    return(force(expr))
+
+  R.utils::withTimeout(force(expr),
+                       timeout = secs,
+                       onTimeout = "silent")  # uses setTimeLimit()
 }
 
 
@@ -731,7 +769,7 @@ function(req, res) {
     params <- req$argsQuery
     params$lkup <- lkups$versions_paths[[req$argsQuery$version]]
     params$version <- NULL
-    out <- do.call(pipapi::ui_hp_stacked, params) |> 
+    out <- do.call(pipapi::ui_hp_stacked, params) |>
     with_req_timeout()
     if (is.null(out)) {
       res$status <- 503
@@ -763,7 +801,7 @@ function(req, res) {
     params <- req$argsQuery
     params$lkup <- lkups$versions_paths[[req$argsQuery$version]]
     params$version <- NULL
-    out <- do.call(pipapi::ui_hp_countries, params) |> 
+    out <- do.call(pipapi::ui_hp_countries, params) |>
     with_req_timeout()
     if (is.null(out)) {
       res$status <- 503
@@ -864,7 +902,7 @@ function(req, res) {
     params <- req$argsQuery
     params$lkup <- lkups$versions_paths[[req$argsQuery$version]]
     params$version <- NULL
-    out <- do.call(pipapi::ui_pc_regional, params) |> 
+    out <- do.call(pipapi::ui_pc_regional, params) |>
     with_req_timeout()
     if (is.null(out)) {
       res$status <- 503
@@ -900,7 +938,7 @@ function(req, res) {
     params <- req$argsQuery
     params$lkup <- lkups$versions_paths[[req$argsQuery$version]]
     params$version <- NULL
-    out <- do.call(pipapi::ui_cp_key_indicators, params) |> 
+    out <- do.call(pipapi::ui_cp_key_indicators, params) |>
     with_req_timeout()
     if (is.null(out)) {
       res$status <- 503
@@ -934,7 +972,7 @@ function(req, res) {
     params <- req$argsQuery
     params$lkup <- lkups$versions_paths[[req$argsQuery$version]]
     params$version <- NULL
-    out <- do.call(pipapi::ui_cp_charts, params) |> 
+    out <- do.call(pipapi::ui_cp_charts, params) |>
     with_req_timeout()
     if (is.null(out)) {
       res$status <- 503
diff --git a/inst/plumber/v1/plumber.R b/inst/plumber/v1/plumber.R
index 32056e79..118e9617 100644
--- a/inst/plumber/v1/plumber.R
+++ b/inst/plumber/v1/plumber.R
@@ -1,54 +1,52 @@
+# ---- process-level thread caps (Step 3) -----------------------------
+# Avoid oversubscribing on multi-core servers (common cause of hangs under load)
+Sys.setenv(
+  OPENBLAS_NUM_THREADS = "1",
+  MKL_NUM_THREADS      = "1",
+  OMP_NUM_THREADS      = "1"
+)
+
+ncores <- parallel::detectCores(logical = FALSE)
+
+data.table::setDTthreads(max(1L, ncores))
+collapse::set_collapse(nthreads	= max(1L, ncores))
+fst::threads_fst(max(1L, ncores))
+
+# ---- plumber setup ---------------------------------------------------
 library(plumber)
 
 endpoints_path <- system.file("plumber/v1/endpoints.R", package = "pipapi")
-api_spec_path <-  system.file("plumber/v1/openapi.yaml", package = "pipapi")
-# convert_empty <- pipapi:::convert_empty
-
-plumber::pr(endpoints_path) |>
-  # pre-route log
-  plumber::pr_hook("preroute", function() {
-    # log_separator()
-    # tictoc::tic("route") # Start timer for log info
-  }) |>
-  # post-route log
-  plumber::pr_hook("postroute", function(req, res) {
-    # end_route <- tictoc::toc(quiet = TRUE)
-    # log_info('route: {convert_empty(req$REMOTE_ADDR)} {convert_empty(req$REQUEST_METHOD)} {convert_empty(req$PATH_INFO)} {convert_empty(req$QUERY_STRING)}  {convert_empty(res$status)} {round(end_route$toc - end_route$tic, digits = getOption("digits", 6))}')
-  }) |>
-  # pre-serialization log
-  plumber::pr_hook("preserialize", function() {
-    # tictoc::tic("serialize")
-  }) |>
-  # post-serialization log
-  plumber::pr_hook("postserialize", function(req) {
-    # end_serial <- tictoc::toc(quiet = TRUE)
-    # log_info('serialize: {convert_empty(req$PATH_INFO)} {round(end_serial$toc - end_serial$tic, digits = getOption("digits", 6))}')
-    # log_separator()
-  }) |>
-  plumber::pr_hook("exit", function() {
-    # log_info('Bye bye: {proc.time()[["elapsed"]]}')
-  }) |>
+api_spec_path <- system.file("plumber/v1/openapi.yaml", package = "pipapi")
+
+pr <- plumber::pr(endpoints_path) |>
+  # (Step 4) simplified error handler
   plumber::pr_set_error(function(req, res, err) {
-    # Log error with request info
+    rid <- tryCatch(req$.id, error = function(e) NA)
     method <- req$REQUEST_METHOD
-    path <- req$PATH_INFO
-    msg <- sprintf("ERROR at %s %s: %s\n", method, path, err$message)
-    cat(msg)
-    # Always return a JSON error response
+    path   <- req$PATH_INFO
+
+    # structured log to stderr (good for docker logs)
+    cat(sprintf(
+      '{"level":"error","id":"%s","method":"%s","path":"%s","msg":%s}\n',
+      as.character(rid), method, path,
+      jsonlite::toJSON(err$message, auto_unbox = TRUE)
+    ), file = stderr())
+
     res$status <- 500
     res$body <- jsonlite::toJSON(list(
-      error = "Internal Server Error",
-      message = err$message,
-      path = path,
-      method = method
+      error      = "Internal Server Error",
+      message    = err$message,
+      path       = path,
+      method     = method,
+      request_id = rid
     ), auto_unbox = TRUE)
     res
   }) |>
-  # Set API spec
+  # Inject version into OpenAPI spec
   plumber::pr_set_api_spec(api = function(spec) {
-    spec$info$version <- utils::packageVersion("pipapi") |>
-      as.character()
+    spec$info$version <- as.character(utils::packageVersion("pipapi"))
     spec
   }) |>
-  plumber::pr_set_api_spec(
-    yaml::read_yaml(api_spec_path))
+  plumber::pr_set_api_spec(yaml::read_yaml(api_spec_path))
+
+pr

From 912a0a707b0c4be1ba1c88e39781ecf3d23b90cc Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Fri, 19 Sep 2025 14:10:43 -0400
Subject: [PATCH 166/203] new plumber setup.

---
 inst/plumber/v1/endpoints.R |  91 ++++++++++++------------
 inst/plumber/v1/plumber.R   | 135 ++++++++++++++++++++++++++++--------
 2 files changed, 153 insertions(+), 73 deletions(-)

diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R
index a780c921..dde6835f 100644
--- a/inst/plumber/v1/endpoints.R
+++ b/inst/plumber/v1/endpoints.R
@@ -4,86 +4,91 @@
 
 library(pipapi)
 # ---- tiny telemetry (ID + timings) --------------------------------------
-`%||%` <- function(a, b) {
-  if (is.null(a))
-    b
-  else
-    a
+# Monotonic wall time for durations (in seconds)
+.now <- function() {
+  pt <- proc.time()
+  elapsed <- pt[["elapsed"]]
+  return(elapsed)
 }
 
-# Generate a unique request ID
+# Generate a request id AND return a decoded structure
+# - id_raw: "milliseconds-since-epoch-random"
+# - timestamp: POSIXct in UTC
+# - random: integer
 .req_id <- function() {
-  # current time in milliseconds as a character string
-  ts_ms <- format(as.numeric(Sys.time()) * 1000, scientific = FALSE)
+  # 1) milliseconds since epoch (as numeric -> character; avoid 32-bit overflow)
+  ts_ms_num <- as.numeric(Sys.time()) * 1000
+  ts_ms_chr <- format(ts_ms_num, scientific = FALSE, trim = TRUE)
 
-  # add random component
+  # 2) random component
   rnd <- sample.int(1e9, 1)
 
-  # combine
-  id <- paste0(ts_ms, "-", rnd)
-
-  return(id)
-}
-
-# Capture current elapsed CPU time (since R started)
-.now <- function() {
-  pt <- proc.time()
+  # 3) build id string
+  id_raw <- paste0(ts_ms_chr, "-", rnd)
 
-  # Extract elapsed time (in seconds)
-  elapsed <- pt[["elapsed"]]
+  # 4) decoded timestamp (POSIXct, UTC)
+  ts_posix <- as.POSIXct(as.numeric(ts_ms_chr) / 1000,
+                         origin = "1970-01-01", tz = "UTC")
 
-  return(elapsed)
+  out <- list(
+    id_raw    = id_raw,
+    timestamp = ts_posix,
+    random    = rnd
+  )
+  return(out)
 }
 
+`%||%` <- function(a, b) if (!is.null(a)) a else b
 
-
-
-#* Always-on request context (one access line per request)
+# ---- Always-on request context (one log line per request) ----
 #* @filter ctx
 function(req, res) {
-  req$.id    <- .req_id()
+  rid <- .req_id()  # <- returns list(id_raw, timestamp, random)
+
+  # put a STRING into places that need strings
+  req$.id      <- rid$id_raw            # string token for logs & error bodies
+  res$setHeader("X-Request-ID", req$.id)
+
+  # keep the decoded pieces if you want (optional)
+  req$.id_time <- rid$timestamp         # POSIXct (UTC)
+  req$.id_rand <- rid$random            # integer
+
   req$.start <- .now()
   req$.path  <- req$PATH_INFO %||% ""
   req$.meth  <- req$REQUEST_METHOD %||% ""
-  res$setHeader("X-Request-ID", req$.id)
 
-  # mark serialization time
-  req$.ser0 <- NA_real_
   on.exit({
     total <- .now() - req$.start
-    cat(
-      sprintf(
-        '{"type":"access","id":"%s","method":"%s","path":"%s","status":%s,"dur_s":%.6f}\n',
-        req$.id,
-        req$.meth,
-        req$.path,
-        as.character(res$status %||% NA_integer_),
-        total
-      ),
-      file = stderr()
-    )
+    cat(sprintf(
+      '{"type":"access","id":"%s","method":"%s","path":"%s","status":%s,"dur_s":%.6f}\n',
+      req$.id, req$.meth, req$.path, as.character(res$status %||% NA_integer_), total
+    ), file = stderr())
   }, add = TRUE)
 
   forward()
 }
 
-# plumber-wide preserialize/postserialize hooks (stay in this file)
+
+# ---- Serialization timing hooks (stay with the filter for coherence) ----
 #* @plumber
 function(pr) {
   pr |>
-    pr_hook("preserialize", function(req, res) req$.ser0 <- .now()) |>
+    pr_hook("preserialize", function(req, res) {
+      req$.ser0 <- .now()
+    }) |>
     pr_hook("postserialize", function(req, res) {
-      if (!is.na(req$.ser0)) {
+      if (!is.null(req$.ser0) && !is.na(req$.ser0)) {
         ser <- .now() - req$.ser0
         cat(sprintf(
           '{"type":"serialize","id":"%s","path":"%s","dur_s":%.6f}\n',
-          req$.id, req$PATH_INFO, ser
+          req$.id %||% "", req$.path %||% "", ser
         ), file = stderr())
       }
     })
 }
 
 
+
 # ---- kill runaway requests ----------------------------------------------
 with_req_timeout <-
   function(expr,
diff --git a/inst/plumber/v1/plumber.R b/inst/plumber/v1/plumber.R
index 118e9617..67a9ebf2 100644
--- a/inst/plumber/v1/plumber.R
+++ b/inst/plumber/v1/plumber.R
@@ -12,41 +12,116 @@ data.table::setDTthreads(max(1L, ncores))
 collapse::set_collapse(nthreads	= max(1L, ncores))
 fst::threads_fst(max(1L, ncores))
 
-# ---- plumber setup ---------------------------------------------------
+# ---- build router --------------------------------------------------------
 library(plumber)
 
 endpoints_path <- system.file("plumber/v1/endpoints.R", package = "pipapi")
-api_spec_path <- system.file("plumber/v1/openapi.yaml", package = "pipapi")
+api_spec_path  <- system.file("plumber/v1/openapi.yaml", package = "pipapi")
 
 pr <- plumber::pr(endpoints_path) |>
-  # (Step 4) simplified error handler
-  plumber::pr_set_error(function(req, res, err) {
-    rid <- tryCatch(req$.id, error = function(e) NA)
-    method <- req$REQUEST_METHOD
-    path   <- req$PATH_INFO
-
-    # structured log to stderr (good for docker logs)
-    cat(sprintf(
-      '{"level":"error","id":"%s","method":"%s","path":"%s","msg":%s}\n',
-      as.character(rid), method, path,
-      jsonlite::toJSON(err$message, auto_unbox = TRUE)
-    ), file = stderr())
-
-    res$status <- 500
-    res$body <- jsonlite::toJSON(list(
-      error      = "Internal Server Error",
-      message    = err$message,
-      path       = path,
-      method     = method,
-      request_id = rid
-    ), auto_unbox = TRUE)
-    res
-  }) |>
-  # Inject version into OpenAPI spec
-  plumber::pr_set_api_spec(api = function(spec) {
-    spec$info$version <- as.character(utils::packageVersion("pipapi"))
-    spec
-  }) |>
+
+  # ---- Pre-route: attach request id + start time ----
+plumber::pr_hook("preroute", function(req, res) {
+  rid <- .req_id()
+
+  # store a STRING in req$.id and in header
+  req$.id       <- rid$id_raw
+  req$.id_time  <- rid$timestamp   # optional metadata
+  req$.id_rand  <- rid$random      # optional metadata
+  res$setHeader("X-Request-ID", req$.id)
+
+  # basic context
+  req$.start <- .now()
+  req$.path  <- req$PATH_INFO %||% ""
+  req$.meth  <- req$REQUEST_METHOD %||% ""
+
+  # mark serialization time start
+  req$.ser0 <- NA_real_
+
+  forward()
+}) |>
+
+  # ---- Post-route: log route duration (handler time) ----
+plumber::pr_hook("postroute", function(req, res) {
+  if (!is.null(req$.start)) {
+    dur <- .now() - req$.start
+    cat(
+      sprintf(
+        '{"type":"route","id":"%s","method":"%s","path":"%s","status":%s,"dur_s":%.6f}\n',
+        req$.id %||% "",
+        req$.meth %||% "",
+        req$.path %||% "",
+        as.character(res$status %||% NA_integer_),
+        dur
+      ),
+      file = stderr()
+    )
+  }
+}) |>
+
+  # ---- Pre-serialization: mark when we start serializing ----
+plumber::pr_hook("preserialize", function(req, res) {
+  req$.ser0 <- .now()
+}) |>
+
+  # ---- Post-serialization: log serialization duration ----
+plumber::pr_hook("postserialize", function(req, res) {
+  if (!is.null(req$.ser0) && !is.na(req$.ser0)) {
+    ser_dur <- .now() - req$.ser0
+    cat(
+      sprintf(
+        '{"type":"serialize","id":"%s","path":"%s","dur_s":%.6f}\n',
+        req$.id %||% "",
+        req$.path %||% "",
+        ser_dur
+      ),
+      file = stderr()
+    )
+  }
+}) |>
+
+  # ---- Exit hook: when process shuts down ----
+plumber::pr_hook("exit", function() {
+  cat(
+    sprintf(
+      '{"type":"exit","uptime_s":%.2f}\n',
+      proc.time()[["elapsed"]]
+    ),
+    file = stderr()
+  )
+}) |>
+
+  # ---- Global error handler ----
+plumber::pr_set_error(function(req, res, err) {
+  method <- req$REQUEST_METHOD %||% ""
+  path   <- req$PATH_INFO %||% ""
+  rid    <- req$.id %||% "NA"
+
+  cat(
+    sprintf(
+      '{"type":"error","id":"%s","method":"%s","path":"%s","msg":%s}\n',
+      rid, method, path, jsonlite::toJSON(err$message, auto_unbox = TRUE)
+    ),
+    file = stderr()
+  )
+
+  res$status <- 500
+  res$body <- jsonlite::toJSON(list(
+    error      = "Internal Server Error",
+    message    = err$message,
+    path       = path,
+    method     = method,
+    request_id = rid
+  ), auto_unbox = TRUE)
+
+  res
+}) |>
+
+  # ---- API Spec (with dynamic version injection) ----
+plumber::pr_set_api_spec(api = function(spec) {
+  spec$info$version <- as.character(utils::packageVersion("pipapi"))
+  spec
+}) |>
   plumber::pr_set_api_spec(yaml::read_yaml(api_spec_path))
 
 pr

From d9a10a66c1d873dcae659a879830da4c1eb6aa47 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Fri, 19 Sep 2025 14:13:01 -0400
Subject: [PATCH 167/203] remove the preroute as it is in endpoints.R

---
 inst/plumber/v1/plumber.R | 21 ---------------------
 1 file changed, 21 deletions(-)

diff --git a/inst/plumber/v1/plumber.R b/inst/plumber/v1/plumber.R
index 67a9ebf2..b1fea35f 100644
--- a/inst/plumber/v1/plumber.R
+++ b/inst/plumber/v1/plumber.R
@@ -20,27 +20,6 @@ api_spec_path  <- system.file("plumber/v1/openapi.yaml", package = "pipapi")
 
 pr <- plumber::pr(endpoints_path) |>
 
-  # ---- Pre-route: attach request id + start time ----
-plumber::pr_hook("preroute", function(req, res) {
-  rid <- .req_id()
-
-  # store a STRING in req$.id and in header
-  req$.id       <- rid$id_raw
-  req$.id_time  <- rid$timestamp   # optional metadata
-  req$.id_rand  <- rid$random      # optional metadata
-  res$setHeader("X-Request-ID", req$.id)
-
-  # basic context
-  req$.start <- .now()
-  req$.path  <- req$PATH_INFO %||% ""
-  req$.meth  <- req$REQUEST_METHOD %||% ""
-
-  # mark serialization time start
-  req$.ser0 <- NA_real_
-
-  forward()
-}) |>
-
   # ---- Post-route: log route duration (handler time) ----
 plumber::pr_hook("postroute", function(req, res) {
   if (!is.null(req$.start)) {

From 7efee79b5adfd42df09854740c2bd625e57e0ae3 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Fri, 19 Sep 2025 14:13:57 -0400
Subject: [PATCH 168/203] Increment version number to 1.3.24.9008

---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 9d167253..2fd73a17 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: pipapi
 Title: API for the Poverty and Inequality Platform
-Version: 1.3.24.9007
+Version: 1.3.24.9008
 Authors@R: 
     c(person(given = "Tony",
             family = "Fujs",

From baa6be879efae3d7a392a58870f47829867d0297 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Fri, 19 Sep 2025 15:42:02 -0400
Subject: [PATCH 169/203] fix bugs

---
 inst/plumber/v1/endpoints.R | 67 ++++++++++++++-----------------------
 inst/plumber/v1/plumber.R   | 44 +++++++-----------------
 2 files changed, 38 insertions(+), 73 deletions(-)

diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R
index dde6835f..8135a822 100644
--- a/inst/plumber/v1/endpoints.R
+++ b/inst/plumber/v1/endpoints.R
@@ -3,12 +3,12 @@
 #* pip.worldbank.org
 
 library(pipapi)
+
 # ---- tiny telemetry (ID + timings) --------------------------------------
+
 # Monotonic wall time for durations (in seconds)
 .now <- function() {
-  pt <- proc.time()
-  elapsed <- pt[["elapsed"]]
-  return(elapsed)
+  proc.time()[["elapsed"]]
 }
 
 # Generate a request id AND return a decoded structure
@@ -16,26 +16,17 @@ library(pipapi)
 # - timestamp: POSIXct in UTC
 # - random: integer
 .req_id <- function() {
-  # 1) milliseconds since epoch (as numeric -> character; avoid 32-bit overflow)
   ts_ms_num <- as.numeric(Sys.time()) * 1000
   ts_ms_chr <- format(ts_ms_num, scientific = FALSE, trim = TRUE)
+  rnd       <- sample.int(1e9, 1)
+  id_raw    <- paste0(ts_ms_chr, "-", rnd)
 
-  # 2) random component
-  rnd <- sample.int(1e9, 1)
-
-  # 3) build id string
-  id_raw <- paste0(ts_ms_chr, "-", rnd)
-
-  # 4) decoded timestamp (POSIXct, UTC)
-  ts_posix <- as.POSIXct(as.numeric(ts_ms_chr) / 1000,
-                         origin = "1970-01-01", tz = "UTC")
-
-  out <- list(
+  list(
     id_raw    = id_raw,
-    timestamp = ts_posix,
+    timestamp = as.POSIXct(as.numeric(ts_ms_chr) / 1000,
+                           origin = "1970-01-01", tz = "UTC"),
     random    = rnd
   )
-  return(out)
 }
 
 `%||%` <- function(a, b) if (!is.null(a)) a else b
@@ -43,15 +34,14 @@ library(pipapi)
 # ---- Always-on request context (one log line per request) ----
 #* @filter ctx
 function(req, res) {
-  rid <- .req_id()  # <- returns list(id_raw, timestamp, random)
+  rid <- .req_id()  # list(id_raw, timestamp, random)
 
-  # put a STRING into places that need strings
-  req$.id      <- rid$id_raw            # string token for logs & error bodies
+  req$.id      <- rid$id_raw
   res$setHeader("X-Request-ID", req$.id)
 
-  # keep the decoded pieces if you want (optional)
-  req$.id_time <- rid$timestamp         # POSIXct (UTC)
-  req$.id_rand <- rid$random            # integer
+  # keep decoded pieces if helpful
+  req$.id_time <- rid$timestamp
+  req$.id_rand <- rid$random
 
   req$.start <- .now()
   req$.path  <- req$PATH_INFO %||% ""
@@ -68,8 +58,7 @@ function(req, res) {
   forward()
 }
 
-
-# ---- Serialization timing hooks (stay with the filter for coherence) ----
+# ---- Serialization timing hooks (kept here to avoid duplication) ----
 #* @plumber
 function(pr) {
   pr |>
@@ -87,28 +76,24 @@ function(pr) {
     })
 }
 
-
-
-# ---- kill runaway requests ----------------------------------------------
+# ---- bounded execution helper -------------------------------------------
 with_req_timeout <-
   function(expr,
-           secs = Sys.getenv("PLUMBER_REQ_TIMEOUT", "150") |>
-             as.numeric()
-           ){
-
-  if (!is.finite(secs) || secs <= 0)
-    return(force(expr))
-
-  R.utils::withTimeout(force(expr),
-                       timeout = secs,
-                       onTimeout = "silent")  # uses setTimeLimit()
-}
-
+           secs = as.numeric(Sys.getenv("PLUMBER_REQ_TIMEOUT", "150"))) {
 
+    if (!is.finite(secs) || secs <= 0) return(force(expr))
 
+    R.utils::withTimeout(
+      expr     = force(expr),
+      timeout  = secs,
+      onTimeout = "silent"   # uses setTimeLimit()
+    )
+  }
 
+# ========================================================================
 # API filters -------------------------------------------------------------
-## Validate version parameter ----
+# ========================================================================
+
 
 #* Ensure that version parameter is correct
 #* @filter validate_version
diff --git a/inst/plumber/v1/plumber.R b/inst/plumber/v1/plumber.R
index b1fea35f..0b36deab 100644
--- a/inst/plumber/v1/plumber.R
+++ b/inst/plumber/v1/plumber.R
@@ -1,5 +1,5 @@
-# ---- process-level thread caps (Step 3) -----------------------------
-# Avoid oversubscribing on multi-core servers (common cause of hangs under load)
+# ---- process-level thread caps ------------------------------------------
+# (Avoid oversubscription; helps with stability under load)
 Sys.setenv(
   OPENBLAS_NUM_THREADS = "1",
   MKL_NUM_THREADS      = "1",
@@ -7,11 +7,14 @@ Sys.setenv(
 )
 
 ncores <- parallel::detectCores(logical = FALSE)
-
 data.table::setDTthreads(max(1L, ncores))
-collapse::set_collapse(nthreads	= max(1L, ncores))
+collapse::set_collapse(nthreads = max(1L, ncores))
 fst::threads_fst(max(1L, ncores))
 
+# local fallbacks used only in this file (do not depend on endpoints.R)
+`%||%` <- function(a, b) if (!is.null(a)) a else b
+.now_p  <- function() proc.time()[["elapsed"]]
+
 # ---- build router --------------------------------------------------------
 library(plumber)
 
@@ -20,10 +23,10 @@ api_spec_path  <- system.file("plumber/v1/openapi.yaml", package = "pipapi")
 
 pr <- plumber::pr(endpoints_path) |>
 
-  # ---- Post-route: log route duration (handler time) ----
+  # ---- Post-route: log handler duration (separate from total access time) ----
 plumber::pr_hook("postroute", function(req, res) {
   if (!is.null(req$.start)) {
-    dur <- .now() - req$.start
+    dur <- .now_p() - req$.start
     cat(
       sprintf(
         '{"type":"route","id":"%s","method":"%s","path":"%s","status":%s,"dur_s":%.6f}\n',
@@ -38,27 +41,6 @@ plumber::pr_hook("postroute", function(req, res) {
   }
 }) |>
 
-  # ---- Pre-serialization: mark when we start serializing ----
-plumber::pr_hook("preserialize", function(req, res) {
-  req$.ser0 <- .now()
-}) |>
-
-  # ---- Post-serialization: log serialization duration ----
-plumber::pr_hook("postserialize", function(req, res) {
-  if (!is.null(req$.ser0) && !is.na(req$.ser0)) {
-    ser_dur <- .now() - req$.ser0
-    cat(
-      sprintf(
-        '{"type":"serialize","id":"%s","path":"%s","dur_s":%.6f}\n',
-        req$.id %||% "",
-        req$.path %||% "",
-        ser_dur
-      ),
-      file = stderr()
-    )
-  }
-}) |>
-
   # ---- Exit hook: when process shuts down ----
 plumber::pr_hook("exit", function() {
   cat(
@@ -70,7 +52,7 @@ plumber::pr_hook("exit", function() {
   )
 }) |>
 
-  # ---- Global error handler ----
+  # ---- Global error handler (must return a serializable object) ----
 plumber::pr_set_error(function(req, res, err) {
   method <- req$REQUEST_METHOD %||% ""
   path   <- req$PATH_INFO %||% ""
@@ -85,15 +67,13 @@ plumber::pr_set_error(function(req, res, err) {
   )
 
   res$status <- 500
-  res$body <- jsonlite::toJSON(list(
+  list(
     error      = "Internal Server Error",
     message    = err$message,
     path       = path,
     method     = method,
     request_id = rid
-  ), auto_unbox = TRUE)
-
-  res
+  )
 }) |>
 
   # ---- API Spec (with dynamic version injection) ----

From 0c16445e5a5a41bd3f9017c027c72ab83a72c087 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Fri, 19 Sep 2025 15:42:13 -0400
Subject: [PATCH 170/203] Increment version number to 1.3.24.9009

---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 2fd73a17..fd2143d2 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: pipapi
 Title: API for the Poverty and Inequality Platform
-Version: 1.3.24.9008
+Version: 1.3.24.9009
 Authors@R: 
     c(person(given = "Tony",
             family = "Fujs",

From 980f47f13675b9f10fe52dcfa839c8335b7315b3 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Mon, 22 Sep 2025 11:59:00 -0400
Subject: [PATCH 171/203] add R.utils package

---
 DESCRIPTION | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index fd2143d2..a8267a9a 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -71,7 +71,8 @@ Imports:
     duckdb,
     jsonlite,
     digest,
-    parallel
+    parallel,
+    R.utils
 Remotes:
     PIP-Technical-Team/wbpip@DEV
 Depends: 

From 011f4d3b52a7e3dfd123e37199ee3ada0c1ba825 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Mon, 22 Sep 2025 11:59:13 -0400
Subject: [PATCH 172/203] Increment version number to 1.3.24.9010

---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index a8267a9a..ef21f1b2 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: pipapi
 Title: API for the Poverty and Inequality Platform
-Version: 1.3.24.9009
+Version: 1.3.24.9010
 Authors@R: 
     c(person(given = "Tony",
             family = "Fujs",

From 28fca6c5c8eec5a539747de912aa0e96f95d67fc Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Mon, 22 Sep 2025 18:37:19 -0400
Subject: [PATCH 173/203] add function safe_endpoint

---
 NAMESPACE                   |  1 +
 R/create_lkups.R            | 28 ++++++++++--
 R/utils-plumber.R           | 91 +++++++++++++++++++++++++++++++++++++
 inst/plumber/v1/endpoints.R |  2 +-
 man/safe_endpoint.Rd        | 67 +++++++++++++++++++++++++++
 5 files changed, 184 insertions(+), 5 deletions(-)
 create mode 100644 man/safe_endpoint.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 96b78000..c3c4bef4 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -39,6 +39,7 @@ export(pip_old_lineups)
 export(pipgd_lorenz_curve)
 export(return_correct_version)
 export(return_if_exists)
+export(safe_endpoint)
 export(select_off_alt_agg)
 export(select_reporting_level)
 export(select_user_aggs)
diff --git a/R/create_lkups.R b/R/create_lkups.R
index 9d9c9aa7..32c82d53 100644
--- a/R/create_lkups.R
+++ b/R/create_lkups.R
@@ -298,14 +298,34 @@ create_lkups <- function(data_dir, versions) {
                               update_values = TRUE)
 
 
-      # merge population
-      popl <- pivot(pop,
+      # TEMP: fix ARG population --- START
+      pw <- pivot(pop,
             ids = c("country_code", "data_level"),
             names = list(variable = "reporting_year",
                          value = "reporting_pop"),
             how = "longer") |>
-        ftransform(reporting_year = as_integer_factor(reporting_year)) |>
-        frename(data_level = reporting_level)
+        pivot(how = "wider",
+              ids = c("country_code", "reporting_year"),
+              values = "reporting_pop",
+              names = "data_level") |>
+        setorder(country_code, reporting_year)
+
+      pw[country_code != "CHN", `:=`(
+        urban = national,
+        rural = national
+        )]
+
+      # TEMP: fix ARG population --- END
+
+      popl <- pivot(pw,
+                    ids = c("country_code", "reporting_year"),
+                    names = list(variable = "reporting_level",
+                                 value = "reporting_pop"),
+                    how = "longer") |>
+        ftransform(reporting_year = as_integer_factor(reporting_year),
+                   reporting_level = as_character_factor(reporting_level)) |>
+        setkey(NULL)
+
 
       refy_lkup <- joyn::joyn(refy_lkup, popl,
                               by         = c('country_code',
diff --git a/R/utils-plumber.R b/R/utils-plumber.R
index 215310b6..acfe0ecc 100644
--- a/R/utils-plumber.R
+++ b/R/utils-plumber.R
@@ -565,3 +565,94 @@ change_grouped_stats_to_csv <- function(out) {
   out$deciles <- NULL
   data.frame(out)
 }
+
+
+
+
+
+#' Wrap a Plumber endpoint with standardized error handling
+#'
+#' `safe_endpoint()` wraps an endpoint handler in a `tryCatch`, ensuring
+#' consistent error handling across the API. On success, the original
+#' handler's result is returned. On error, a structured JSON object is
+#' returned with useful metadata (status, message, request ID, endpoint),
+#' and optionally additional debug details.
+#'
+#' Debug mode can be enabled by either:
+#' \itemize{
+#'   \item Passing `debug = TRUE` explicitly, or
+#'   \item Setting the environment variable `PIPAPI_DEBUG=TRUE`.
+#' }
+#' When debug mode is active, the error payload also includes the error
+#' class, call, query parameters, and a truncated traceback.
+#'
+#' @param fun A function `(req, res)` containing the endpoint logic.
+#'   This is where you parse request arguments and call the relevant
+#'   internal functions.
+#' @param endpoint Character string giving the endpoint path
+#'   (e.g., `"/api/v1/pip"`). Used in error payloads so clients know
+#'   which endpoint failed.
+#' @param debug Logical; if `NULL` (default), inherits from the
+#'   environment variable `PIPAPI_DEBUG`. When `TRUE`, include extended
+#'   diagnostic details in the error response.
+#'
+#' @return A function `(req, res)` suitable for use in Plumber routes.
+#'   On error, sets `res$status <- 500` and returns a JSON object with:
+#'   \describe{
+#'     \item{error}{A short description ("Error in /api/v1/...")}
+#'     \item{message}{Either the actual error message (debug mode) or
+#'       `"Internal Server Error"`}
+#'     \item{request_id}{The Plumber request ID, if available}
+#'     \item{endpoint}{The endpoint string supplied}
+#'     \item{class}{Error class (debug mode only)}
+#'     \item{call}{The call that generated the error (debug mode only)}
+#'     \item{query}{The query parameters (debug mode only)}
+#'     \item{trace}{Traceback captured by `rlang::trace_back()` (debug mode only)}
+#'   }
+#'
+#' @examples
+#' \dontrun{
+#' # Example: wrap a handler for /api/v1/pip
+#' #* @get /api/v1/pip
+#' function(req, res) {
+#'   safe_endpoint(function(req, res) {
+#'     params <- req$argsQuery
+#'     params$lkup <- lkups$versions_paths[[req$argsQuery$version]]
+#'     params$version <- NULL
+#'     do.call(pipapi::ui_pip, params)
+#'   }, endpoint = "/api/v1/pip")(req, res)
+#' }
+#' }
+#'
+#' @export
+safe_endpoint <- function(fun, endpoint, debug = NULL) {
+  if (is.null(debug)) {
+    debug <- identical(Sys.getenv("PIPAPI_DEBUG"), "TRUE")
+  }
+
+  function(req, res) {
+    tryCatch(
+      {
+        fun(req, res)
+      },
+      error = function(e) {
+        res$status <- 500L
+        out <- list(
+          error      = paste("Error in", endpoint),
+          message    = if (debug) conditionMessage(e) else "Internal Server Error",
+          request_id = tryCatch(req$.id, error = \(.) NA),
+          endpoint   = endpoint
+        )
+        if (debug) {
+          out$class <- class(e)[[1]]
+          out$call  <- as.character(conditionCall(e))
+          out$query <- req$argsQuery
+          out$trace <- utils::capture.output(
+            rlang::trace_back(bottom = 10, simplify = "branch")
+          )
+        }
+        out
+      }
+    )
+  }
+}
diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R
index 8135a822..1b47256b 100644
--- a/inst/plumber/v1/endpoints.R
+++ b/inst/plumber/v1/endpoints.R
@@ -977,7 +977,7 @@ function(req, res) {
     res$status <- 500
     list(error = "Error in /api/v1/cp-charts",
          message = e$message,
-         request_id = tryCatch(req$.id, error = function(.) NA))
+         request_id = tryCatch(req$.id, error = \(.) NA))
   })
 }
 
diff --git a/man/safe_endpoint.Rd b/man/safe_endpoint.Rd
new file mode 100644
index 00000000..e754bdc4
--- /dev/null
+++ b/man/safe_endpoint.Rd
@@ -0,0 +1,67 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils-plumber.R
+\name{safe_endpoint}
+\alias{safe_endpoint}
+\title{Wrap a Plumber endpoint with standardized error handling}
+\usage{
+safe_endpoint(fun, endpoint, debug = NULL)
+}
+\arguments{
+\item{fun}{A function \verb{(req, res)} containing the endpoint logic.
+This is where you parse request arguments and call the relevant
+internal functions.}
+
+\item{endpoint}{Character string giving the endpoint path
+(e.g., \code{"/api/v1/pip"}). Used in error payloads so clients know
+which endpoint failed.}
+
+\item{debug}{Logical; if \code{NULL} (default), inherits from the
+environment variable \code{PIPAPI_DEBUG}. When \code{TRUE}, include extended
+diagnostic details in the error response.}
+}
+\value{
+A function \verb{(req, res)} suitable for use in Plumber routes.
+On error, sets \code{res$status <- 500} and returns a JSON object with:
+\describe{
+\item{error}{A short description ("Error in /api/v1/...")}
+\item{message}{Either the actual error message (debug mode) or
+\code{"Internal Server Error"}}
+\item{request_id}{The Plumber request ID, if available}
+\item{endpoint}{The endpoint string supplied}
+\item{class}{Error class (debug mode only)}
+\item{call}{The call that generated the error (debug mode only)}
+\item{query}{The query parameters (debug mode only)}
+\item{trace}{Traceback captured by \code{rlang::trace_back()} (debug mode only)}
+}
+}
+\description{
+\code{safe_endpoint()} wraps an endpoint handler in a \code{tryCatch}, ensuring
+consistent error handling across the API. On success, the original
+handler's result is returned. On error, a structured JSON object is
+returned with useful metadata (status, message, request ID, endpoint),
+and optionally additional debug details.
+}
+\details{
+Debug mode can be enabled by either:
+\itemize{
+\item Passing \code{debug = TRUE} explicitly, or
+\item Setting the environment variable \code{PIPAPI_DEBUG=TRUE}.
+}
+When debug mode is active, the error payload also includes the error
+class, call, query parameters, and a truncated traceback.
+}
+\examples{
+\dontrun{
+# Example: wrap a handler for /api/v1/pip
+#* @get /api/v1/pip
+function(req, res) {
+  safe_endpoint(function(req, res) {
+    params <- req$argsQuery
+    params$lkup <- lkups$versions_paths[[req$argsQuery$version]]
+    params$version <- NULL
+    do.call(pipapi::ui_pip, params)
+  }, endpoint = "/api/v1/pip")(req, res)
+}
+}
+
+}

From d9e9420e2ed21ca3cb49704cf86853435fa515d6 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Mon, 22 Sep 2025 21:10:50 -0400
Subject: [PATCH 174/203] impore with_req_timeout

---
 NAMESPACE                   |  1 +
 R/utils-plumber.R           | 40 +++++++++++++++++++++++++++++++++++++
 inst/plumber/v1/endpoints.R | 14 +------------
 3 files changed, 42 insertions(+), 13 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index c3c4bef4..af6ba74d 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -59,6 +59,7 @@ export(use_new_lineup_version)
 export(valid_years)
 export(validate_input_grouped_stats)
 export(version_dataframe)
+export(with_req_timeout)
 export(wld_lineup_year)
 import(collapse, except = fdroplevels)
 import(data.table)
diff --git a/R/utils-plumber.R b/R/utils-plumber.R
index acfe0ecc..13d784e1 100644
--- a/R/utils-plumber.R
+++ b/R/utils-plumber.R
@@ -656,3 +656,43 @@ safe_endpoint <- function(fun, endpoint, debug = NULL) {
     )
   }
 }
+
+# ---- bounded execution helper -------------------------------------------
+#' Evaluate an expression with a timeout
+#'
+#' Wraps [R.utils::withTimeout()] but returns a structured failure
+#' object instead of stopping the whole process. This allows
+#' `safe_endpoint()` to handle timeouts like normal errors without
+#' killing the API process.
+#'
+#' @param expr Expression to evaluate.
+#' @param secs Timeout in seconds (default: from env var `PLUMBER_REQ_TIMEOUT`,
+#'   or 150 if unset).
+#'
+#' @return Result of `expr` if it finishes in time; otherwise a list
+#'   with `ok = FALSE`, `error = "timeout"`, and `elapsed` seconds.
+#' @export
+with_req_timeout <- function(expr,
+                             secs = as.numeric(Sys.getenv("PLUMBER_REQ_TIMEOUT", "150"))) {
+  if (!is.finite(secs) || secs <= 0) return(force(expr))
+
+  start <- proc.time()[["elapsed"]]
+  tryCatch(
+    {
+      R.utils::withTimeout(
+        expr     = force(expr),
+        timeout  = secs,
+        onTimeout = "error"
+      )
+
+    },
+    TimeoutException = \(e) {
+      elapsed <- proc.time()[["elapsed"]] - start
+      list(
+        ok     = FALSE,
+        error  = sprintf("Request exceeded timeout of %s seconds", secs),
+        elapsed = elapsed
+      )
+    }
+  )
+}
diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R
index 1b47256b..a9382b45 100644
--- a/inst/plumber/v1/endpoints.R
+++ b/inst/plumber/v1/endpoints.R
@@ -76,19 +76,7 @@ function(pr) {
     })
 }
 
-# ---- bounded execution helper -------------------------------------------
-with_req_timeout <-
-  function(expr,
-           secs = as.numeric(Sys.getenv("PLUMBER_REQ_TIMEOUT", "150"))) {
 
-    if (!is.finite(secs) || secs <= 0) return(force(expr))
-
-    R.utils::withTimeout(
-      expr     = force(expr),
-      timeout  = secs,
-      onTimeout = "silent"   # uses setTimeLimit()
-    )
-  }
 
 # ========================================================================
 # API filters -------------------------------------------------------------
@@ -760,7 +748,7 @@ function(req, res) {
     params$lkup <- lkups$versions_paths[[req$argsQuery$version]]
     params$version <- NULL
     out <- do.call(pipapi::ui_hp_stacked, params) |>
-    with_req_timeout()
+      with_req_timeout()
     if (is.null(out)) {
       res$status <- 503
       return(list(

From 8c8151db2035379af1c15d82024edf86131d73b2 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Mon, 22 Sep 2025 21:13:23 -0400
Subject: [PATCH 175/203] test safe_endpoint with cp-charts

---
 inst/plumber/v1/endpoints.R | 32 ++++++++++----------------------
 1 file changed, 10 insertions(+), 22 deletions(-)

diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R
index a9382b45..6d185d65 100644
--- a/inst/plumber/v1/endpoints.R
+++ b/inst/plumber/v1/endpoints.R
@@ -945,29 +945,17 @@ function(req, res) {
 #* @param ppp_version:[chr] ppp year to be used
 #* @param version:[chr] Data version. Defaults to most recent version. See api/v1/versions
 #* @serializer json
-function(req, res) {
-  tryCatch({
-    params <- req$argsQuery
-    params$lkup <- lkups$versions_paths[[req$argsQuery$version]]
-    params$version <- NULL
-    out <- do.call(pipapi::ui_cp_charts, params) |>
+cp_charts <- safe_endpoint(function(req, res) {
+  params <- req$argsQuery
+  params$lkup <- lkups$versions_paths[[req$argsQuery$version]]
+  params$version <- NULL
+
+  # wrap the heavy work in with_req_timeout
+  do.call(pipapi::ui_cp_charts, params) |>
     with_req_timeout()
-    if (is.null(out)) {
-      res$status <- 503
-      return(list(
-        error = "Request timed out",
-        request_id = req$.id,
-        endpoint = "/api/v1/cp-charts"
-      ))
-    }
-    out
-  }, error = function(e) {
-    res$status <- 500
-    list(error = "Error in /api/v1/cp-charts",
-         message = e$message,
-         request_id = tryCatch(req$.id, error = \(.) NA))
-  })
-}
+
+  },
+  endpoint = "/api/v1/cp-charts")
 
 ### cp-download -----------
 #* Return Country Profile - Downloads

From 46373a4c8cf29e9dbaefa84493aafed14b3c4feb Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Mon, 22 Sep 2025 21:13:37 -0400
Subject: [PATCH 176/203] Increment version number to 1.3.24.9011

---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index ef21f1b2..f16b98a1 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: pipapi
 Title: API for the Poverty and Inequality Platform
-Version: 1.3.24.9010
+Version: 1.3.24.9011
 Authors@R: 
     c(person(given = "Tony",
             family = "Fujs",

From 219167e0f6ff0e92ba77967d2f412611ed09f2f9 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Tue, 23 Sep 2025 09:55:54 -0400
Subject: [PATCH 177/203] document with_req_timeout

---
 man/with_req_timeout.Rd | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)
 create mode 100644 man/with_req_timeout.Rd

diff --git a/man/with_req_timeout.Rd b/man/with_req_timeout.Rd
new file mode 100644
index 00000000..e6bc2c4c
--- /dev/null
+++ b/man/with_req_timeout.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils-plumber.R
+\name{with_req_timeout}
+\alias{with_req_timeout}
+\title{Evaluate an expression with a timeout}
+\usage{
+with_req_timeout(
+  expr,
+  secs = as.numeric(Sys.getenv("PLUMBER_REQ_TIMEOUT", "150"))
+)
+}
+\arguments{
+\item{expr}{Expression to evaluate.}
+
+\item{secs}{Timeout in seconds (default: from env var \code{PLUMBER_REQ_TIMEOUT},
+or 150 if unset).}
+}
+\value{
+Result of \code{expr} if it finishes in time; otherwise a list
+with \code{ok = FALSE}, \code{error = "timeout"}, and \code{elapsed} seconds.
+}
+\description{
+Wraps \code{\link[R.utils:withTimeout]{R.utils::withTimeout()}} but returns a structured failure
+object instead of stopping the whole process. This allows
+\code{safe_endpoint()} to handle timeouts like normal errors without
+killing the API process.
+}

From 57b25480cc6703f87386d8126f3b479184b5b721 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Wed, 24 Sep 2025 09:47:09 -0400
Subject: [PATCH 178/203] update news

---
 NEWS.md | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/NEWS.md b/NEWS.md
index 4808fc19..46f378fb 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,29 @@
 # pipapi (development version)
-* add new way to infer poverty lines
+* add function safe_endpoint
+* remove the preroute as it is in endpoints.R
+* new plumber setup.
+* revamp plumber
+* implement timeout wrapper to more endpoint
+* add Request timed out to pip and pip-grp endpoints
+* Add a tiny request-ID + timings filter
+* add some tryCatch in some endpoints
+* handle errors
+* fix bug req$args to req$argsQuery
+* remove gc()
+* filter properly proverty lines to update master (inter cache) file
+* make use of missing data file instead of recreate i
+* remove verbose in join
+* sort for surveys with more than 1 reporting_level
+* update empty data
+* add multiple popshare for fillgaps
+* remove filter_Lkup from fg_pip()
+* allow to select regions from any variable. this must be changed once we allow many other aggregations
+* add prosperity gap to ui_cp_charts
+* multiple popshare for svy years.
+* improved version of  infer_poverty_line()
+* update data documentation
+* store vars in env defined in zzz
+* Fix several bugs
 
 
 # pipapi 1.3.24

From 6b6643d7953dd5cf84fdfe9620d1e9c90b29a284 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Wed, 24 Sep 2025 09:47:23 -0400
Subject: [PATCH 179/203] Increment version number to 1.4.0

---
 DESCRIPTION | 2 +-
 NEWS.md     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index f16b98a1..8130c10b 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: pipapi
 Title: API for the Poverty and Inequality Platform
-Version: 1.3.24.9011
+Version: 1.4.0
 Authors@R: 
     c(person(given = "Tony",
             family = "Fujs",
diff --git a/NEWS.md b/NEWS.md
index 46f378fb..79aac109 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,4 +1,4 @@
-# pipapi (development version)
+# pipapi 1.4.0
 * add function safe_endpoint
 * remove the preroute as it is in endpoints.R
 * new plumber setup.

From a01a71ecb7b70aa2d534cb63c5a8d54a35f7a7e4 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Wed, 24 Sep 2025 11:23:36 -0400
Subject: [PATCH 180/203] remove old regions from pc_charts

---
 R/ui_poverty_indicators.R | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/R/ui_poverty_indicators.R b/R/ui_poverty_indicators.R
index b4e2b26d..899e4a34 100644
--- a/R/ui_poverty_indicators.R
+++ b/R/ui_poverty_indicators.R
@@ -96,7 +96,18 @@ ui_pc_regional <- function(country   = "ALL",
         reporting_pop  = reporting_pop / pop_units
       )]
 
-  out <- out[estimate_type == "actual"]
+  # TEMP START: remove old aggregations --------------
+  cl <- lkup$aux_files$country_list
+
+  regs <- cl[, .(region_code, africa_split_code)] |>
+    unlist()  |>  # convert to vector
+    na_omit() |>
+    unique()  |>
+    unname()  |>
+    c("WLD")  # add the world
+  # TEMP END: remove old aggregations --------------
+
+  out <- out[estimate_type == "actual" & region_code %in% regs]
 
   return(out)
 }

From 48b1a2ca5dc3c404bb915c7f61c26ab8c3fc168f Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Wed, 24 Sep 2025 11:23:47 -0400
Subject: [PATCH 181/203] Increment version number to 1.4.0.9000

---
 DESCRIPTION | 2 +-
 NEWS.md     | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 8130c10b..c60d4f17 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: pipapi
 Title: API for the Poverty and Inequality Platform
-Version: 1.4.0
+Version: 1.4.0.9000
 Authors@R: 
     c(person(given = "Tony",
             family = "Fujs",
diff --git a/NEWS.md b/NEWS.md
index 79aac109..31ff8530 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,5 @@
+# pipapi (development version)
+
 # pipapi 1.4.0
 * add function safe_endpoint
 * remove the preroute as it is in endpoints.R

From 243ef1d1b185171923a425e693c2329ad5150ca8 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Wed, 24 Sep 2025 11:28:40 -0400
Subject: [PATCH 182/203] fix descriptions

---
 NEWS.md | 51 ++++++++++++++++++++++++++-------------------------
 1 file changed, 26 insertions(+), 25 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 31ff8530..5eab34d6 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,31 +1,32 @@
 # pipapi (development version)
+* Removed old regions ggregate from `pc_chart`.
 
 # pipapi 1.4.0
-* add function safe_endpoint
-* remove the preroute as it is in endpoints.R
-* new plumber setup.
-* revamp plumber
-* implement timeout wrapper to more endpoint
-* add Request timed out to pip and pip-grp endpoints
-* Add a tiny request-ID + timings filter
-* add some tryCatch in some endpoints
-* handle errors
-* fix bug req$args to req$argsQuery
-* remove gc()
-* filter properly proverty lines to update master (inter cache) file
-* make use of missing data file instead of recreate i
-* remove verbose in join
-* sort for surveys with more than 1 reporting_level
-* update empty data
-* add multiple popshare for fillgaps
-* remove filter_Lkup from fg_pip()
-* allow to select regions from any variable. this must be changed once we allow many other aggregations
-* add prosperity gap to ui_cp_charts
-* multiple popshare for svy years.
-* improved version of  infer_poverty_line()
-* update data documentation
-* store vars in env defined in zzz
-* Fix several bugs
+* Added `safe_endpoint` function for safer endpoint handling.
+* Removed `preroute` as it is now managed in `endpoints.R`.
+* Introduced a new and improved Plumber setup.
+* Refactored Plumber configuration for better maintainability.
+* Implemented a timeout wrapper for more endpoints.
+* Added "Request timed out" responses to `pip` and `pip-grp` endpoints.
+* Introduced a lightweight request ID and timing filter for better request tracking.
+* Added `tryCatch` blocks to improve error handling in several endpoints.
+* Enhanced error handling throughout the API.
+* Fixed bug by changing `req$args` to `req$argsQuery`.
+* Removed unnecessary calls to `gc()`.
+* Improved filtering of poverty lines when updating the master (intermediate cache) file.
+* Utilized the missing data file instead of recreating it.
+* Removed verbose output in join operations.
+* Ensured correct sorting for surveys with multiple `reporting_level` values.
+* Updated handling of empty data responses.
+* Enabled support for multiple `popshare` values in fill gaps operations.
+* Removed `filter_Lkup` from `fg_pip()`.
+* Allowed region selection from any variable (to be revised when supporting more aggregations).
+* Added prosperity gap metric to `ui_cp_charts`.
+* Enabled multiple `popshare` values for survey years.
+* Improved the `infer_poverty_line()` function.
+* Updated data documentation for clarity and completeness.
+* Stored variables in the environment defined in `zzz`.
+* Fixed several bugs across the codebase.
 
 
 # pipapi 1.3.24

From ff3fd8a2c3d0f759c51ed3d3baec518528764001 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Sat, 27 Sep 2025 14:30:40 -0400
Subject: [PATCH 183/203] remove MDG censoring

---
 inst/plumber/v1/endpoints.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R
index 6d185d65..428a82db 100644
--- a/inst/plumber/v1/endpoints.R
+++ b/inst/plumber/v1/endpoints.R
@@ -1005,7 +1005,7 @@ function(req, res) {
 
     if (req$argsQuery$table == "countries" && exclude == TRUE) {
       # hardcoded
-      to_remove <- c("MDG", "UKR")
+      to_remove <- c("UKR")
       out <- out[!(country_code %in% to_remove)]
     }
   }

From 74b77ab49edce14745d66f24058df0ac6f3c771c Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Sat, 27 Sep 2025 14:43:57 -0400
Subject: [PATCH 184/203] excliude regions that should not go in UI

---
 R/get_aux_table.R           | 26 +++++++++++++++++++++++++-
 inst/plumber/v1/endpoints.R |  8 --------
 2 files changed, 25 insertions(+), 9 deletions(-)

diff --git a/R/get_aux_table.R b/R/get_aux_table.R
index 5f2bae95..e26db872 100644
--- a/R/get_aux_table.R
+++ b/R/get_aux_table.R
@@ -45,15 +45,39 @@ get_aux_table <- function(data_dir = NULL,
 #' Helper function to the UI
 #' @param data_dir character: Data directory
 #' @param table character: Name of auxiliary table
+#' @param esclude logical: whether or not to exclude some countries or regions...
 #'
 #' @return data.frame
 #' @export
 #'
-get_aux_table_ui <- function(data_dir, table) {
+get_aux_table_ui <- function(data_dir,
+                             table,
+                             exclude = TRUE) {
 
   out <- get_aux_table(data_dir    = data_dir,
                        table       = table,
                        long_format = FALSE)
 
+  if (table == "regions") {
+    # TEMP START: remove old aggregations --------------
+    cl <- lkup$aux_files$country_list
+
+    regs <- cl[, .(region_code, africa_split_code)] |>
+      unlist()  |>  # convert to vector
+      na_omit() |>
+      unique()  |>
+      unname()  |>
+      c("WLD")  # add the world
+    # TEMP END: remove old aggregations --------------
+
+    out <- out[region_code %in% regs]
+
+  } else if (table == "countries" && exclude == TRUE) {
+      # hardcoded
+      to_remove <- c("UKR")
+      out <- out[!(country_code %in% to_remove)]
+  }
+
+
   return(out)
 }
diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R
index 428a82db..359167d2 100644
--- a/inst/plumber/v1/endpoints.R
+++ b/inst/plumber/v1/endpoints.R
@@ -989,7 +989,6 @@ function(req, res) {
 #* @param exclude:[bool] exclude countries. only applies for "countries" table
 function(req, res) {
   params <- req$argsQuery
-  exclude <- req$argsQuery$exclude
   res$serializer <- pipapi::assign_serializer(format = params$format)
   if (is.null(req$argsQuery$table)) {
     # return all available tables if none selected
@@ -1000,14 +999,7 @@ function(req, res) {
     params$data_dir <- lkups$versions_paths[[params$version]]$data_root
     params$format <- NULL
     params$version <- NULL
-    params$exclude <- NULL
     out <- do.call(pipapi::get_aux_table_ui, params)
-
-    if (req$argsQuery$table == "countries" && exclude == TRUE) {
-      # hardcoded
-      to_remove <- c("UKR")
-      out <- out[!(country_code %in% to_remove)]
-    }
   }
   out
 }

From 2555f1a65c0b6e5b513754c8a9813421ccd79641 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Sat, 27 Sep 2025 14:44:52 -0400
Subject: [PATCH 185/203] document

---
 man/get_aux_table_ui.Rd | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/man/get_aux_table_ui.Rd b/man/get_aux_table_ui.Rd
index 8d5edc4b..21d1a485 100644
--- a/man/get_aux_table_ui.Rd
+++ b/man/get_aux_table_ui.Rd
@@ -5,12 +5,14 @@
 \title{Return specified auxiliary data in wide format
 Helper function to the UI}
 \usage{
-get_aux_table_ui(data_dir, table)
+get_aux_table_ui(data_dir, table, exclude = TRUE)
 }
 \arguments{
 \item{data_dir}{character: Data directory}
 
 \item{table}{character: Name of auxiliary table}
+
+\item{esclude}{logical: whether or not to exclude some countries or regions...}
 }
 \value{
 data.frame

From fa4639935433cfba64bc0e4f4c5805d8901fd808 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Sat, 27 Sep 2025 14:45:39 -0400
Subject: [PATCH 186/203] update news

---
 NEWS.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/NEWS.md b/NEWS.md
index 5eab34d6..71da4a50 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,7 @@
 # pipapi (development version)
-* Removed old regions ggregate from `pc_chart`.
+* Removed old regions aggregate from `pc_chart`.
+* Removed old regions aggregate from `get_aux_table_ui`.
+* Bring back MDG.
 
 # pipapi 1.4.0
 * Added `safe_endpoint` function for safer endpoint handling.

From 54e64bf7feac76e47be5985b785cf2e6f45c7bd6 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Sat, 27 Sep 2025 14:45:48 -0400
Subject: [PATCH 187/203] Increment version number to 1.4.0.9001

---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index c60d4f17..a0e8b314 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: pipapi
 Title: API for the Poverty and Inequality Platform
-Version: 1.4.0.9000
+Version: 1.4.0.9001
 Authors@R: 
     c(person(given = "Tony",
             family = "Fujs",

From 1235870a7ef32e30ad180982cc0717ba5c1af69f Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Sat, 27 Sep 2025 15:02:56 -0400
Subject: [PATCH 188/203] add hashed to all UI endpoint

---
 R/create_lkups.R            | 9 +++++++--
 R/ui_home_page.R            | 4 +++-
 R/ui_poverty_indicators.R   | 8 ++++++--
 inst/plumber/v1/endpoints.R | 3 ++-
 4 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/R/create_lkups.R b/R/create_lkups.R
index 32c82d53..9bb03efd 100644
--- a/R/create_lkups.R
+++ b/R/create_lkups.R
@@ -654,12 +654,17 @@ create_lkups <- function(data_dir, versions) {
   ## `pipapi` runs onto, such as path to data files, which will not be the same
   ## on my laptop, and on the PROD VM. These variables therefore need to be removed
   ## prior to the creation of the cache_data_id
-  hash_svy_lkup <- svy_lkup
+  hash_svy_lkup <- data.table::copy(svy_lkup)
   hash_svy_lkup$path <- NULL
 
-  hash_ref_lkup <- ref_lkup
+  if (use_new_lineup_version) {
+    hash_ref_lkup <- data.table::copy(refy_lkup)
+  } else {
+    hash_ref_lkup <- data.table::copy(ref_lkup)
+  }
   hash_ref_lkup$path <- NULL
 
+
   query_controls_hash <- query_controls
   query_controls_hash$version <- NULL
 
diff --git a/R/ui_home_page.R b/R/ui_home_page.R
index 6b230aea..0cc201a6 100644
--- a/R/ui_home_page.R
+++ b/R/ui_home_page.R
@@ -68,7 +68,9 @@ ui_hp_stacked <- function(povline = 1.9,
 ui_hp_countries <- function(country = c("IDN", "CIV"),
                             povline = 1.9,
                             pop_units = 1e6,
-                            lkup) {
+                            lkup,
+                            lkup_hash       = lkup$cache_data_id$hash_pip
+                            ) {
   out <- pip(
     country = country,
     year = "all",
diff --git a/R/ui_poverty_indicators.R b/R/ui_poverty_indicators.R
index 899e4a34..4821aa7a 100644
--- a/R/ui_poverty_indicators.R
+++ b/R/ui_poverty_indicators.R
@@ -15,7 +15,9 @@ ui_pc_charts <- function(country = c("AGO"),
                          reporting_level = c("all", "national", "rural", "urban"),
                          pop_units = 1e6,
                          censor  = TRUE,
-                         lkup) {
+                         lkup,
+                         lkup_hash       = lkup$cache_data_id$hash_pip_grp
+                         ) {
   # Set returned columns
   return_cols           <- lkup$return_cols$ui_pc_charts$cols
   inequality_indicators <- lkup$return_cols$ui_pc_charts$inequality_indicators
@@ -73,7 +75,9 @@ ui_pc_regional <- function(country   = "ALL",
                            year      = "ALL",
                            povline   = 1.9,
                            pop_units = 1e6,
-                           lkup) {
+                           lkup,
+                           lkup_hash  = lkup$cache_data_id$hash_pip_grp
+                           ) {
 
   # TEMPORARY UNTIL SELECTION MECHANISM IS BEING IMPROVED
   country <- toupper(country)
diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R
index 359167d2..e0fc4651 100644
--- a/inst/plumber/v1/endpoints.R
+++ b/inst/plumber/v1/endpoints.R
@@ -881,7 +881,8 @@ function(req, res) {
     params$lkup <- lkups$versions_paths[[req$argsQuery$version]]
     params$version <- NULL
     out <- do.call(pipapi::ui_pc_regional, params) |>
-    with_req_timeout()
+      with_req_timeout()
+
     if (is.null(out)) {
       res$status <- 503
       return(list(

From ecaff99f00205bf69329eebcee7f469d8fad501a Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Sat, 27 Sep 2025 15:04:17 -0400
Subject: [PATCH 189/203] Increment version number to 1.4.0.9002

---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index a0e8b314..40237391 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: pipapi
 Title: API for the Poverty and Inequality Platform
-Version: 1.4.0.9001
+Version: 1.4.0.9002
 Authors@R: 
     c(person(given = "Tony",
             family = "Fujs",

From 368462dcfa16bc3c82fa205d5dde87191968b7a5 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Sat, 27 Sep 2025 15:33:46 -0400
Subject: [PATCH 190/203] temp fix with indicators file

---
 NEWS.md                     | 1 +
 inst/plumber/v1/endpoints.R | 6 +++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/NEWS.md b/NEWS.md
index 71da4a50..74fe6b6c 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -2,6 +2,7 @@
 * Removed old regions aggregate from `pc_chart`.
 * Removed old regions aggregate from `get_aux_table_ui`.
 * Bring back MDG.
+* Add hash to all UI enpdoints
 
 # pipapi 1.4.0
 * Added `safe_endpoint` function for safer endpoint handling.
diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R
index e0fc4651..307e0d84 100644
--- a/inst/plumber/v1/endpoints.R
+++ b/inst/plumber/v1/endpoints.R
@@ -718,8 +718,12 @@ function(req) {
 #* @param version:[chr] Data version. Defaults to most recent version. See api/v1/versions
 #* @serializer json list(na="null")
 function(req) {
-  pipapi::get_aux_table(data_dir = lkups$versions_paths[[req$argsQuery$version]]$data_root,
+  out <- pipapi::get_aux_table(data_dir = lkups$versions_paths[[req$argsQuery$version]]$data_root,
                         table = "indicators")
+
+  # TEMP patch
+  out |>
+    collapse::funique(cols = "indicator_code")
 }
 
 ### decomposition-vars --------------

From 38de1febf15385ab0eadd68be897602870a801ff Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Sat, 27 Sep 2025 15:33:55 -0400
Subject: [PATCH 191/203] Increment version number to 1.4.0.9003

---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 40237391..c1ba3e99 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: pipapi
 Title: API for the Poverty and Inequality Platform
-Version: 1.4.0.9002
+Version: 1.4.0.9003
 Authors@R: 
     c(person(given = "Tony",
             family = "Fujs",

From a66f362a3faf3ca0f6bfe6d565fa8c394017177b Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Sat, 27 Sep 2025 21:34:36 -0400
Subject: [PATCH 192/203] fiw distribution_type in fg gaps and lkups

---
 R/create_lkups.R | 43 +++++++++++++++++++++++++++++++++++++------
 R/fg_pip.R       | 19 +++++++++++++++----
 2 files changed, 52 insertions(+), 10 deletions(-)

diff --git a/R/create_lkups.R b/R/create_lkups.R
index 9bb03efd..a6086bd6 100644
--- a/R/create_lkups.R
+++ b/R/create_lkups.R
@@ -198,9 +198,43 @@ create_lkups <- function(data_dir, versions) {
       refy_lkup      <- fst::read_fst(refy_lkup_path,
                                       as.data.table = TRUE)
 
+      ## TEMP START: add distribution type -----------
+      dt <- ref_lkup[, .(country_code,
+                   reporting_year,
+                   welfare_type,
+                   reporting_level,
+                   distribution_type)]
+
+
+      dt[,
+         y := as.integer(length(unique(distribution_type)) == 1),
+         by = .(country_code,
+                reporting_year,
+                welfare_type,
+                reporting_level)
+         ]
+      dt[y == 0,
+         distribution_type := "mixed"
+         ][, y := NULL]
+
+      dt <- funique(dt)
+      refy_lkup <- joyn::joyn(refy_lkup, dt,
+                              by = c("country_code",
+                                     "reporting_year",
+                                     "welfare_type",
+                                     "reporting_level"),
+                              match_type = "1:1",
+                              keep = "left",
+                              update_values = TRUE,
+                              reportvar = FALSE)
+
+
+      ## TEMP END: add distribution type -----------
+
+
 
       # ZP ADD - CREATE OBJECT: lineup years
-      #___________________________________________________________________________
+      #______________________________________________________________
       lineup_years_path <-
         fs::path(data_dir,
                  "estimations/lineup_years.fst")
@@ -298,7 +332,7 @@ create_lkups <- function(data_dir, versions) {
                               update_values = TRUE)
 
 
-      # TEMP: fix ARG population --- START
+      ## TEMP START: fix ARG population ----
       pw <- pivot(pop,
             ids = c("country_code", "data_level"),
             names = list(variable = "reporting_year",
@@ -315,7 +349,7 @@ create_lkups <- function(data_dir, versions) {
         rural = national
         )]
 
-      # TEMP: fix ARG population --- END
+      ## TEMP END: fix ARG population ------
 
       popl <- pivot(pw,
                     ids = c("country_code", "reporting_year"),
@@ -337,9 +371,6 @@ create_lkups <- function(data_dir, versions) {
                               update_values = TRUE)
 
 
-
-
-
       # --- END inclussion of CMD data.
 
       refy_lkup <- refy_lkup[reporting_year %in% lineup_years$lineup_years, ]
diff --git a/R/fg_pip.R b/R/fg_pip.R
index 726a0779..6adb29eb 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -111,7 +111,7 @@ fg_pip <- function(country,
   rm(LDTg)
   }
   rm(lfst)
-  
+
 
 
   # Add just mean and median
@@ -121,9 +121,20 @@ fg_pip <- function(country,
   # try metadata unique code
   tmp_metadata <- copy(metadata) # I think we can avoid this inefficiency.
   # Handle multiple distribution types (for aggregated distributions)
-  if (length(unique(tmp_metadata$distribution_type)) > 1) {
-    tmp_metadata[, distribution_type := "mixed"]
-  }
+
+  tmp_metadata[,
+     y := as.integer(length(unique(distribution_type)) == 1),
+     by = .(country_code,
+            reporting_year,
+            welfare_type,
+            reporting_level)
+  ]
+
+  tmp_metadata[y == 0,
+     distribution_type := "mixed"
+     ][,
+       y := NULL]
+
   # convert survey_comparability to NA
   # NOTE: This should not be necessary. for the new lineup distribution
   # metadata should come without this variable.

From 88d1df84458b8d096c7b4cef5cf777b629cede52 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Sat, 27 Sep 2025 22:05:41 -0400
Subject: [PATCH 193/203] formatting

---
 R/fg_pip.R | 31 ++++++++++++++++++-------------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/R/fg_pip.R b/R/fg_pip.R
index 6adb29eb..70111a53 100644
--- a/R/fg_pip.R
+++ b/R/fg_pip.R
@@ -142,14 +142,18 @@ fg_pip <- function(country,
   # get all vars
   meta_vars <- setdiff(names(tmp_metadata), "reporting_year")
   # transform to NA when necessary - i.e. when interpolated (two rows per reporting_year)
-  tmp_metadata[, (meta_vars) := lapply(.SD, \(x) {
-    if (uniqueN(x) == 1) {
-      x
-    } else {
-      NA
-    }}),
-    by      = c("reporting_year", "country_code", "reporting_level", "welfare_type"),
-    .SDcols = meta_vars]
+  tmp_metadata[,
+               (meta_vars) := lapply(.SD, \(x) {
+                 if (uniqueN(x) == 1) {
+                   x
+                 } else {
+                   NA
+                 }}),
+               by = c("reporting_year",
+                      "country_code",
+                      "reporting_level",
+                      "welfare_type"),
+               .SDcols = meta_vars]
 
   # Remove duplicate rows by reporting_year (keep only one row per
   # reporting_year)
@@ -158,11 +162,12 @@ fg_pip <- function(country,
 
   out <- join(res,
               tmp_metadata_unique,
-              on            = c("country_code", "reporting_year",
-                                "reporting_level"),
-              how           = "left", # ZP: change from full to left,
-                                      #  this rm nowcast years - i.e. years not included
-                                      #  as lineup years
+              on = c("country_code",
+                     "reporting_year",
+                     "reporting_level"),
+              how = "left", # ZP: change from full to left,
+                            #  this rm nowcast years - i.e. years not included
+                            #  as lineup years
               validate      = "m:1",
               drop.dup.cols = TRUE,
               verbose       = 0,

From a40eb05a2ab1b477aa88985df6f862e6342e217e Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Sat, 27 Sep 2025 22:06:04 -0400
Subject: [PATCH 194/203] Increment version number to 1.4.0.9004

---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index c1ba3e99..b4207281 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: pipapi
 Title: API for the Poverty and Inequality Platform
-Version: 1.4.0.9003
+Version: 1.4.0.9004
 Authors@R: 
     c(person(given = "Tony",
             family = "Fujs",

From 4a7d196f1d3f43629bc9ec735dad6a51b43ff8e1 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Sat, 27 Sep 2025 23:08:43 -0400
Subject: [PATCH 195/203] add cache_data_id to pip_grp_new

---
 R/pip_grp_new.R | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/R/pip_grp_new.R b/R/pip_grp_new.R
index efa25bb8..bf45c3e0 100644
--- a/R/pip_grp_new.R
+++ b/R/pip_grp_new.R
@@ -7,7 +7,8 @@ pip_grp_new <- \(country         = "ALL",
                  reporting_level = c("all", "national"),
                  lkup,
                  censor          = TRUE,
-                 additional_ind  = FALSE) {
+                 additional_ind  = FALSE,
+                 lkup_hash       = lkup$cache_data_id$hash_pip_grp) {
 
   welfare_type    <- match.arg(welfare_type)
   reporting_level <- match.arg(reporting_level)

From 63064662d1588feeb0ef495ec7f4968ee05c6667 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Sat, 27 Sep 2025 23:09:04 -0400
Subject: [PATCH 196/203] Increment version number to 1.4.0.9005

---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index b4207281..b4c9ec4f 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: pipapi
 Title: API for the Poverty and Inequality Platform
-Version: 1.4.0.9004
+Version: 1.4.0.9005
 Authors@R: 
     c(person(given = "Tony",
             family = "Fujs",

From 4bd5a614b4b29e719b78db60543ac4a51f67bf6a Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Mon, 29 Sep 2025 11:00:35 -0400
Subject: [PATCH 197/203] remove patch in indicators files

---
 inst/plumber/v1/endpoints.R | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R
index 307e0d84..98485220 100644
--- a/inst/plumber/v1/endpoints.R
+++ b/inst/plumber/v1/endpoints.R
@@ -720,10 +720,7 @@ function(req) {
 function(req) {
   out <- pipapi::get_aux_table(data_dir = lkups$versions_paths[[req$argsQuery$version]]$data_root,
                         table = "indicators")
-
-  # TEMP patch
-  out |>
-    collapse::funique(cols = "indicator_code")
+  out
 }
 
 ### decomposition-vars --------------

From 81498da6529dd42a5b9b6f37d2da55b774fd96fa Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Mon, 29 Sep 2025 11:17:00 -0400
Subject: [PATCH 198/203] Increment version number to 1.4.0.9006

---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index b4c9ec4f..7660d51c 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: pipapi
 Title: API for the Poverty and Inequality Platform
-Version: 1.4.0.9005
+Version: 1.4.0.9006
 Authors@R: 
     c(person(given = "Tony",
             family = "Fujs",

From 8ff8c4ba9605194fd130d6f1558a47fc955e2b0a Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Mon, 29 Sep 2025 13:36:46 -0400
Subject: [PATCH 199/203] fix issue with regions table in the UI

---
 R/get_aux_table.R           | 3 ++-
 inst/plumber/v1/endpoints.R | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/R/get_aux_table.R b/R/get_aux_table.R
index e26db872..5e517666 100644
--- a/R/get_aux_table.R
+++ b/R/get_aux_table.R
@@ -52,7 +52,8 @@ get_aux_table <- function(data_dir = NULL,
 #'
 get_aux_table_ui <- function(data_dir,
                              table,
-                             exclude = TRUE) {
+                             exclude = TRUE,
+                             lkup) {
 
   out <- get_aux_table(data_dir    = data_dir,
                        table       = table,
diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R
index 98485220..e887034d 100644
--- a/inst/plumber/v1/endpoints.R
+++ b/inst/plumber/v1/endpoints.R
@@ -1001,6 +1001,7 @@ function(req, res) {
     params$data_dir <- lkups$versions_paths[[params$version]]$data_root
     params$format <- NULL
     params$version <- NULL
+    params$lkup <- lkups$versions_paths[[params$version]]
     out <- do.call(pipapi::get_aux_table_ui, params)
   }
   out

From 53aadbd6f160d9223fce28b68197797beb38cd15 Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Mon, 29 Sep 2025 13:36:58 -0400
Subject: [PATCH 200/203] Increment version number to 1.4.0.9007

---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 7660d51c..18e9e001 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: pipapi
 Title: API for the Poverty and Inequality Platform
-Version: 1.4.0.9006
+Version: 1.4.0.9007
 Authors@R: 
     c(person(given = "Tony",
             family = "Fujs",

From 6e20c180f1b3fd23d6d98abe607260d2b1a5a91f Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Mon, 29 Sep 2025 14:27:17 -0400
Subject: [PATCH 201/203] fix bug

---
 inst/TMP/TMP_API_launcher.R | 9 +++------
 inst/plumber/v1/endpoints.R | 2 +-
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/inst/TMP/TMP_API_launcher.R b/inst/TMP/TMP_API_launcher.R
index 72e28195..7ce361b3 100644
--- a/inst/TMP/TMP_API_launcher.R
+++ b/inst/TMP/TMP_API_launcher.R
@@ -4,12 +4,9 @@
 library(pipapi)
 # devtools::load_all(".")
 if (Sys.info()[["user"]] == "wb384996") {
-  force <- FALSE
-  if (!"lkups" %in% ls() || isTRUE(force)) {
-    data_dir <- Sys.getenv("PIPAPI_DATA_ROOT_FOLDER_LOCAL") |>
-      fs::path()
-    fs::dir_ls(data_dir, recurse = FALSE)
-  }
+  data_dir <- Sys.getenv("PIPAPI_DATA_ROOT_FOLDER_LOCAL") |>
+    fs::path()
+  fs::dir_ls(data_dir, recurse = FALSE)
 
 
   latest_version <-
diff --git a/inst/plumber/v1/endpoints.R b/inst/plumber/v1/endpoints.R
index e887034d..7acae90b 100644
--- a/inst/plumber/v1/endpoints.R
+++ b/inst/plumber/v1/endpoints.R
@@ -999,9 +999,9 @@ function(req, res) {
   } else {
     # Return only requested table
     params$data_dir <- lkups$versions_paths[[params$version]]$data_root
+    params$lkup <- lkups$versions_paths[[params$version]]
     params$format <- NULL
     params$version <- NULL
-    params$lkup <- lkups$versions_paths[[params$version]]
     out <- do.call(pipapi::get_aux_table_ui, params)
   }
   out

From c2d0a3340ae9c2a3771fd939eb3b5c3be0ca31ab Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Mon, 29 Sep 2025 14:27:29 -0400
Subject: [PATCH 202/203] Increment version number to 1.4.0.9008

---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 18e9e001..d724e400 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: pipapi
 Title: API for the Poverty and Inequality Platform
-Version: 1.4.0.9007
+Version: 1.4.0.9008
 Authors@R: 
     c(person(given = "Tony",
             family = "Fujs",

From a521953b53f222206657671219c5020c97ea856a Mon Sep 17 00:00:00 2001
From: "R.Andres Castaneda" <acastastanedaa@worldbank.org>
Date: Mon, 29 Sep 2025 16:14:21 -0400
Subject: [PATCH 203/203] Increment version number to 1.4.1

---
 DESCRIPTION | 2 +-
 NEWS.md     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index d724e400..9aedcc13 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: pipapi
 Title: API for the Poverty and Inequality Platform
-Version: 1.4.0.9008
+Version: 1.4.1
 Authors@R: 
     c(person(given = "Tony",
             family = "Fujs",
diff --git a/NEWS.md b/NEWS.md
index 74fe6b6c..4c3be5b2 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,4 +1,4 @@
-# pipapi (development version)
+# pipapi 1.4.1
 * Removed old regions aggregate from `pc_chart`.
 * Removed old regions aggregate from `get_aux_table_ui`.
 * Bring back MDG.