databrew · arytontediarjo · Nov 24, 2023 · Nov 24, 2023 · Nov 24, 2023 · Nov 24, 2023
diff --git a/scripts/metadata/generate_metadata.R b/scripts/metadata/generate_metadata.R
@@ -27,6 +27,7 @@ library(rmarkdown)
 library(knitr)
 library(kableExtra)
 library(readr)
+source('tests.R')
 
 # to create rmd output
 dir.create('rmds', recursive = TRUE)
@@ -522,7 +523,7 @@ households_sp <- v0demography %>%
   filter(!is.na(x))
 coordinates(households_sp) <- ~x+y
 load('../../data_public/spatial/new_clusters.RData')
-# buffer clusters by 20 meters so as to
+# buffer clusters by 50 meters so as to
 p4s <- "+proj=utm +zone=37 +south +ellps=WGS84 +datum=WGS84 +units=m +no_defs"
 crs <- CRS(p4s)
 llcrs <- CRS("+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0")
@@ -563,6 +564,20 @@ households_sp_projected@data$old_cluster_correct <-
   ifelse(is.na(households_sp_projected@data$cluster_geo),
          households_sp_projected@data$cluster_with_buffer,
          households_sp_projected@data$cluster_geo)
+
+# x = households_sp_projected@data[households_sp_projected@data$old_cluster_correct != households_sp_projected@data$cluster_correct,]
+# x <- x %>% filter(!is.na(hhid))
+# x <- x %>%
+#   mutate(keep = ifelse(is.na(cluster_geo), cluster_correct, old_cluster_correct))
+
+# households_sp_projected@data$old_cluster_correct <-
+#   ifelse(is.na(households_sp_projected@data$cluster_geo),
+#          households_sp_projected@data$cluster_correct,
+#          households_sp_projected@data$old_cluster_correct)
+
+# Household 88324 is like household 03005, misclassified
+households_sp_projected@data$old_cluster_correct[households_sp_projected@data$hhid == '03005'] <- '3'
+households_sp_projected@data$old_cluster_correct[households_sp_projected@data$hhid == '88324'] <- '88'
 # old_cluster_correct is the right variable for geographic location, not geo_cluster_num
 # since geo_cluster_num was calculated using a faulty method which put households which
 # were within the buffer of >1 cluster into the wrong cluster at times
@@ -670,13 +685,14 @@ departures <- bind_rows(safety_departures, safety_deaths, efficacy_deaths)
 # events <- bind_rows(arrivals, departures) %>% arrange(todays_date)
 # Get the starting roster
 starting_roster <- v0demography_repeat_individual %>%
-  left_join(v0demography %>% dplyr::select(hhid, start_time, KEY), by = c('PARENT_KEY' = 'KEY')) %>%
+  left_join(v0demography_full %>% dplyr::select(hhid, start_time, KEY), by = c('PARENT_KEY' = 'KEY')) %>%
   dplyr::select(hhid, start_time, firstname, lastname, dob, sex, extid) %>%
   filter(!is.na(extid)) %>%
   arrange(desc(start_time)) %>%
   dplyr::distinct(extid, .keep_all = TRUE) %>%
   mutate(remove = FALSE) %>%
-  mutate(index = 1:nrow(.))
+  mutate(index = 1:nrow(.)) %>%
+  filter(!is.na(hhid))
 # save starting roster for nika
 if(FALSE){
   nika <- starting_roster %>%
@@ -701,6 +717,18 @@ dead_in_safety <- starting_roster %>%
 starting_roster <- starting_roster %>%
   filter(migrated != 1, dead != 1)
 
+# Prior to adding arrivals, remove some arrivals if they occured BEFORE a departure or out-migration
+remove_these <- arrivals %>%
+  dplyr::select(arrival_time = start_time,
+                extid, hhid) %>%
+  left_join(
+    departures %>%
+      dplyr::select(departure_time = start_time,
+                    extid, hhid)
+  ) %>%
+  filter(arrival_time < departure_time)
+arrivals <- arrivals %>% filter(!extid %in% remove_these$extid)
+
 # Go through each arrival and add information
 roster <- bind_rows(
   starting_roster,
@@ -737,7 +765,7 @@ households <- roster %>%
   summarise(roster = paste0(roster_name[dead == 0 & migrated == 0], collapse = ', '),
             num_members = length(which(dead == 0 & migrated == 0))) %>%
   # get cluster
-  left_join(v0demography %>%
+  left_join(v0demography_full %>%
               dplyr::distinct(hhid, .keep_all = TRUE) %>%
               dplyr::select(hhid, cluster))
 # get assignments
@@ -899,7 +927,7 @@ safety_individuals <- individuals
 # Remove from safety households which contain 100% refused individuals
 # https://trello.com/c/mGoKfh7w/2071-update-metadata-script-safety-to-remove-from-safety-lists-all-individuals-from-households-with-100-refusals
 if(TRUE){
-
+  removed_households <- c() # placeholder for checking on removed households
   remove_visits <- paste0('V', 1:4)
   for(i in 1:length(remove_visits)){
     this_visit <- remove_visits[i]
@@ -929,13 +957,20 @@ if(TRUE){
       mutate(p_refusals = n_refusals / n_individual_submissions * 100) %>%
       arrange(desc(p_refusals))
     all_refusals <- dx %>% filter(p_refusals == 100) %>% arrange(hhid)
+    removed_households <- c(removed_households, all_refusals$hhid)
     message('---Going to remove ', nrow(all_refusals))
     # write_csv(all_refusals, '~/Desktop/all_refusals.csv')
     households <- households %>% filter(!hhid %in% all_refusals$hhid)
     individuals <- individuals %>% filter(hhid %in% households$hhid)
   }
 }
 
+# Keep only those individuals whose cluster is "in study"
+in_study_clusters <- add_zero(sort(unique(clusters_projected@data$cluster_nu)), 2)
+individuals <- individuals %>%
+  filter(cluster %in% in_study_clusters)
+households <- households %>%
+  filter(cluster %in% in_study_clusters)
 
 # Write csvs
 if(!dir.exists('safety_metadata')){
@@ -1115,6 +1150,80 @@ if(FALSE){
   system(system_text)
   setwd(owd)
 }
+
+
+if(FALSE){
+  # some safety sanity checks
+  load('rmds/safety_tables.RData')
+  # any individual who is eos remains eos forever
+  ever_eos <-
+    c(safety_repeat_individual %>% filter(safety_status == 'eos') %>% pull(extid),
+      safetynew_repeat_individual %>% filter(safety_status == 'eos') %>% pull(extid)) %>%
+    unique
+  table(individuals$starting_safety_status[individuals$extid %in% ever_eos]) # this should be all eos
+
+  # Same number of households in both `individual_data.csv` and `household_data.csv` (JOE)
+  length(unique(households$hhid))
+  length(unique(individuals$hhid))
+
+  # Only in-cluster households included (JOE)
+  table(households$cluster %in% in_study_clusters)
+  table(individuals$cluster %in% in_study_clusters)
+
+  # Anyone who migrated out is EOS
+  migrated_ids <- safety_repeat_individual %>% filter(person_absent_reason == 'Migrated') %>% pull(extid)
+  table(migrated_ids %in% individuals$extid)
+  still_there <- individuals %>% filter(extid %in% migrated_ids) # they're all eos, all good
+
+  # Anyone who died is EOS
+  died_ids  <- safety_repeat_individual %>% filter(person_absent_reason == 'Died') %>% pull(extid)
+  table(died_ids %in% individuals$extid) # should be all false
+  still_there <- individuals %>% filter(extid %in% died_ids) # none
+
+  # Anyone with a non-EOS individual status at end of visit 2 is not EOS in `individual_data.csv` (JOE)
+  not_eos_v2 <- safety_repeat_individual %>%
+    left_join(safety, by = c('PARENT_KEY' = 'KEY')) %>%
+    filter(visit == 'V2') %>%
+    filter(safety_status != 'eos') %>%
+    pull(extid)
+  now_status <- individuals %>% filter(extid %in% not_eos_v2)
+  table(now_status$starting_safety_status) # should be no eos
+  table(not_eos_v2 %in% individuals$extid) # they should all be there EXCEPT those which are manually removed or all refusals
+  not_there <- not_eos_v2[!not_eos_v2 %in% individuals$extid] %>% unique()
+  table(safety_repeat_individual %>% filter(extid %in% not_there) %>% pull(safety_status))
+  # See if they are in removed households
+  table(substr(not_there, 1, 5) %in% removed_households) # only some
+  # See if they are in departures
+  table(not_there %in% departures$extid) # most of them
+  table(not_there %in% departures$extid | substr(not_there, 1, 5) %in% removed_households)
+
+  # Visit Status checks
+  safety_test_tbl <- dplyr::bind_rows(
+      safety %>%
+        dplyr::inner_join(safety_repeat_individual, by = c('KEY' = 'PARENT_KEY')) %>%
+        dplyr::filter(extid %in% (individuals %>% dplyr::filter(starting_pregnancy_status != 'in') %>% .$extid)),
+      safetynew %>%
+        dplyr::inner_join(safetynew_repeat_individual, by = c('KEY' = 'PARENT_KEY')) %>%
+        dplyr::filter(extid %in% (individuals %>% dplyr::filter(starting_pregnancy_status != 'in') %>% .$extid))
+  )
+
+  # Test changes on safety visit status out -> eos violation
+  safety_test_tbl %>%
+    dplyr::select(visit, safety_status, extid) %>%
+    tidyr::pivot_wider(names_from = visit, values_from = safety_status) %>%
+    dplyr::filter(V1 == 'out', V2 == 'eos') %>%
+    dplyr::select(extid)
+
+  # Test changes on safety visit status in -> out violation
+  safety_test_tbl %>%
+    dplyr::select(visit, safety_status, extid) %>%
+    tidyr::pivot_wider(names_from = visit, values_from = safety_status) %>%
+    dplyr::filter(V1 == 'in', V2 == 'out') %>%
+    dplyr::select(extid)
+
+
+ # that covers them
+}
 # </Safety> ##############################################################################
 ##############################################################################
 ##############################################################################
@@ -1167,7 +1276,7 @@ if(FALSE){
   for(i in 1:nrow(hei)){
     this_extid <- hei$extid[i]
     message(i, ' of ', nrow(hei))
-    v1 <- 
+    v1 <-
       bind_rows(
         healtheconbaseline_repeat_individual %>% filter(extid == this_extid) %>% dplyr::select(extid, hecon_individual_status) %>% mutate(type = 'old'),
         healtheconnew_repeat_individual %>% filter(extid == this_extid) %>% dplyr::select(extid, hecon_individual_status)
@@ -1190,7 +1299,7 @@ if(FALSE){
     if(length(status_after_v2_per_v2_form) == 0){status_after_v2_per_v2_form <- NA}
     why_absent_v2 <- v2_absent %>% pull(person_absent_reason)
     if(length(why_absent_v2) == 0){why_absent_v2 <- NA}
-    out <- 
+    out <-
       tibble(extid = this_extid,
              any_v1,
              v1_new,
@@ -1216,16 +1325,28 @@ if(FALSE){
 starting_hecon_statuses <-
   bind_rows(
     healtheconnew_repeat_individual %>% dplyr::select(extid, PARENT_KEY, hecon_individual_status) %>%
+      dplyr::mutate(visit = 'V1') %>%
       left_join(healtheconnew %>% dplyr::select(KEY, start_time) %>% mutate(start_time = as.POSIXct(start_time)), by = c('PARENT_KEY' = 'KEY')),
     healtheconbaseline_repeat_individual %>% dplyr::select(extid, PARENT_KEY, hecon_individual_status) %>%
-      left_join(healtheconbaseline %>% dplyr::select(KEY, start_time) %>% mutate(start_time = as.POSIXct(start_time)), by = c('PARENT_KEY' = 'KEY')),
+      dplyr::mutate(visit = 'V1') %>%
+      left_join(healtheconbaseline  %>% dplyr::select(KEY, start_time) %>% mutate(start_time = as.POSIXct(start_time)), by = c('PARENT_KEY' = 'KEY')),
     healtheconmonthly_repeat_individual %>% dplyr::select(extid, PARENT_KEY, hecon_individual_status) %>%
-      left_join(healtheconmonthly %>% dplyr::select(KEY, start_time) %>% mutate(start_time = as.POSIXct(start_time)), by = c('PARENT_KEY' = 'KEY'))
+      left_join(healtheconmonthly %>% dplyr::select(KEY, visit, start_time) %>% mutate(start_time = as.POSIXct(start_time)), by = c('PARENT_KEY' = 'KEY'))
   ) %>%
   filter(!is.na(hecon_individual_status)) %>%
-  arrange(desc(start_time)) %>%
-  dplyr::distinct(extid, .keep_all = TRUE) %>%
-  dplyr::select(extid, starting_hecon_status = hecon_individual_status)
+  dplyr::select(start_time, extid, visit, starting_hecon_status = hecon_individual_status)
+
+# QC on healthecon individual statuses
+log_checks <- check_hecon_ind_status_diff(starting_hecon_statuses, status_col = 'starting_hecon_status')
+if(nrow(log_checks) == 0){
+  starting_hecon_statuses <- starting_hecon_statuses %>%
+    dplyr::arrange(desc(start_time)) %>%
+    dplyr::distinct(extid, .keep_all = TRUE) %>%
+    dplyr::select(-visit, -start_time)
+}else{
+  stop('Please check log_checks to check status change errors')
+}
+
 # In the case of someone not being in the above dataset but being preselected for health economics
 # the status should be "out"
 starting_roster <- left_join(starting_roster, starting_hecon_statuses) %>%
@@ -1373,13 +1494,25 @@ visits_done <- healtheconbaseline %>%
   summarise(visits_done = paste0(sort(unique(visit)), collapse = ', '))
 # Get baseline + monthly in/out status
 right <- healtheconbaseline %>%
-  mutate(start_time = as.POSIXct(start_time)) %>%
-  dplyr::select(start_time, hhid, hecon_hh_status = hecon_household_status) %>%
+  mutate(start_time = as.POSIXct(start_time),
+         visit = 'V1') %>%
+  dplyr::select(visit, start_time, hhid, hecon_hh_status = hecon_household_status) %>%
   bind_rows(healtheconmonthly %>%
               mutate(start_time = as.POSIXct(start_time)) %>%
-              dplyr::select(start_time, hhid, hecon_hh_status = hecon_household_status)) %>%
-  arrange(desc(start_time)) %>%
-  dplyr::distinct(hhid, .keep_all = TRUE)
+              dplyr::select(visit, start_time, hhid, hecon_hh_status = hecon_household_status))
+
+# QC on HH STATUSES
+log_checks <- check_hecon_hh_status_diff(right, status_col = 'hecon_hh_status')
+if(nrow(log_checks) == 0){
+  right <- right %>%
+    dplyr::arrange(desc(start_time)) %>%
+    dplyr::distinct(hhid, .keep_all = TRUE) %>%
+    dplyr::select(-visit, -start_time)
+}else{
+  stop('Please check log_checks to check status change errors')
+}
+
+
 visits_done <- left_join(visits_done, right)
 households <- left_join(households, visits_done)
 # If NA hecon_hh_status, this means that the household did not have any  visit, and
@@ -1729,7 +1862,7 @@ pfu_in <-
   dplyr::select(extid, pregnancy_status) %>%
   filter(pregnancy_status == 'in')
 
-# Get the starting roster 
+# Get the starting roster
 starting_roster <- v0demography_full_repeat_individual %>%
   dplyr::select(PARENT_KEY, firstname, lastname, dob, sex, extid) %>%
   left_join(v0demography_full %>% dplyr::select(hhid, start_time, KEY), by = c('PARENT_KEY' = 'KEY')) %>%
@@ -1969,15 +2102,15 @@ if(FALSE){
     dir.create('rmds/pk_visit_control_sheets')
   }
   load('rmds/pk_tables.RData')
-  
+
   vcs_list <- sort(unique(individuals$cluster))
   for(a in 1:length(vcs_list)){
     this_vcs <- vcs_list[a]
     message(a, ' of ', length(vcs_list), ' WD: ', getwd())
     rmarkdown::render('rmds/pk_visit_control_sheet.Rmd', params = list('vcs' = this_vcs),
                       output_file = paste0( getwd(), '/pk_visit_control_sheets/', this_vcs, '.pdf'))
   }
-  
+
   # Now stitch them all together
   owd <- getwd()
   setwd('rmds/pk_visit_control_sheets/')

diff --git a/scripts/metadata/tests.R b/scripts/metadata/tests.R
@@ -0,0 +1,63 @@
+# This script is used as a utility script to do spot checks for metadata in healthecon
+
+
+#' checks healthecon individual status diff based on visits
+#' @return tibble of errors in status changes
+check_hecon_ind_status_diff <- function(individuals, status_col){
+  ### Checks Variables
+  possible_visits_list <- c('V1', 'V2', 'V3', 'V4', 'V5')
+  unique_visit <- unique(individuals$visit)
+  add_cols <- setdiff(possible_visits_list, unique_visit)
+
+  ### Individual Checks
+  qc_data <- individuals %>%
+    dplyr::distinct(extid, visit, !!sym(status_col)) %>%
+    tidyr::pivot_wider(names_from = visit, id_cols = c(extid), values_from = !!sym(status_col)) %>%
+    bind_cols(setNames(rep(list(NA), length(add_cols)), add_cols))
+
+  checks <- qc_data %>%
+    dplyr::mutate(error_flag = case_when(
+      (!V1 %in% c('in', 'out')) ~ 'V0 -> V1 hecon hh status error',
+      (!is.na(V1) & !is.na(V2)) & (V1 == 'in'  &  V2 == 'out') ~ 'V1 -> V2 hecon ind status error in -> out is not possible',
+      (!is.na(V1) & !is.na(V2)) & (V1 == 'out' &  (V2 %in% c('in', 'eos'))) ~ 'V1 -> V2 hecon ind status error out -> in or eos is not possible',
+      (!is.na(V2) & !is.na(V3)) & (V2 == 'in'  &  V3 == 'out') ~ 'V2 -> V3 hecon ind status error in -> out is not possible',
+      (!is.na(V2) & !is.na(V3)) & (V2 == 'out' &  (V3 %in% c('in', 'eos'))) ~ 'V2 -> V3 hecon ind status error out -> in or eos is not possible',
+      (!is.na(V3) & !is.na(V4)) & (V3 == 'in'  & V4 == 'out') ~ 'V3 -> V4 hecon ind status error in -> out is not possible',
+      (!is.na(V3) & !is.na(V4)) & (V3 == 'out' & (V4 %in% c('in', 'eos'))) ~ 'V3 -> V4 hecon ind status error out -> in or eos is not possible',
+      TRUE ~ 'pass'
+    )) %>%
+    dplyr::filter(error_flag != 'pass')
+
+  return(checks)
+
+}
+
+#' checks healthecon household status diff based on visits
+#' @return tibble of errors in status changes
+check_hecon_hh_status_diff <- function(households, status_col) {
+  ### Checks Variables
+  possible_visits_list <- c('V1', 'V2', 'V3', 'V4', 'V5')
+  unique_visit <- unique(households$visit)
+  add_cols <- setdiff(possible_visits_list, unique_visit)
+
+  ### Individual Checks
+  qc_data <- households %>%
+    dplyr::distinct(hhid, visit, !!sym(status_col)) %>%
+    tidyr::pivot_wider(names_from = visit, id_cols = c(hhid), values_from = !!sym(status_col)) %>%
+    bind_cols(setNames(rep(list(NA), length(add_cols)), add_cols))
+
+  checks <- qc_data %>%
+    dplyr::mutate(error_flag = case_when(
+      (!V1 %in% c('in', 'out')) ~ 'V0 -> V1 hecon hh status error',
+      (!is.na(V1) & !is.na(V2)) & (V1 == 'in'  &  V2 == 'out') ~ 'V1 -> V2 hecon ind status error in -> out is not possible',
+      (!is.na(V1) & !is.na(V2)) & (V1 == 'out' &  (V2 %in% c('in', 'eos'))) ~ 'V1 -> V2 hecon ind status error out -> in or eos is not possible',
+      (!is.na(V2) & !is.na(V3)) & (V2 == 'in'  &  V3 == 'out') ~ 'V2 -> V3 hecon ind status error in -> out is not possible',
+      (!is.na(V2) & !is.na(V3)) & (V2 == 'out' &  (V3 %in% c('in', 'eos'))) ~ 'V2 -> V3 hecon ind status error out -> in or eos is not possible',
+      (!is.na(V3) & !is.na(V4)) & (V3 == 'in'  & V4 == 'out') ~ 'V3 -> V4 hecon ind status error in -> out is not possible',
+      (!is.na(V3) & !is.na(V4)) & (V3 == 'out' & (V4 %in% c('in', 'eos'))) ~ 'V3 -> V4 hecon ind status error out -> in or eos is not possible',
+      TRUE ~ 'pass'
+    )) %>%
+    dplyr::filter(error_flag != 'pass')
+
+  return(checks)
+}