diff --git a/scripts/metadata/generate_metadata.R b/scripts/metadata/generate_metadata.R index 04491ac..8b39ee3 100644 --- a/scripts/metadata/generate_metadata.R +++ b/scripts/metadata/generate_metadata.R @@ -27,6 +27,7 @@ library(rmarkdown) library(knitr) library(kableExtra) library(readr) +source('tests.R') # to create rmd output dir.create('rmds', recursive = TRUE) @@ -522,7 +523,7 @@ households_sp <- v0demography %>% filter(!is.na(x)) coordinates(households_sp) <- ~x+y load('../../data_public/spatial/new_clusters.RData') -# buffer clusters by 20 meters so as to +# buffer clusters by 50 meters so as to p4s <- "+proj=utm +zone=37 +south +ellps=WGS84 +datum=WGS84 +units=m +no_defs" crs <- CRS(p4s) llcrs <- CRS("+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0") @@ -563,6 +564,20 @@ households_sp_projected@data$old_cluster_correct <- ifelse(is.na(households_sp_projected@data$cluster_geo), households_sp_projected@data$cluster_with_buffer, households_sp_projected@data$cluster_geo) + +# x = households_sp_projected@data[households_sp_projected@data$old_cluster_correct != households_sp_projected@data$cluster_correct,] +# x <- x %>% filter(!is.na(hhid)) +# x <- x %>% +# mutate(keep = ifelse(is.na(cluster_geo), cluster_correct, old_cluster_correct)) + +# households_sp_projected@data$old_cluster_correct <- +# ifelse(is.na(households_sp_projected@data$cluster_geo), +# households_sp_projected@data$cluster_correct, +# households_sp_projected@data$old_cluster_correct) + +# Household 88324 is like household 03005, misclassified +households_sp_projected@data$old_cluster_correct[households_sp_projected@data$hhid == '03005'] <- '3' +households_sp_projected@data$old_cluster_correct[households_sp_projected@data$hhid == '88324'] <- '88' # old_cluster_correct is the right variable for geographic location, not geo_cluster_num # since geo_cluster_num was calculated using a faulty method which put households which # were within the buffer of >1 cluster into the wrong cluster at times @@ -670,13 +685,14 @@ departures <- bind_rows(safety_departures, safety_deaths, efficacy_deaths) # events <- bind_rows(arrivals, departures) %>% arrange(todays_date) # Get the starting roster starting_roster <- v0demography_repeat_individual %>% - left_join(v0demography %>% dplyr::select(hhid, start_time, KEY), by = c('PARENT_KEY' = 'KEY')) %>% + left_join(v0demography_full %>% dplyr::select(hhid, start_time, KEY), by = c('PARENT_KEY' = 'KEY')) %>% dplyr::select(hhid, start_time, firstname, lastname, dob, sex, extid) %>% filter(!is.na(extid)) %>% arrange(desc(start_time)) %>% dplyr::distinct(extid, .keep_all = TRUE) %>% mutate(remove = FALSE) %>% - mutate(index = 1:nrow(.)) + mutate(index = 1:nrow(.)) %>% + filter(!is.na(hhid)) # save starting roster for nika if(FALSE){ nika <- starting_roster %>% @@ -701,6 +717,18 @@ dead_in_safety <- starting_roster %>% starting_roster <- starting_roster %>% filter(migrated != 1, dead != 1) +# Prior to adding arrivals, remove some arrivals if they occured BEFORE a departure or out-migration +remove_these <- arrivals %>% + dplyr::select(arrival_time = start_time, + extid, hhid) %>% + left_join( + departures %>% + dplyr::select(departure_time = start_time, + extid, hhid) + ) %>% + filter(arrival_time < departure_time) +arrivals <- arrivals %>% filter(!extid %in% remove_these$extid) + # Go through each arrival and add information roster <- bind_rows( starting_roster, @@ -737,7 +765,7 @@ households <- roster %>% summarise(roster = paste0(roster_name[dead == 0 & migrated == 0], collapse = ', '), num_members = length(which(dead == 0 & migrated == 0))) %>% # get cluster - left_join(v0demography %>% + left_join(v0demography_full %>% dplyr::distinct(hhid, .keep_all = TRUE) %>% dplyr::select(hhid, cluster)) # get assignments @@ -899,7 +927,7 @@ safety_individuals <- individuals # Remove from safety households which contain 100% refused individuals # https://trello.com/c/mGoKfh7w/2071-update-metadata-script-safety-to-remove-from-safety-lists-all-individuals-from-households-with-100-refusals if(TRUE){ - + removed_households <- c() # placeholder for checking on removed households remove_visits <- paste0('V', 1:4) for(i in 1:length(remove_visits)){ this_visit <- remove_visits[i] @@ -929,6 +957,7 @@ if(TRUE){ mutate(p_refusals = n_refusals / n_individual_submissions * 100) %>% arrange(desc(p_refusals)) all_refusals <- dx %>% filter(p_refusals == 100) %>% arrange(hhid) + removed_households <- c(removed_households, all_refusals$hhid) message('---Going to remove ', nrow(all_refusals)) # write_csv(all_refusals, '~/Desktop/all_refusals.csv') households <- households %>% filter(!hhid %in% all_refusals$hhid) @@ -936,6 +965,12 @@ if(TRUE){ } } +# Keep only those individuals whose cluster is "in study" +in_study_clusters <- add_zero(sort(unique(clusters_projected@data$cluster_nu)), 2) +individuals <- individuals %>% + filter(cluster %in% in_study_clusters) +households <- households %>% + filter(cluster %in% in_study_clusters) # Write csvs if(!dir.exists('safety_metadata')){ @@ -1115,6 +1150,80 @@ if(FALSE){ system(system_text) setwd(owd) } + + +if(FALSE){ + # some safety sanity checks + load('rmds/safety_tables.RData') + # any individual who is eos remains eos forever + ever_eos <- + c(safety_repeat_individual %>% filter(safety_status == 'eos') %>% pull(extid), + safetynew_repeat_individual %>% filter(safety_status == 'eos') %>% pull(extid)) %>% + unique + table(individuals$starting_safety_status[individuals$extid %in% ever_eos]) # this should be all eos + + # Same number of households in both `individual_data.csv` and `household_data.csv` (JOE) + length(unique(households$hhid)) + length(unique(individuals$hhid)) + + # Only in-cluster households included (JOE) + table(households$cluster %in% in_study_clusters) + table(individuals$cluster %in% in_study_clusters) + + # Anyone who migrated out is EOS + migrated_ids <- safety_repeat_individual %>% filter(person_absent_reason == 'Migrated') %>% pull(extid) + table(migrated_ids %in% individuals$extid) + still_there <- individuals %>% filter(extid %in% migrated_ids) # they're all eos, all good + + # Anyone who died is EOS + died_ids <- safety_repeat_individual %>% filter(person_absent_reason == 'Died') %>% pull(extid) + table(died_ids %in% individuals$extid) # should be all false + still_there <- individuals %>% filter(extid %in% died_ids) # none + + # Anyone with a non-EOS individual status at end of visit 2 is not EOS in `individual_data.csv` (JOE) + not_eos_v2 <- safety_repeat_individual %>% + left_join(safety, by = c('PARENT_KEY' = 'KEY')) %>% + filter(visit == 'V2') %>% + filter(safety_status != 'eos') %>% + pull(extid) + now_status <- individuals %>% filter(extid %in% not_eos_v2) + table(now_status$starting_safety_status) # should be no eos + table(not_eos_v2 %in% individuals$extid) # they should all be there EXCEPT those which are manually removed or all refusals + not_there <- not_eos_v2[!not_eos_v2 %in% individuals$extid] %>% unique() + table(safety_repeat_individual %>% filter(extid %in% not_there) %>% pull(safety_status)) + # See if they are in removed households + table(substr(not_there, 1, 5) %in% removed_households) # only some + # See if they are in departures + table(not_there %in% departures$extid) # most of them + table(not_there %in% departures$extid | substr(not_there, 1, 5) %in% removed_households) + + # Visit Status checks + safety_test_tbl <- dplyr::bind_rows( + safety %>% + dplyr::inner_join(safety_repeat_individual, by = c('KEY' = 'PARENT_KEY')) %>% + dplyr::filter(extid %in% (individuals %>% dplyr::filter(starting_pregnancy_status != 'in') %>% .$extid)), + safetynew %>% + dplyr::inner_join(safetynew_repeat_individual, by = c('KEY' = 'PARENT_KEY')) %>% + dplyr::filter(extid %in% (individuals %>% dplyr::filter(starting_pregnancy_status != 'in') %>% .$extid)) + ) + + # Test changes on safety visit status out -> eos violation + safety_test_tbl %>% + dplyr::select(visit, safety_status, extid) %>% + tidyr::pivot_wider(names_from = visit, values_from = safety_status) %>% + dplyr::filter(V1 == 'out', V2 == 'eos') %>% + dplyr::select(extid) + + # Test changes on safety visit status in -> out violation + safety_test_tbl %>% + dplyr::select(visit, safety_status, extid) %>% + tidyr::pivot_wider(names_from = visit, values_from = safety_status) %>% + dplyr::filter(V1 == 'in', V2 == 'out') %>% + dplyr::select(extid) + + + # that covers them +} # ############################################################################## ############################################################################## ############################################################################## @@ -1167,7 +1276,7 @@ if(FALSE){ for(i in 1:nrow(hei)){ this_extid <- hei$extid[i] message(i, ' of ', nrow(hei)) - v1 <- + v1 <- bind_rows( healtheconbaseline_repeat_individual %>% filter(extid == this_extid) %>% dplyr::select(extid, hecon_individual_status) %>% mutate(type = 'old'), healtheconnew_repeat_individual %>% filter(extid == this_extid) %>% dplyr::select(extid, hecon_individual_status) @@ -1190,7 +1299,7 @@ if(FALSE){ if(length(status_after_v2_per_v2_form) == 0){status_after_v2_per_v2_form <- NA} why_absent_v2 <- v2_absent %>% pull(person_absent_reason) if(length(why_absent_v2) == 0){why_absent_v2 <- NA} - out <- + out <- tibble(extid = this_extid, any_v1, v1_new, @@ -1216,16 +1325,28 @@ if(FALSE){ starting_hecon_statuses <- bind_rows( healtheconnew_repeat_individual %>% dplyr::select(extid, PARENT_KEY, hecon_individual_status) %>% + dplyr::mutate(visit = 'V1') %>% left_join(healtheconnew %>% dplyr::select(KEY, start_time) %>% mutate(start_time = as.POSIXct(start_time)), by = c('PARENT_KEY' = 'KEY')), healtheconbaseline_repeat_individual %>% dplyr::select(extid, PARENT_KEY, hecon_individual_status) %>% - left_join(healtheconbaseline %>% dplyr::select(KEY, start_time) %>% mutate(start_time = as.POSIXct(start_time)), by = c('PARENT_KEY' = 'KEY')), + dplyr::mutate(visit = 'V1') %>% + left_join(healtheconbaseline %>% dplyr::select(KEY, start_time) %>% mutate(start_time = as.POSIXct(start_time)), by = c('PARENT_KEY' = 'KEY')), healtheconmonthly_repeat_individual %>% dplyr::select(extid, PARENT_KEY, hecon_individual_status) %>% - left_join(healtheconmonthly %>% dplyr::select(KEY, start_time) %>% mutate(start_time = as.POSIXct(start_time)), by = c('PARENT_KEY' = 'KEY')) + left_join(healtheconmonthly %>% dplyr::select(KEY, visit, start_time) %>% mutate(start_time = as.POSIXct(start_time)), by = c('PARENT_KEY' = 'KEY')) ) %>% filter(!is.na(hecon_individual_status)) %>% - arrange(desc(start_time)) %>% - dplyr::distinct(extid, .keep_all = TRUE) %>% - dplyr::select(extid, starting_hecon_status = hecon_individual_status) + dplyr::select(start_time, extid, visit, starting_hecon_status = hecon_individual_status) + +# QC on healthecon individual statuses +log_checks <- check_hecon_ind_status_diff(starting_hecon_statuses, status_col = 'starting_hecon_status') +if(nrow(log_checks) == 0){ + starting_hecon_statuses <- starting_hecon_statuses %>% + dplyr::arrange(desc(start_time)) %>% + dplyr::distinct(extid, .keep_all = TRUE) %>% + dplyr::select(-visit, -start_time) +}else{ + stop('Please check log_checks to check status change errors') +} + # In the case of someone not being in the above dataset but being preselected for health economics # the status should be "out" starting_roster <- left_join(starting_roster, starting_hecon_statuses) %>% @@ -1373,13 +1494,25 @@ visits_done <- healtheconbaseline %>% summarise(visits_done = paste0(sort(unique(visit)), collapse = ', ')) # Get baseline + monthly in/out status right <- healtheconbaseline %>% - mutate(start_time = as.POSIXct(start_time)) %>% - dplyr::select(start_time, hhid, hecon_hh_status = hecon_household_status) %>% + mutate(start_time = as.POSIXct(start_time), + visit = 'V1') %>% + dplyr::select(visit, start_time, hhid, hecon_hh_status = hecon_household_status) %>% bind_rows(healtheconmonthly %>% mutate(start_time = as.POSIXct(start_time)) %>% - dplyr::select(start_time, hhid, hecon_hh_status = hecon_household_status)) %>% - arrange(desc(start_time)) %>% - dplyr::distinct(hhid, .keep_all = TRUE) + dplyr::select(visit, start_time, hhid, hecon_hh_status = hecon_household_status)) + +# QC on HH STATUSES +log_checks <- check_hecon_hh_status_diff(right, status_col = 'hecon_hh_status') +if(nrow(log_checks) == 0){ + right <- right %>% + dplyr::arrange(desc(start_time)) %>% + dplyr::distinct(hhid, .keep_all = TRUE) %>% + dplyr::select(-visit, -start_time) +}else{ + stop('Please check log_checks to check status change errors') +} + + visits_done <- left_join(visits_done, right) households <- left_join(households, visits_done) # If NA hecon_hh_status, this means that the household did not have any visit, and @@ -1729,7 +1862,7 @@ pfu_in <- dplyr::select(extid, pregnancy_status) %>% filter(pregnancy_status == 'in') -# Get the starting roster +# Get the starting roster starting_roster <- v0demography_full_repeat_individual %>% dplyr::select(PARENT_KEY, firstname, lastname, dob, sex, extid) %>% left_join(v0demography_full %>% dplyr::select(hhid, start_time, KEY), by = c('PARENT_KEY' = 'KEY')) %>% @@ -1969,7 +2102,7 @@ if(FALSE){ dir.create('rmds/pk_visit_control_sheets') } load('rmds/pk_tables.RData') - + vcs_list <- sort(unique(individuals$cluster)) for(a in 1:length(vcs_list)){ this_vcs <- vcs_list[a] @@ -1977,7 +2110,7 @@ if(FALSE){ rmarkdown::render('rmds/pk_visit_control_sheet.Rmd', params = list('vcs' = this_vcs), output_file = paste0( getwd(), '/pk_visit_control_sheets/', this_vcs, '.pdf')) } - + # Now stitch them all together owd <- getwd() setwd('rmds/pk_visit_control_sheets/') diff --git a/scripts/metadata/tests.R b/scripts/metadata/tests.R new file mode 100644 index 0000000..631c043 --- /dev/null +++ b/scripts/metadata/tests.R @@ -0,0 +1,63 @@ +# This script is used as a utility script to do spot checks for metadata in healthecon + + +#' checks healthecon individual status diff based on visits +#' @return tibble of errors in status changes +check_hecon_ind_status_diff <- function(individuals, status_col){ + ### Checks Variables + possible_visits_list <- c('V1', 'V2', 'V3', 'V4', 'V5') + unique_visit <- unique(individuals$visit) + add_cols <- setdiff(possible_visits_list, unique_visit) + + ### Individual Checks + qc_data <- individuals %>% + dplyr::distinct(extid, visit, !!sym(status_col)) %>% + tidyr::pivot_wider(names_from = visit, id_cols = c(extid), values_from = !!sym(status_col)) %>% + bind_cols(setNames(rep(list(NA), length(add_cols)), add_cols)) + + checks <- qc_data %>% + dplyr::mutate(error_flag = case_when( + (!V1 %in% c('in', 'out')) ~ 'V0 -> V1 hecon hh status error', + (!is.na(V1) & !is.na(V2)) & (V1 == 'in' & V2 == 'out') ~ 'V1 -> V2 hecon ind status error in -> out is not possible', + (!is.na(V1) & !is.na(V2)) & (V1 == 'out' & (V2 %in% c('in', 'eos'))) ~ 'V1 -> V2 hecon ind status error out -> in or eos is not possible', + (!is.na(V2) & !is.na(V3)) & (V2 == 'in' & V3 == 'out') ~ 'V2 -> V3 hecon ind status error in -> out is not possible', + (!is.na(V2) & !is.na(V3)) & (V2 == 'out' & (V3 %in% c('in', 'eos'))) ~ 'V2 -> V3 hecon ind status error out -> in or eos is not possible', + (!is.na(V3) & !is.na(V4)) & (V3 == 'in' & V4 == 'out') ~ 'V3 -> V4 hecon ind status error in -> out is not possible', + (!is.na(V3) & !is.na(V4)) & (V3 == 'out' & (V4 %in% c('in', 'eos'))) ~ 'V3 -> V4 hecon ind status error out -> in or eos is not possible', + TRUE ~ 'pass' + )) %>% + dplyr::filter(error_flag != 'pass') + + return(checks) + +} + +#' checks healthecon household status diff based on visits +#' @return tibble of errors in status changes +check_hecon_hh_status_diff <- function(households, status_col) { + ### Checks Variables + possible_visits_list <- c('V1', 'V2', 'V3', 'V4', 'V5') + unique_visit <- unique(households$visit) + add_cols <- setdiff(possible_visits_list, unique_visit) + + ### Individual Checks + qc_data <- households %>% + dplyr::distinct(hhid, visit, !!sym(status_col)) %>% + tidyr::pivot_wider(names_from = visit, id_cols = c(hhid), values_from = !!sym(status_col)) %>% + bind_cols(setNames(rep(list(NA), length(add_cols)), add_cols)) + + checks <- qc_data %>% + dplyr::mutate(error_flag = case_when( + (!V1 %in% c('in', 'out')) ~ 'V0 -> V1 hecon hh status error', + (!is.na(V1) & !is.na(V2)) & (V1 == 'in' & V2 == 'out') ~ 'V1 -> V2 hecon ind status error in -> out is not possible', + (!is.na(V1) & !is.na(V2)) & (V1 == 'out' & (V2 %in% c('in', 'eos'))) ~ 'V1 -> V2 hecon ind status error out -> in or eos is not possible', + (!is.na(V2) & !is.na(V3)) & (V2 == 'in' & V3 == 'out') ~ 'V2 -> V3 hecon ind status error in -> out is not possible', + (!is.na(V2) & !is.na(V3)) & (V2 == 'out' & (V3 %in% c('in', 'eos'))) ~ 'V2 -> V3 hecon ind status error out -> in or eos is not possible', + (!is.na(V3) & !is.na(V4)) & (V3 == 'in' & V4 == 'out') ~ 'V3 -> V4 hecon ind status error in -> out is not possible', + (!is.na(V3) & !is.na(V4)) & (V3 == 'out' & (V4 %in% c('in', 'eos'))) ~ 'V3 -> V4 hecon ind status error out -> in or eos is not possible', + TRUE ~ 'pass' + )) %>% + dplyr::filter(error_flag != 'pass') + + return(checks) +}