Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
173 changes: 153 additions & 20 deletions scripts/metadata/generate_metadata.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ library(rmarkdown)
library(knitr)
library(kableExtra)
library(readr)
source('tests.R')

# to create rmd output
dir.create('rmds', recursive = TRUE)
Expand Down Expand Up @@ -522,7 +523,7 @@ households_sp <- v0demography %>%
filter(!is.na(x))
coordinates(households_sp) <- ~x+y
load('../../data_public/spatial/new_clusters.RData')
# buffer clusters by 20 meters so as to
# buffer clusters by 50 meters so as to
p4s <- "+proj=utm +zone=37 +south +ellps=WGS84 +datum=WGS84 +units=m +no_defs"
crs <- CRS(p4s)
llcrs <- CRS("+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0")
Expand Down Expand Up @@ -563,6 +564,20 @@ households_sp_projected@data$old_cluster_correct <-
ifelse(is.na(households_sp_projected@data$cluster_geo),
households_sp_projected@data$cluster_with_buffer,
households_sp_projected@data$cluster_geo)

# x = households_sp_projected@data[households_sp_projected@data$old_cluster_correct != households_sp_projected@data$cluster_correct,]
# x <- x %>% filter(!is.na(hhid))
# x <- x %>%
# mutate(keep = ifelse(is.na(cluster_geo), cluster_correct, old_cluster_correct))

# households_sp_projected@data$old_cluster_correct <-
# ifelse(is.na(households_sp_projected@data$cluster_geo),
# households_sp_projected@data$cluster_correct,
# households_sp_projected@data$old_cluster_correct)

# Household 88324 is like household 03005, misclassified
households_sp_projected@data$old_cluster_correct[households_sp_projected@data$hhid == '03005'] <- '3'
households_sp_projected@data$old_cluster_correct[households_sp_projected@data$hhid == '88324'] <- '88'
# old_cluster_correct is the right variable for geographic location, not geo_cluster_num
# since geo_cluster_num was calculated using a faulty method which put households which
# were within the buffer of >1 cluster into the wrong cluster at times
Expand Down Expand Up @@ -670,13 +685,14 @@ departures <- bind_rows(safety_departures, safety_deaths, efficacy_deaths)
# events <- bind_rows(arrivals, departures) %>% arrange(todays_date)
# Get the starting roster
starting_roster <- v0demography_repeat_individual %>%
left_join(v0demography %>% dplyr::select(hhid, start_time, KEY), by = c('PARENT_KEY' = 'KEY')) %>%
left_join(v0demography_full %>% dplyr::select(hhid, start_time, KEY), by = c('PARENT_KEY' = 'KEY')) %>%
dplyr::select(hhid, start_time, firstname, lastname, dob, sex, extid) %>%
filter(!is.na(extid)) %>%
arrange(desc(start_time)) %>%
dplyr::distinct(extid, .keep_all = TRUE) %>%
mutate(remove = FALSE) %>%
mutate(index = 1:nrow(.))
mutate(index = 1:nrow(.)) %>%
filter(!is.na(hhid))
# save starting roster for nika
if(FALSE){
nika <- starting_roster %>%
Expand All @@ -701,6 +717,18 @@ dead_in_safety <- starting_roster %>%
starting_roster <- starting_roster %>%
filter(migrated != 1, dead != 1)

# Prior to adding arrivals, remove some arrivals if they occured BEFORE a departure or out-migration
remove_these <- arrivals %>%
dplyr::select(arrival_time = start_time,
extid, hhid) %>%
left_join(
departures %>%
dplyr::select(departure_time = start_time,
extid, hhid)
) %>%
filter(arrival_time < departure_time)
arrivals <- arrivals %>% filter(!extid %in% remove_these$extid)

# Go through each arrival and add information
roster <- bind_rows(
starting_roster,
Expand Down Expand Up @@ -737,7 +765,7 @@ households <- roster %>%
summarise(roster = paste0(roster_name[dead == 0 & migrated == 0], collapse = ', '),
num_members = length(which(dead == 0 & migrated == 0))) %>%
# get cluster
left_join(v0demography %>%
left_join(v0demography_full %>%
dplyr::distinct(hhid, .keep_all = TRUE) %>%
dplyr::select(hhid, cluster))
# get assignments
Expand Down Expand Up @@ -899,7 +927,7 @@ safety_individuals <- individuals
# Remove from safety households which contain 100% refused individuals
# https://trello.com/c/mGoKfh7w/2071-update-metadata-script-safety-to-remove-from-safety-lists-all-individuals-from-households-with-100-refusals
if(TRUE){

removed_households <- c() # placeholder for checking on removed households
remove_visits <- paste0('V', 1:4)
for(i in 1:length(remove_visits)){
this_visit <- remove_visits[i]
Expand Down Expand Up @@ -929,13 +957,20 @@ if(TRUE){
mutate(p_refusals = n_refusals / n_individual_submissions * 100) %>%
arrange(desc(p_refusals))
all_refusals <- dx %>% filter(p_refusals == 100) %>% arrange(hhid)
removed_households <- c(removed_households, all_refusals$hhid)
message('---Going to remove ', nrow(all_refusals))
# write_csv(all_refusals, '~/Desktop/all_refusals.csv')
households <- households %>% filter(!hhid %in% all_refusals$hhid)
individuals <- individuals %>% filter(hhid %in% households$hhid)
}
}

# Keep only those individuals whose cluster is "in study"
in_study_clusters <- add_zero(sort(unique(clusters_projected@data$cluster_nu)), 2)
individuals <- individuals %>%
filter(cluster %in% in_study_clusters)
households <- households %>%
filter(cluster %in% in_study_clusters)

# Write csvs
if(!dir.exists('safety_metadata')){
Expand Down Expand Up @@ -1115,6 +1150,80 @@ if(FALSE){
system(system_text)
setwd(owd)
}


if(FALSE){
# some safety sanity checks
load('rmds/safety_tables.RData')
# any individual who is eos remains eos forever
ever_eos <-
c(safety_repeat_individual %>% filter(safety_status == 'eos') %>% pull(extid),
safetynew_repeat_individual %>% filter(safety_status == 'eos') %>% pull(extid)) %>%
unique
table(individuals$starting_safety_status[individuals$extid %in% ever_eos]) # this should be all eos

# Same number of households in both `individual_data.csv` and `household_data.csv` (JOE)
length(unique(households$hhid))
length(unique(individuals$hhid))

# Only in-cluster households included (JOE)
table(households$cluster %in% in_study_clusters)
table(individuals$cluster %in% in_study_clusters)

# Anyone who migrated out is EOS
migrated_ids <- safety_repeat_individual %>% filter(person_absent_reason == 'Migrated') %>% pull(extid)
table(migrated_ids %in% individuals$extid)
still_there <- individuals %>% filter(extid %in% migrated_ids) # they're all eos, all good

# Anyone who died is EOS
died_ids <- safety_repeat_individual %>% filter(person_absent_reason == 'Died') %>% pull(extid)
table(died_ids %in% individuals$extid) # should be all false
still_there <- individuals %>% filter(extid %in% died_ids) # none

# Anyone with a non-EOS individual status at end of visit 2 is not EOS in `individual_data.csv` (JOE)
not_eos_v2 <- safety_repeat_individual %>%
left_join(safety, by = c('PARENT_KEY' = 'KEY')) %>%
filter(visit == 'V2') %>%
filter(safety_status != 'eos') %>%
pull(extid)
now_status <- individuals %>% filter(extid %in% not_eos_v2)
table(now_status$starting_safety_status) # should be no eos
table(not_eos_v2 %in% individuals$extid) # they should all be there EXCEPT those which are manually removed or all refusals
not_there <- not_eos_v2[!not_eos_v2 %in% individuals$extid] %>% unique()
table(safety_repeat_individual %>% filter(extid %in% not_there) %>% pull(safety_status))
# See if they are in removed households
table(substr(not_there, 1, 5) %in% removed_households) # only some
# See if they are in departures
table(not_there %in% departures$extid) # most of them
table(not_there %in% departures$extid | substr(not_there, 1, 5) %in% removed_households)

# Visit Status checks
safety_test_tbl <- dplyr::bind_rows(
safety %>%
dplyr::inner_join(safety_repeat_individual, by = c('KEY' = 'PARENT_KEY')) %>%
dplyr::filter(extid %in% (individuals %>% dplyr::filter(starting_pregnancy_status != 'in') %>% .$extid)),
safetynew %>%
dplyr::inner_join(safetynew_repeat_individual, by = c('KEY' = 'PARENT_KEY')) %>%
dplyr::filter(extid %in% (individuals %>% dplyr::filter(starting_pregnancy_status != 'in') %>% .$extid))
)

# Test changes on safety visit status out -> eos violation
safety_test_tbl %>%
dplyr::select(visit, safety_status, extid) %>%
tidyr::pivot_wider(names_from = visit, values_from = safety_status) %>%
dplyr::filter(V1 == 'out', V2 == 'eos') %>%
dplyr::select(extid)

# Test changes on safety visit status in -> out violation
safety_test_tbl %>%
dplyr::select(visit, safety_status, extid) %>%
tidyr::pivot_wider(names_from = visit, values_from = safety_status) %>%
dplyr::filter(V1 == 'in', V2 == 'out') %>%
dplyr::select(extid)


# that covers them
}
# </Safety> ##############################################################################
##############################################################################
##############################################################################
Expand Down Expand Up @@ -1167,7 +1276,7 @@ if(FALSE){
for(i in 1:nrow(hei)){
this_extid <- hei$extid[i]
message(i, ' of ', nrow(hei))
v1 <-
v1 <-
bind_rows(
healtheconbaseline_repeat_individual %>% filter(extid == this_extid) %>% dplyr::select(extid, hecon_individual_status) %>% mutate(type = 'old'),
healtheconnew_repeat_individual %>% filter(extid == this_extid) %>% dplyr::select(extid, hecon_individual_status)
Expand All @@ -1190,7 +1299,7 @@ if(FALSE){
if(length(status_after_v2_per_v2_form) == 0){status_after_v2_per_v2_form <- NA}
why_absent_v2 <- v2_absent %>% pull(person_absent_reason)
if(length(why_absent_v2) == 0){why_absent_v2 <- NA}
out <-
out <-
tibble(extid = this_extid,
any_v1,
v1_new,
Expand All @@ -1216,16 +1325,28 @@ if(FALSE){
starting_hecon_statuses <-
bind_rows(
healtheconnew_repeat_individual %>% dplyr::select(extid, PARENT_KEY, hecon_individual_status) %>%
dplyr::mutate(visit = 'V1') %>%
left_join(healtheconnew %>% dplyr::select(KEY, start_time) %>% mutate(start_time = as.POSIXct(start_time)), by = c('PARENT_KEY' = 'KEY')),
healtheconbaseline_repeat_individual %>% dplyr::select(extid, PARENT_KEY, hecon_individual_status) %>%
left_join(healtheconbaseline %>% dplyr::select(KEY, start_time) %>% mutate(start_time = as.POSIXct(start_time)), by = c('PARENT_KEY' = 'KEY')),
dplyr::mutate(visit = 'V1') %>%
left_join(healtheconbaseline %>% dplyr::select(KEY, start_time) %>% mutate(start_time = as.POSIXct(start_time)), by = c('PARENT_KEY' = 'KEY')),
healtheconmonthly_repeat_individual %>% dplyr::select(extid, PARENT_KEY, hecon_individual_status) %>%
left_join(healtheconmonthly %>% dplyr::select(KEY, start_time) %>% mutate(start_time = as.POSIXct(start_time)), by = c('PARENT_KEY' = 'KEY'))
left_join(healtheconmonthly %>% dplyr::select(KEY, visit, start_time) %>% mutate(start_time = as.POSIXct(start_time)), by = c('PARENT_KEY' = 'KEY'))
) %>%
filter(!is.na(hecon_individual_status)) %>%
arrange(desc(start_time)) %>%
dplyr::distinct(extid, .keep_all = TRUE) %>%
dplyr::select(extid, starting_hecon_status = hecon_individual_status)
dplyr::select(start_time, extid, visit, starting_hecon_status = hecon_individual_status)

# QC on healthecon individual statuses
log_checks <- check_hecon_ind_status_diff(starting_hecon_statuses, status_col = 'starting_hecon_status')
if(nrow(log_checks) == 0){
starting_hecon_statuses <- starting_hecon_statuses %>%
dplyr::arrange(desc(start_time)) %>%
dplyr::distinct(extid, .keep_all = TRUE) %>%
dplyr::select(-visit, -start_time)
}else{
stop('Please check log_checks to check status change errors')
}

# In the case of someone not being in the above dataset but being preselected for health economics
# the status should be "out"
starting_roster <- left_join(starting_roster, starting_hecon_statuses) %>%
Expand Down Expand Up @@ -1373,13 +1494,25 @@ visits_done <- healtheconbaseline %>%
summarise(visits_done = paste0(sort(unique(visit)), collapse = ', '))
# Get baseline + monthly in/out status
right <- healtheconbaseline %>%
mutate(start_time = as.POSIXct(start_time)) %>%
dplyr::select(start_time, hhid, hecon_hh_status = hecon_household_status) %>%
mutate(start_time = as.POSIXct(start_time),
visit = 'V1') %>%
dplyr::select(visit, start_time, hhid, hecon_hh_status = hecon_household_status) %>%
bind_rows(healtheconmonthly %>%
mutate(start_time = as.POSIXct(start_time)) %>%
dplyr::select(start_time, hhid, hecon_hh_status = hecon_household_status)) %>%
arrange(desc(start_time)) %>%
dplyr::distinct(hhid, .keep_all = TRUE)
dplyr::select(visit, start_time, hhid, hecon_hh_status = hecon_household_status))

# QC on HH STATUSES
log_checks <- check_hecon_hh_status_diff(right, status_col = 'hecon_hh_status')
if(nrow(log_checks) == 0){
right <- right %>%
dplyr::arrange(desc(start_time)) %>%
dplyr::distinct(hhid, .keep_all = TRUE) %>%
dplyr::select(-visit, -start_time)
}else{
stop('Please check log_checks to check status change errors')
}


visits_done <- left_join(visits_done, right)
households <- left_join(households, visits_done)
# If NA hecon_hh_status, this means that the household did not have any visit, and
Expand Down Expand Up @@ -1729,7 +1862,7 @@ pfu_in <-
dplyr::select(extid, pregnancy_status) %>%
filter(pregnancy_status == 'in')

# Get the starting roster
# Get the starting roster
starting_roster <- v0demography_full_repeat_individual %>%
dplyr::select(PARENT_KEY, firstname, lastname, dob, sex, extid) %>%
left_join(v0demography_full %>% dplyr::select(hhid, start_time, KEY), by = c('PARENT_KEY' = 'KEY')) %>%
Expand Down Expand Up @@ -1969,15 +2102,15 @@ if(FALSE){
dir.create('rmds/pk_visit_control_sheets')
}
load('rmds/pk_tables.RData')

vcs_list <- sort(unique(individuals$cluster))
for(a in 1:length(vcs_list)){
this_vcs <- vcs_list[a]
message(a, ' of ', length(vcs_list), ' WD: ', getwd())
rmarkdown::render('rmds/pk_visit_control_sheet.Rmd', params = list('vcs' = this_vcs),
output_file = paste0( getwd(), '/pk_visit_control_sheets/', this_vcs, '.pdf'))
}

# Now stitch them all together
owd <- getwd()
setwd('rmds/pk_visit_control_sheets/')
Expand Down
63 changes: 63 additions & 0 deletions scripts/metadata/tests.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# This script is used as a utility script to do spot checks for metadata in healthecon


#' checks healthecon individual status diff based on visits
#' @return tibble of errors in status changes
check_hecon_ind_status_diff <- function(individuals, status_col){
### Checks Variables
possible_visits_list <- c('V1', 'V2', 'V3', 'V4', 'V5')
unique_visit <- unique(individuals$visit)
add_cols <- setdiff(possible_visits_list, unique_visit)

### Individual Checks
qc_data <- individuals %>%
dplyr::distinct(extid, visit, !!sym(status_col)) %>%
tidyr::pivot_wider(names_from = visit, id_cols = c(extid), values_from = !!sym(status_col)) %>%
bind_cols(setNames(rep(list(NA), length(add_cols)), add_cols))

checks <- qc_data %>%
dplyr::mutate(error_flag = case_when(
(!V1 %in% c('in', 'out')) ~ 'V0 -> V1 hecon hh status error',
(!is.na(V1) & !is.na(V2)) & (V1 == 'in' & V2 == 'out') ~ 'V1 -> V2 hecon ind status error in -> out is not possible',
(!is.na(V1) & !is.na(V2)) & (V1 == 'out' & (V2 %in% c('in', 'eos'))) ~ 'V1 -> V2 hecon ind status error out -> in or eos is not possible',
(!is.na(V2) & !is.na(V3)) & (V2 == 'in' & V3 == 'out') ~ 'V2 -> V3 hecon ind status error in -> out is not possible',
(!is.na(V2) & !is.na(V3)) & (V2 == 'out' & (V3 %in% c('in', 'eos'))) ~ 'V2 -> V3 hecon ind status error out -> in or eos is not possible',
(!is.na(V3) & !is.na(V4)) & (V3 == 'in' & V4 == 'out') ~ 'V3 -> V4 hecon ind status error in -> out is not possible',
(!is.na(V3) & !is.na(V4)) & (V3 == 'out' & (V4 %in% c('in', 'eos'))) ~ 'V3 -> V4 hecon ind status error out -> in or eos is not possible',
TRUE ~ 'pass'
)) %>%
dplyr::filter(error_flag != 'pass')

return(checks)

}

#' checks healthecon household status diff based on visits
#' @return tibble of errors in status changes
check_hecon_hh_status_diff <- function(households, status_col) {
### Checks Variables
possible_visits_list <- c('V1', 'V2', 'V3', 'V4', 'V5')
unique_visit <- unique(households$visit)
add_cols <- setdiff(possible_visits_list, unique_visit)

### Individual Checks
qc_data <- households %>%
dplyr::distinct(hhid, visit, !!sym(status_col)) %>%
tidyr::pivot_wider(names_from = visit, id_cols = c(hhid), values_from = !!sym(status_col)) %>%
bind_cols(setNames(rep(list(NA), length(add_cols)), add_cols))

checks <- qc_data %>%
dplyr::mutate(error_flag = case_when(
(!V1 %in% c('in', 'out')) ~ 'V0 -> V1 hecon hh status error',
(!is.na(V1) & !is.na(V2)) & (V1 == 'in' & V2 == 'out') ~ 'V1 -> V2 hecon ind status error in -> out is not possible',
(!is.na(V1) & !is.na(V2)) & (V1 == 'out' & (V2 %in% c('in', 'eos'))) ~ 'V1 -> V2 hecon ind status error out -> in or eos is not possible',
(!is.na(V2) & !is.na(V3)) & (V2 == 'in' & V3 == 'out') ~ 'V2 -> V3 hecon ind status error in -> out is not possible',
(!is.na(V2) & !is.na(V3)) & (V2 == 'out' & (V3 %in% c('in', 'eos'))) ~ 'V2 -> V3 hecon ind status error out -> in or eos is not possible',
(!is.na(V3) & !is.na(V4)) & (V3 == 'in' & V4 == 'out') ~ 'V3 -> V4 hecon ind status error in -> out is not possible',
(!is.na(V3) & !is.na(V4)) & (V3 == 'out' & (V4 %in% c('in', 'eos'))) ~ 'V3 -> V4 hecon ind status error out -> in or eos is not possible',
TRUE ~ 'pass'
)) %>%
dplyr::filter(error_flag != 'pass')

return(checks)
}