diff --git a/.Rbuildignore b/.Rbuildignore index 7366069..0f9f016 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -20,3 +20,5 @@ ^docs$ ^CRAN-SUBMISSION$ ^logo\.svg$ +^doc$ +^Meta$ diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 7d94d3a..42e8f80 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -22,7 +22,7 @@ jobs: runs-on: windows-latest strategy: matrix: - r-version: ['4.3.0'] + r-version: ['4.3.3'] # bump version 4.3.0 to 4.3.3 resolve non-conformal array error steps: - uses: actions/checkout@v3 @@ -30,6 +30,7 @@ jobs: uses: r-lib/actions/setup-r@v2 with: r-version: ${{ matrix.r-version }} + use-public-rspm: true - name: Install dependencies uses: r-lib/actions/setup-r-dependencies@v2 with: diff --git a/.gitignore b/.gitignore index b38ec7d..7a058d7 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,5 @@ inst/doc test_pkg rstan_code revdep +/doc/ +/Meta/ diff --git a/DESCRIPTION b/DESCRIPTION index 331cfba..d3f3bb5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -17,11 +17,12 @@ Encoding: UTF-8 LazyData: true Depends: R (>= 3.4.0) -RoxygenNote: 7.3.1 +RoxygenNote: 7.3.3 Imports: deSolve, dplyr, tidyr, + janitor, ggplot2, locfit, purrr, @@ -30,6 +31,8 @@ Imports: methods, mgcv, mixdist, + scam, + mvtnorm, patchwork, assertthat, Rcpp (>= 0.12.0), @@ -60,6 +63,7 @@ Collate: 'semiparametric_models.R' 'mixture_model.R' 'hierarchical_bayesian_model.R' + 'to_titer.R' 'serosv.R' 'stanmodels.R' 'plots.R' diff --git a/NAMESPACE b/NAMESPACE index 6d3330b..80b1540 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -14,11 +14,13 @@ S3method(plot,sir_basic_model) S3method(plot,sir_static_model) S3method(plot,sir_subpops_model) S3method(plot,weibull_model) +export(add_thresholds) export(age_time_model) export(compare_models) export(compute_ci) export(compute_ci.age_time_model) export(compute_ci.fp_model) +export(compute_ci.hierarchical_bayesian_model) export(compute_ci.lp_model) export(compute_ci.mixture_model) 
export(compute_ci.penalized_spline_model) @@ -35,12 +37,17 @@ export(mixture_model) export(mseir_model) export(pava) export(penalized_spline_model) +export(plot_corrected_prev) export(plot_gcv) +export(plot_standard_curve) +export(plot_titer_qc) export(polynomial_model) export(set_plot_style) export(sir_basic_model) export(sir_static_model) export(sir_subpops_model) +export(standardize_data) +export(to_titer) export(transform_data) export(weibull_model) import(Rcpp) @@ -48,20 +55,22 @@ import(assertthat) import(dplyr) import(ggplot2) import(graphics) -import(magrittr) import(methods) -import(mgcv) import(patchwork) +import(purrr) +import(scam) import(tidyr) importFrom(RcppParallel,RcppParallelLibs) importFrom(assertthat,assert_that) importFrom(boot,inv.logit) importFrom(deSolve,ode) importFrom(dplyr,group_by) +importFrom(dplyr,mutate) importFrom(dplyr,n) importFrom(dplyr,summarize) importFrom(graphics,par) importFrom(graphics,plot) +importFrom(janitor,clean_names) importFrom(locfit,"crit<-") importFrom(locfit,crit) importFrom(locfit,gcv) @@ -70,12 +79,14 @@ importFrom(locfit,locfit) importFrom(locfit,lp) importFrom(magrittr,"%>%") importFrom(methods,is) +importFrom(mgcv,betar) importFrom(mgcv,gam) importFrom(mgcv,gamm) importFrom(mgcv,predict.gam) importFrom(mixdist,mix) importFrom(mixdist,mixgroup) importFrom(mixdist,mixparam) +importFrom(mvtnorm,rmvnorm) importFrom(purrr,imap_dfr) importFrom(rstan,sampling) importFrom(rstan,summary) @@ -89,6 +100,7 @@ importFrom(stats,gaussian) importFrom(stats,glm) importFrom(stats,predict) importFrom(stats,predict.glm) +importFrom(stats,prop.test) importFrom(stats,qnorm) importFrom(stats,qt) importFrom(stats4,mle) diff --git a/R/age_time_model.R b/R/age_time_model.R index c8d7280..9e75d98 100644 --- a/R/age_time_model.R +++ b/R/age_time_model.R @@ -1,6 +1,7 @@ -# ------ Demo function -------- #' Age-time varying seroprevalence -#' Fit age-stratified seroprevalence across multiple time points. 
Also try to monotonize age (or cohort) - specific seroprevalence. +#' +#' +#' @description Fit age-stratified seroprevalence across multiple time points. Also try to monotonize age (or birth cohort) - specific seroprevalence. #' #' @param data - input data, must have`age`, `status`, time, group columns, where group column determines how data is aggregated #' @param time_col - name of the column for time (default to `date`) @@ -8,18 +9,28 @@ #' @param age_correct - a boolean, if `TRUE`, monotonize age-specific prevalence. Monotonize birth cohort-specific seroprevalence otherwise. #' @param le - number of bins to generate age grid, used when monotonizing data #' @param ci - confidence interval for smoothing -#' @import mgcv +#' @param monotonize_method - either "pava" or "scam" +#' @import scam assertthat +#' @importFrom mgcv gam predict.gam betar #' -#' @return a list of class time_age_model with 3 items +#' @return a list of class time_age_model with 4 items #' \item{out}{a data.frame with dimension n_group x 9, where columns `info`, `sp`, `foi` store output for non-monotonized -#' data and `monotonized_info`, `monotonized_sp`, `monotonized_foi`, `monotonized_ci_mod` store output for monotnized data} +#' data and `monotonized_info`, `monotonized_sp`, `monotonized_foi`, `monotonized_ci_mod` store output for monotonized data} #' \item{grouping_col}{name of the column for grouping} #' \item{age_correct}{a boolean indicating whether the data is monotonized across age or cohort} +#' \item{datatype}{whether the input data is aggregated or line-listing data} #' @export -age_time_model <- function(data, time_col="date", grouping_col="group", - age_correct=F, le=512, ci = 0.95){ +age_time_model <- function(data, time_col="date", grouping_col="group", age_correct=F, le=512, ci = 0.95, monotonize_method = "pava"){ + # work around to resolve no visible binding note NOTE during check() + x <- label <- family <- fit <- se.fit <- ymin <- ymax <- y <- mean_time <- prevalence <- 
sim_data <- NULL + age <- ys <- shift_no <- cohort <- col_time <- monotonized_mod <- df <- info <- sp <- monotonized_info <- monotonized_sp <- NULL + assert_that( + (monotonize_method == "pava") | (monotonize_method == "scam"), + msg = 'Monotonize method must be either "pava" or "scam"' + ) + # ---- helper functions ----- shift_right <- \(n,x){ if(n == 1) x else dplyr::lag(x, n, default = NA)} # function to simulate data for monotonize process @@ -46,8 +57,23 @@ age_time_model <- function(data, time_col="date", grouping_col="group", } # function to monotonize data using serosv pava function monotonize_data <- \(dat, grp){ + dat <- dat %>% arrange(mean_time) + if(monotonize_method == "scam"){ + out <- tryCatch( + { + mod <- scam(prevalence ~ s(mean_time, bs = "mpi"), data=dat,family = betar) + dat$prevalence <- predict(mod, list(mean_time = dat$mean_time), type = "response") + dat + }, + error = \(e) { + e + } + ) + + if (!("error" %in% class(out))) return(out) + } + dat %>% - arrange(mean_time) %>% mutate( prevalence = serosv::pava(prevalence)$pai2 ) @@ -56,7 +82,7 @@ age_time_model <- function(data, time_col="date", grouping_col="group", model <- list() # --- preprocess data ------ - check_input <- serosv:::check_input(data) + check_input <- check_input(data) age_range <- range(data$age) age_grid <- seq(age_range[1], age_range[2], length.out = le) @@ -137,7 +163,7 @@ age_time_model <- function(data, time_col="date", grouping_col="group", # mapping to covert cohort to age cohort_age_mapping <- scam_data %>% - select(col_time, age, cohort) %>% + select(!!sym(grouping_col), age, cohort) %>% unique() # map cohort from monotized data to age (at collection time) @@ -157,13 +183,19 @@ age_time_model <- function(data, time_col="date", grouping_col="group", out <- scam_out %>% mutate( monotonized_mod = map(data, \(dat){ - gam(y ~ s(age), family = betar, data = dat) + # handle potential error when dataset is small + k <- if(length(unique(dat$age)) < 10) 
length(unique(dat$age)) - 1 else -1 + + gam(y ~ s(age, k=k), family = betar, data = dat) }), # also have model for smooth ci monotonized_ci_mod = map(data, \(dat){ + # handle potential error when dataset is small + k <- if(length(unique(dat$age)) < 10) length(unique(dat$age)) - 1 else -1 + list( - "ymin" = gam(ymin ~ s(age), family = betar, data = dat), - "ymax" = gam(ymax ~ s(age), family = betar, data = dat) + "ymin" = gam(ymin ~ s(age, k=k), family = gaussian, data = dat), + "ymax" = gam(ymax ~ s(age, k=k), family = gaussian, data = dat) ) }) ) %>% @@ -206,3 +238,4 @@ age_time_model <- function(data, time_col="date", grouping_col="group", model } + diff --git a/R/compare_models.R b/R/compare_models.R index ea9ccdb..681b521 100644 --- a/R/compare_models.R +++ b/R/compare_models.R @@ -23,6 +23,8 @@ compare_models <- function(...){ stop("Inputs must be serosv models") } + # TODO: apply cross validation to get matrix instead + data.frame( model = .y, type = class(.x), diff --git a/R/compute_ci.R b/R/compute_ci.R index f92d6a4..54228e5 100644 --- a/R/compute_ci.R +++ b/R/compute_ci.R @@ -123,6 +123,7 @@ compute_ci.lp_model <- function(x,ci = 0.95, ...){ out.DF } + #' Compute confidence interval for penalized_spline_model #' #' @param x - serosv models @@ -178,6 +179,58 @@ compute_ci.penalized_spline_model <- function(x,ci = 0.95, ...){ return(list(out.DF, out.FOI)) } +#' Compute 95\% credible interval for hierarchical Bayesian model +#' +#' @param x - serosv models +#' @param ... - arbitrary arguments +#' @importFrom mgcv predict.gam +#' @import dplyr +#' +#' @return list of confidence interval for seroprevalence and foi. 
Each confidence interval dataframe with 4 variables, x and y for the fitted values and ymin and ymax for the confidence interval +#' @export +compute_ci.hierarchical_bayesian_model <- function(x, ...){ + out_x <- x$df$age + out.DF <- NULL + + if (x$type == "far3"){ + alpha1 <- x$info["alpha1",c("2.5%","50%", "97.5%")] + alpha2 <- x$info["alpha2",c("2.5%","50%", "97.5%")] + alpha3 <- x$info["alpha3",c("2.5%","50%", "97.5%")] + + out.DF <- data.frame( + x = out_x, + ymin = x$sp_func(out_x, alpha1[1], alpha2[1], alpha3[1]), + y = x$sp_func(out_x, alpha1[2], alpha2[2], alpha3[2]), + ymax = x$sp_func(out_x, alpha1[3], alpha2[3], alpha3[3]) + ) + }else if(x$type == "far2"){ + alpha1 <- x$info["alpha1",c("2.5%","50%", "97.5%")] + alpha2 <- x$info["alpha2",c("2.5%","50%", "97.5%")] + + out.DF <- data.frame( + x = out_x, + ymin = x$sp_func(out_x, alpha1[1], alpha2[1]), + y = x$sp_func(out_x, alpha1[2], alpha2[2]), + ymax = x$sp_func(out_x, alpha1[3], alpha2[3]) + ) + + }else if(x$type == "log_logistic"){ + alpha1 <- x$info["alpha1",c("2.5%","50%", "97.5%")] + alpha2 <- x$info["alpha2",c("2.5%","50%", "97.5%")] + + out.DF <- data.frame( + x = out_x, + ymin = x$sp_func(out_x, alpha1[1], alpha2[1]), + y = x$sp_func(out_x, alpha1[2], alpha2[2]), + ymax = x$sp_func(out_x, alpha1[3], alpha2[3]) + ) + }else{ + warning('Expect model type to be one of the following: "far3", "far2", "log_logistic"') + } + + out.DF +} + #' Compute confidence interval for mixture model #' #' @param x - serosv mixture_model object @@ -216,6 +269,9 @@ compute_ci.mixture_model <- function(x,ci = 0.95, ...){ #' @return confidence interval dataframe with n_group x 3 cols, the columns are `group`, `sp_df`, `foi_df` #' @export compute_ci.age_time_model <- function(x, ci=0.95, le = 100, ...){ + # resolve no visible binding note + df <- monotonized_info <- monotonized_ci_mod <- age <- info <- fit <- se.fit <- sp_df <- foi_df <- NULL + # check which type of model user wants to visualize modtype <- if 
(is.null(list(...)[["modtype"]])) "monotonized" else list(...)$modtype assert_that( diff --git a/R/correct_prevalence.R b/R/correct_prevalence.R index dfe7255..2bc626d 100644 --- a/R/correct_prevalence.R +++ b/R/correct_prevalence.R @@ -1,4 +1,4 @@ -#' Estimate the true sero prevalence using Bayesian estimation +#' Estimate the true sero prevalence using Frequentist/Bayesian estimation #' #' @param data the input data frame, must either have `age`, `pos`, `tot` columns (for aggregated data) OR `age`, `status` for (linelisting data) #' @param bayesian whether to adjust sero-prevalence using the Bayesian or frequentist approach. If set to `TRUE`, true sero-prevalence is estimated using MCMC. @@ -11,11 +11,14 @@ #' @param iter (applicable when `bayesian=TRUE`) number of iterations #' #' @importFrom rstan sampling summary +#' @importFrom dplyr mutate +#' @importFrom stats prop.test +#' @importFrom magrittr %>% #' -#' @return -#' a list of 2 items -#' \item{info}{estimated parameters} -#' \item{corrected_sero}{data.frame containing age, the corresponding estimated seroprevalance, adjusted tot and pos} +#' @return a list of 3 items +#' \item{info}{estimated parameters (when `bayesian = TRUE`) or formula to compute corrected prevalence (when `bayesian = FALSE`)} +#' \item{df}{data.frame of input data (in aggregated form)} +#' \item{corrected_sero}{data.frame containing age, the corresponding estimated seroprevalence with 95\% confidence/credible interval, and adjusted tot and pos} #' @export #' #' @examples @@ -24,6 +27,9 @@ correct_prevalence <- function(data, bayesian=TRUE, init_se = 0.95, init_sp = 0.8, study_size_se = 1000, study_size_sp = 1000, chains = 1, warmup = 1000, iter = 2000){ + # resolve no visible binding note + lower <- upper <- prop.test <- NULL + output <- list() data <- check_input(data) @@ -55,23 +61,46 @@ correct_prevalence <- function(data, bayesian=TRUE, output$info <- summary(fit)$summary - # subsetting theta estimates (use mean value as corrected 
seroprevalence) - thetas <- summary(fit)$summary[3:(length(age) + 2), "mean"] + # subsetting theta estimates, also get the credible interval + thetas <- data.frame( + lower = summary(fit)$summary[3:(length(age) + 2), "2.5%"], + fit = summary(fit)$summary[3:(length(age) + 2), "50%"], + upper = summary(fit)$summary[3:(length(age) + 2), "97.5%"] + ) }else{ - # simply compute thetas using the formula - thetas <- (pos/tot + init_sp - 1) / (init_se + init_sp - 1) - thetas <- pmin(thetas, 1) # keep upper bound as 1 - thetas <- pmax(thetas, 0) # keep lower bound as 0 + thetas <- data.frame( + # use prop.test to get confidence interval + lower = mapply(\(pos, tot){prop.test(pos, tot, conf.level = 0.95)$conf.int[1]}, pos, tot), + fit = pos/tot, + upper = mapply(\(pos, tot){prop.test(pos, tot, conf.level = 0.95)$conf.int[2]}, pos, tot) + ) %>% + mutate( + # estimate true prevalence and lower, upper bound + lower = pmax(0, (lower - 1 + init_sp)/(init_se + init_sp - 1)), + fit = pmax(0, (fit + init_sp - 1) / (init_se + init_sp - 1)) %>% pmin(1), + upper = pmin(1, (upper - 1 + init_sp)/(init_se + init_sp - 1)) + ) + # info for frequentist approach - output$info <- "Formula: real_sero = (apparent_sero + sp - 1) / (se + sp -1)" + output$info <- "Formula: real_sero = (observed_sero + sp - 1) / (se + sp -1)" } - output$corrected_se <- data.frame( + output$df <- data.frame( age = age, - sero = thetas, - pos = thetas*tot, #adjusted pos with estimated sero + pos = pos, tot = tot ) + output$corrected_se <- data.frame( + age = age, + sero = thetas$fit, + pos = thetas$fit*tot, #adjusted pos with estimated sero + tot = tot, + sero_lwr = thetas$lower, + sero_upr = thetas$upper + ) + + output$method <- if(bayesian) "bayesian" else "frequentist" + output } diff --git a/R/hierarchical_bayesian_model.R b/R/hierarchical_bayesian_model.R index 514ea62..c62d032 100644 --- a/R/hierarchical_bayesian_model.R +++ b/R/hierarchical_bayesian_model.R @@ -18,6 +18,8 @@ #' \item{info}{parameters for the 
fitted model} #' \item{sp}{seroprevalence} #' \item{foi}{force of infection} +#' \item{sp_func}{function to compute seroprevalence given age and model parameters} +#' \item{foi_func}{function to compute force of infection given age and model parameters} #' @export #' #' @examples @@ -62,51 +64,66 @@ hierarchical_bayesian_model <- function(data, model$info <- summary(fit)$summary - theta <- data.frame( - sp = rep (0,length(age)), - foi = rep (0,length(age)) - ) + theta <- list() if (type == "far3"){ alpha1 <- model$info["alpha1",c("mean")] alpha2 <- model$info["alpha2",c("mean")] alpha3 <- model$info["alpha3",c("mean")] - for (i in 1:data$Nage){ - theta$sp[i] = 1-exp((alpha1/alpha2)*data$age[i]*exp(-alpha2*data$age[i])+ - (1/alpha2)*((alpha1/alpha2)-alpha3)*(exp(-alpha2*data$age[i])-1)- - alpha3*data$age[i]) - theta$foi[i] = (alpha1*data$age[i]-alpha3)*exp(-alpha2*data$age[i])+alpha3 + theta$sp_func <- \(age, alpha1, alpha2, alpha3){ + 1 - exp((alpha1/alpha2)*age*exp(-alpha2*age)+ + (1/alpha2)*((alpha1/alpha2)-alpha3)*(exp(-alpha2*age)-1)- + alpha3*age) } + theta$foi_func <- \(age, alpha1, alpha2, alpha3){ + (alpha1*age-alpha3)*exp(-alpha2*age)+alpha3 + } + + theta$sp <- theta$sp_func(data$age, alpha1, alpha2, alpha3) + theta$foi <- theta$foi_func(data$age, alpha1, alpha2, alpha3) + } if (type == "far2"){ alpha1 <- model$info["alpha1",c("mean")] alpha2 <- model$info["alpha2",c("mean")] - for (i in 1:data$Nage){ - theta$sp[i] = 1-exp((alpha1 / alpha2) * data$age[i] * exp(-alpha2 * data$age[i]) + - (1 / alpha2) * ((alpha1 / alpha2)) * (exp(-alpha2 * data$age[i]) - 1)); - theta$foi[i] = (alpha1*data$age[i])*exp(-alpha2*data$age[i]) + theta$sp_func <- \(age, alpha1, alpha2){ + 1-exp((alpha1 / alpha2) * age * exp(-alpha2 * age) + + (1 / alpha2) * ((alpha1 / alpha2)) * (exp(-alpha2 * age) - 1)) + } + theta$foi_func <- \(age, alpha1, alpha2){ + (alpha1*age)*exp(-alpha2*age) } + theta$sp <- theta$sp_func(data$age, alpha1, alpha2) + theta$foi <- theta$foi_func(data$age, alpha1, 
alpha2) + } if (type == "log_logistic"){ alpha1 <- model$info["alpha1",c("mean")] alpha2 <- model$info["alpha2",c("mean")] - for (i in 1:data$Nage){ - theta$sp[i] = inv.logit(alpha2+alpha1*log(data$age[i])) - theta$foi[i] = alpha1*exp(alpha2)*(data$age[i]^(alpha1-1))*(1-theta$sp[i]) + theta$sp_func <- \(age, alpha1, alpha2){ + inv.logit(alpha2+alpha1*log(age)) + } + theta$foi_func <- \(age, sp, alpha1, alpha2){ + alpha1*exp(alpha2)*(age^(alpha1-1))*(1-sp) } + + theta$sp <- theta$sp_func(data$age, alpha1, alpha2) + theta$foi <- theta$foi_func(data$age, theta$sp, alpha1, alpha2) } model$sp <- theta$sp model$foi <- theta$foi model$df <- data.frame(age = age, tot = tot, pos = pos) model$type <- type + model$sp_func <- theta$sp_func + model$foi_func <- theta$foi_func class(model) <- "hierarchical_bayesian_model" diff --git a/R/plots.R b/R/plots.R index ca39e4b..78b4f89 100644 --- a/R/plots.R +++ b/R/plots.R @@ -399,8 +399,10 @@ plot.lp_model <- function(x, ...) { plot.hierarchical_bayesian_model <- function(x, ...){ cex <- if (is.null(list(...)[["cex"]])) 20 else list(...)$cex + out.DF <- compute_ci.hierarchical_bayesian_model(x) + with(x$df, { - plot_util(age = age, pos = pos, tot = tot, sero = x$sp, foi = x$foi, cex=cex) + plot_util(age = age, pos = pos, tot = tot, sero = out.DF, foi = x$foi, cex=cex) }) } @@ -530,10 +532,14 @@ plot.estimate_from_mixture <- function(x, ... ){ #' Plot output for age_time_model #' #' @param x - a `age_time_model` object -#' @param facet - whether to facet the plot by group or not -#' @param modtype - indicate which model to plot, either "monotonized" or "non-monotonized" -#' @param le - number of bins to generate x axis, higher value return smoother plot -#' @param cex - adjust the of the data points (only when facet = TRUE) +#' @param ... arbitrary params. +#' Supported options include: +#' \itemize{ +#' \item \code{facet}: Whether to facet the plot by group. 
+#' \item \code{modtype}: Which model to plot, either \code{"monotonized"} or \code{"non-monotonized"}. +#' \item \code{le}: Number of bins used to generate the x-axis; higher values produce smoother curves. +#' \item \code{cex}: Adjusts the size of data points (only when \code{facet = TRUE}). +#' } #' #' @importFrom graphics plot #' @import ggplot2 assertthat tidyr @@ -595,7 +601,7 @@ plot.age_time_model <- function(x, ...){ data = df_dat ), guides(shape = "none", size = "none"), - serosv:::set_plot_style(), + set_plot_style(), facet_wrap(vars(!!sym(x$grouping_col))) ) else labs(color = x$grouping_col, fill = x$grouping_col) @@ -658,3 +664,96 @@ plot_gcv <- function(age, pos, tot, nn_seq, h_seq, kern="tcub", deg=2) { # --- Combine 2 plots nn_plot + h_plot + plot_layout(ncol=2) } + +#' Plot output for corrected_prevalence +#' +#' @param x - the output of `correct_prevalence()` function +#' @param y - another output of `correct_prevalence()` function (optional, for comparison only) +#' @param facet - whether to plot as facets or on the same plot (only when y is provided) +#' @import ggplot2 tidyr patchwork +#' @importFrom magrittr %>% +#' @importFrom assertthat assert_that +#' +#' @return ggplot object +#' @export +plot_corrected_prev <- function(x, y=NULL, facet=FALSE){ + dat <- x$df + corrected_dat <- x$corrected_se %>% mutate(label = paste0("estimated prevalence")) + corrected_dat2 <- NULL + + if(!is.null(y)){ + if (!facet){ + corrected_dat <- bind_rows( + corrected_dat %>% mutate(label = paste0("estimated prevalence (", x$method,")")), + y$corrected_se %>% mutate(label = paste0("estimated prevalence (", y$method,")")) + ) + }else{ + corrected_dat2 <- y$corrected_se %>% mutate(label = paste0("estimated prevalence")) + } + } + + generate_layers <- function(data, title = NULL){ + # always return point layer + out <- list( + geom_point( + aes( + x = age, y = sero, + color = label + ), + alpha = 0.7, data = data + ) + ) + # if facet or only x is provided or facet, 
add error_bar layer + if(facet | is.null(y)){ + out[[length(out) + 1]] <- geom_errorbar( + aes( + x = age, y = sero, + ymin = sero_lwr, + ymax = sero_upr, + color = label + ), + alpha = 0.7, data = data + ) + } + # then add label layer + out[[length(out) + 1]] <- labs(x = "Age", y = "Prevalence", title = title) + # finally add scale color layer + out[[length(out) + 1]] <- if(facet | is.null(y)) + scale_color_manual( + values = c( + "apparent prevalence" = "red", + "estimated prevalence" = "royalblue" + )) + else + scale_color_manual( + values = c( + "apparent prevalence" = "red", + "estimated prevalence (bayesian)" = "blueviolet", + "estimated prevalence (frequentist)" = "royalblue" + )) + out + } + + # when y is not given force facet to FALSE + facet <- if(is.null(y)) FALSE else facet + + plot <- ggplot() + + geom_point(aes( + x = age, y = pos / tot, + color = "apparent prevalence" + ), data = dat) + + if(!facet){ + plot <- plot + generate_layers(corrected_dat, title = if(is.null(y)) paste0("Plot for ", x$method, " approach") else NULL) + }else{ + plot1 <- plot + generate_layers(corrected_dat, title = paste0("Plot for ", x$method, " approach")) + plot2 <- plot + generate_layers(corrected_dat2, title = paste0("Plot for ", y$method, " approach")) + + plot <- plot1/plot2 + } + + plot +} + + + diff --git a/R/to_titer.R b/R/to_titer.R new file mode 100644 index 0000000..2735f89 --- /dev/null +++ b/R/to_titer.R @@ -0,0 +1,377 @@ +#' Process assay test result to titer +#' +#' @param df - a standardized data.frame generated by `standardize_data()` +#' @param model - either the name of a built-in model to fit standard curve or a named list of 2 functions for "mod" and "quantify_ci" +#' @param positive_threshold - if not NULL, processed_data will have the serostatus labeled +#' @param ci - confidence interval for the titer estimates +#' @param negative_control - if TRUE, output tibble will include the result for negative controls +#' @importFrom magrittr %>% +#' @import 
dplyr purrr +#' +#' @return a data.frame with 8 columns +#' \item{plate_id}{id of the plate} +#' \item{data}{list of `data.frame`s containing the results from each plate} +#' \item{antitoxin_df}{list of `data.frame`s containing the results for antitoxins from each plate} +#' \item{standard_curve_func}{list of functions mapping from OD to titer for each plate} +#' \item{std_crv_midpoint}{midpoint of the standard curve, for qualitative analysis} +#' \item{processed_data}{list of `tibble`s containing samples with titer estimates (lower, median, upper)} +#' \item{negative_control}{list of `tibble`s containing negative control check results (if `negative_control=TRUE`)} +#' @export +to_titer <- function(df, model="4PL", positive_threshold=NULL, ci = .95, + negative_control=TRUE){ + # Expected format for df + # have columns: sample_id (which can be id of sample or label as antitoxin), result, dilution_factors, negative control + + mod <- if (is.character(model) && model == "4PL") { + nls4PL + } else if (is.list(model) && is.function(model[["mod"]])) { + model[["mod"]] + } + + quantify_ci_func <- if (is.character(model) && model == "4PL") { + simulate_nls_ci + } else if (is.list(model) && is.function(model[["quantify_ci"]])) { + model[["quantify_ci"]] + } + + out <- df %>% + group_by(plate_id) %>% + nest() %>% + mutate( + # For each data from each plate, do the following + # 1: retrieve the anti-toxins data from all plates + antitoxin_df = map(data, get_antitoxins), + # 2: generate the standard curve with CI in the form of a dataframe + standard_curve_df = map(antitoxin_df, \(df, mod) standard_curve_data(df, mod, quantify_ci=quantify_ci_func, level=ci), mod), + # 3: convert the standard curve dataframe into a standard curve function + standard_curve_func = map(standard_curve_df, data2function), + # 4: get the midpoint of the result for qualitative analysis when OD is too high + std_crv_midpoint = map_dbl(antitoxin_df, \(df) median(df$result, na.rm = TRUE) ), + # 5: 
convert the OD of the samples into log-concentrations + processed_data = pmap(list(data, standard_curve_func, std_crv_midpoint), \(dat, func, midpoint) { + dat %>% + filter(sample_id != "antitoxin") %>% + process_samples(func, midpoint=midpoint, positive_threshold = positive_threshold) + }) + ) + + if(negative_control){ + assert_that(is.numeric(positive_threshold), + msg = "A threshold value must be provided when `negative_control=TRUE`") + + out <- out %>% + mutate( + negative_control = map(data, get_negative_controls), + negative_control = pmap(list(negative_control, standard_curve_func, std_crv_midpoint), \(dat, func, midpoint) { + process_samples(dat, func, midpoint = midpoint, positive_threshold = positive_threshold) + }) + ) + } + + out +} + +#' Preprocess data +#' +#' @param df - data.frame with columns for plate id, sample id, result, dilution factor, and (optionally) negative controls +#' @param plate_id_col - name of the column storing plates id +#' @param id_col - name of the column storing sample id +#' @param result_col - name of the column storing result +#' @param dilution_fct_col - name of the column storing dilution factors +#' @param antitoxin_label - how antitoxin is label in the sample id column +#' @param negative_col - regex for columns for negative controls, assumed to be a label followed by the dilution factor (e.g. 
NEGATIVE_50, NEGATIVE_100) +#' +#' @importFrom magrittr %>% +#' @importFrom janitor clean_names +#' +#' @import dplyr +#' +#' @return a standardized data.frame that can be passed to `to_titer()` +#' @export +standardize_data <- function(df, + plate_id_col = "PLATE_ID", + id_col = "SAMPLE_ID", + result_col = "RESULT", + dilution_fct_col = "DILUTION_FACTORS", + antitoxin_label = "Anti_toxin", + negative_col = "^NEGATIVE_*"){ + + neg_control_cols <- grep(negative_col, colnames(df), value = TRUE) + if (length(neg_control_cols) == 0){ + message("Columns for negative controls not detected") + }else{ + colnames(df) <- gsub(negative_col, "negative_", colnames(df)) + } + + df <- df %>% + rename( + plate_id := plate_id_col, + sample_id := id_col, + result := result_col, + dilution_factors := dilution_fct_col + ) %>% + clean_names() %>% + mutate( + sample_id = if_else(sample_id == antitoxin_label, "antitoxin", sample_id) + ) + + + df +} + +# ======= Helper functions ======= +# Helper function to do 2 things +# - get antitoxins data +# - compute concentration from reference concentration and dilution factor +get_antitoxins <- function(plate, ref_conc = 10) { + # resolve no visible binding NOTE + sample_id <- dilution_factors <- NULL + + plate %>% + filter(sample_id == "antitoxin") %>% + mutate(concentration = ref_conc / dilution_factors) +} + +# Function to generate the standard curve and its confidence interval +standard_curve_data <- function(df, model, quantify_ci=simulate_nls_ci, + le = 512, level = .95, nb = 9999) { + + log_concentration <- log10(df$concentration) + logc <- seq(min(log_concentration), max(log_concentration), le = le) + alpha <- (1 - level) / 2 + df %>% + model() %>% + quantify_ci(list(concentration = 10^logc), nb=nb, alpha=alpha) %>% + (\(.x) bind_cols(logc = logc, .x))() +} + +# Function to turn the standard curve data.frame to a function that coverts OD to LC +data2function <- function(df) { + with(df, { + approxfun2 <- function(...) 
approxfun(y = logc, ...) + pred_lwr <- approxfun2(upper) + pred_mdi <- approxfun2(median) + pred_upp <- approxfun2(lower) + function(x) c(lower = pred_lwr(x), median = pred_mdi(x), upper = pred_upp(x)) + }) +} + +# Function to convert samples' OD to LC +process_samples <- function(plate, std_crv, midpoint=2, positive_threshold = NULL) { + out <- plate %>% + bind_cols(map_dfr(.$result, std_crv)) + + if(is.numeric(positive_threshold)){ + out <- out %>% mutate( + positive = if_else( + # check if upper bound for titer is available + !is.na(upper), + upper >= positive_threshold, # positive if above threshold + # if upper is NA, then OD (or result) is either too high or too low + # label positive when result is higher than midpoint + result > midpoint + ) + ) + } +} + +# === Functions for 4PL model ======== +# Helper function to look at the data and generate a good initial guess for the parameters +good_guess4PL <- function(x, y, eps = .3) { + nb_rep <- unique(table(x)) + the_order <- order(x) + x <- x[the_order] + y <- y[the_order] + a <- min(y) + d <- max(y) + c <- approx(y, x, (d - a) / 2, ties = "ordered")$y + list(a = a, c = c, d = d, + b = ( + approx(x, y, c + eps, ties = "ordered")$y - + approx(x, y, c - eps, ties = "ordered")$y + ) / (2 * eps)) +} + + +# function to fit data to a 4PL model +nls4PL <- function(df) { + nls(result ~ d + (a - d) / (1 + 10^((log10(concentration) - c) * b)), + data = df, + start = with(df, good_guess4PL(log10(concentration), result))) +} + +# helper function to generate data to quantify confidence interval (using bootstrapping) +# general flow is: +# - sample different values for parameters (number of samples = nb) +# - use model to compute OD for the new set of parameter values +#' @importFrom mvtnorm rmvnorm +simulate_nls_ci <- function(object, newdata, nb = 9999, alpha = .025) { + rowsplit <- function(df) split(df, 1:nrow(df)) + + nb |> + rmvnorm(mean = coef(object), sigma = vcov(object)) |> + as.data.frame() |> + rowsplit() |> + 
map(as.list) |> + map(~ c(.x, newdata)) |> + map_dfc(eval, expr = parse(text = as.character(formula(object))[3])) %>% + apply(1, quantile, c(alpha, .5, 1 - alpha)) %>% + t() %>% as.data.frame() %>% + setNames(c("lower", "median", "upper")) +} + +# ===== Quality Control functions ======== +# only use dilution factor where negative sample is indeed negative +get_negative_controls <- function(plate, std_crv){ + plate %>% + select(starts_with("negative")) %>% + unique() %>% + tidyr::pivot_longer(everything(), names_to = "dilution", values_to = "result") %>% + mutate(across(dilution, ~ stringr::str_remove(.x, "negative_") %>% as.integer())) +} + +# ======== Plot functions ========== +#' Visualize standard curves for each plate +#' +#' @param x - output of `to_titer()` +#' @param facet - whether to faceted by plates or plot all standard curves on a single plot +#' @param xlab - label of the x axis +#' @param ylab - label of the y axis +#' @param datapoint_size - size of the data point (only applicable when `facet=TRUE`) +#' +#' @importFrom magrittr %>% +#' @import ggplot2 dplyr +#' +#' @export +plot_standard_curve <- function(x, facet=TRUE, + xlab = "log10(concentration)", + ylab = "Optical density", + datapoint_size = 2){ + scdf <- x %>% select(plate_id, standard_curve_df) %>% unnest(standard_curve_df) %>% ungroup() + data <- x %>% select(plate_id, antitoxin_df) %>% unnest(antitoxin_df) %>% ungroup() + + if (is.null(ylim)) ylim <- c(0, max(scdf$upper, data$result)) + + plot <- ggplot() + + geom_line( + aes( + x = logc, y = median, group = if (!facet) plate_id else NULL + ), + color = "#fc0328", lwd = 0.5, data = scdf, show.legend = FALSE + ) + + labs( + x = xlab, + y = ylab + ) + + if (facet){ + plot <- plot + + geom_ribbon( + aes( + x = logc, y = median, ymin = lower, ymax = upper + ), + fill = adjustcolor("#fc0328", alpha.f = 0.2), data = scdf + ) + + geom_point( + aes( + x = log10(concentration), + y = result + ), + shape = 3, size = datapoint_size, + color = "blue", 
alpha = 0.8, + data = data + ) + + facet_wrap(~plate_id, drop = TRUE) + } + plot +} + +#' Visualize positive threshold at different dilution factors +#' +#' @param dilution_factors dilution factors to be visualized +#' @param positive_threshold titer threshold for sample to be considered positive +#' @param shift_text adjust how much the text is shifted along the x-axis (relative to the threshold line) +#' @return a list of ggplot2 layers (threshold lines and their "1:dilution" labels) to add to an existing plot +#' @importFrom magrittr %>% +#' @import ggplot2 dplyr purrr +#' +#' @export +add_thresholds <- function(dilution_factors, positive_threshold = 0.1, + shift_text = 0.15) { + list( + geom_vline(aes(xintercept = log10( + positive_threshold / c(1, dilution_factors) + )), + color = "forestgreen"), + geom_text( + aes(x = x, y = 4, label = label), + data = data.frame( + x = log10(positive_threshold / c(1, dilution_factors)) + shift_text, + label = paste0("1:", c(1, dilution_factors)) + ), + color = "forestgreen" + ) + ) +} + + +#' Quality control plot +#' +#' Visualize, for each sample, whether titer estimates can be computed at the tested dilutions. +#' +#' Each sample is represented by a `n_estimates x n_dilutions` grid where cell color indicates +#' estimate availability (green = estimate available, orange = result too low, red = result too high) +#' +#' These sample grids are arranged in columns where each column represents samples from a plate +#' +#' The figure below demonstrates the interpretation of the plot.
+#' \figure{interpret_titer_qc.png}{options: width="70\%"} +#' +#' @param x - output of `to_titer()` +#' @param n_plates - maximum number of plates to plot +#' @param n_samples - maximum number of samples per plate to plot +#' @param n_dilutions - number of dilutions used for testing +#' +#' @importFrom magrittr %>% +#' @import ggplot2 dplyr purrr +#' +#' @export +plot_titer_qc <- function(x, n_plates=18, n_samples=22, n_dilutions = 3){ + # helper that pads the estimate matrix with NA when a plate has fewer than n_samples samples: + n_estimates <- 3 # number of estimates per dilution (point + confidence interval) + n_plates <- if(is.numeric(n_plates)) n_plates else nrow(x) # non-numeric n_plates means plot every plate + + template <- matrix(rep(NA, n_samples * n_dilutions * n_estimates), nrow = n_estimates) + format_mat <- function(x) { + template[, 1:min(n_samples*n_dilutions, ncol(x))] <- x[,1:min(n_samples*n_dilutions, ncol(x))] + template + } + + # get standard curve midpoint to determine whether OD is too high or too low when titer is NA + midpoint <- median(x$std_crv_midpoint) # NOTE(review): appears unused — shadowed by the per-plate midpoint defined inside plot_heatmap; confirm intent + + # the function that draws the heatmap for 1 plate + abline2 <- function(...)
abline(..., col = "white") + plot_heatmap <- function(x) { + midpoint <- mean(range(x$result, na.rm = TRUE)) + + x %>% + mutate(across(c(lower, median, upper), + ~ as.numeric(is.na(.x)) * ((result > midpoint) + 1))) %>% + select(lower, median, upper) %>% + as.matrix() %>% + t() %>% + format_mat() %>% + (\(.x) .x[, ncol(.x):1])() %>% + (\(.x) image(1:n_estimates, 1:(n_samples * n_dilutions), .x, + axes = FALSE, ann = FALSE, col = c(3, 7, 2), zlim = c(1,n_estimates)-1))() + + abline2(v = 1:(n_estimates-1) + .5) + abline2(h = seq(n_dilutions, n_dilutions * n_samples, n_dilutions) + .5) + } + + opar <- par(mfrow = c(1, n_plates), mar = c(0,0.25,0,0.25)) + walk(head(x$processed_data, n=n_plates), plot_heatmap) + + par(opar) +} diff --git a/R/utils.R b/R/utils.R index 481763f..e288f77 100644 --- a/R/utils.R +++ b/R/utils.R @@ -55,6 +55,8 @@ pava<- function(pos=pos,tot=rep(1,length(pos))) return(list(pai1=pai1,pai2=pai2)) } +#' Aggregate data +#' #' Generate a dataframe with `t`, `pos` and `tot` columns from #' `t` and `seropositive` vectors. #' @@ -70,7 +72,7 @@ pava<- function(pos=pos,tot=rep(1,length(pos))) #' @importFrom dplyr group_by #' @importFrom dplyr n #' @importFrom dplyr summarize -#' @import magrittr +#' @importFrom magrittr %>% #' #' @return dataframe in aggregated format #' @export diff --git a/README.Rmd b/README.Rmd index a96289b..dadb13d 100644 --- a/README.Rmd +++ b/README.Rmd @@ -22,7 +22,7 @@ knitr::opts_chunk$set( -`serosv` is an easy-to-use and efficient tool to estimate infectious diseases parameters (seroprevalence and force of infection) using serological data. The current version is based on the book "Modeling Infectious Disease Parameters Based on Serological and Social Contact Data -- A Modern Statistical Perspective" by [Hens et al., 2012 Springer](https://link.springer.com/book/10.1007/978-1-4614-4072-7). 
+`serosv` is an easy-to-use and efficient tool to estimate infectious diseases parameters (seroprevalence and force of infection) using serological data. The current version is mostly based on the book "Modeling Infectious Disease Parameters Based on Serological and Social Contact Data -- A Modern Statistical Perspective" by [Hens et al., 2012 Springer](https://link.springer.com/book/10.1007/978-1-4614-4072-7). ## Installation @@ -82,7 +82,7 @@ Hierarchical Bayesian approaches: Load the rubella in UK dataset. -```{r, eval=FALSE} +```{r} library(serosv) ``` @@ -112,18 +112,20 @@ plot(fpmd) ### Fitting Parvo B19 data from Finland ```{r} +library(dplyr) parvob19 <- parvob19_fi_1997_1998 # for linelisting data, either transform it to aggregated transform_data( parvob19$age, parvob19$seropositive, - stratum_col = "age") %>% - polynomial_model(type = "Muench") %>% + stratum_col = "age") |> + polynomial_model(type = "Muench") |> plot() + # or fit data as is -parvob19 %>% - rename(status = seropositive) %>% - polynomial_model(type = "Muench") %>% +parvob19 |> + rename(status = seropositive) |> + polynomial_model(type = "Muench") |> plot() ``` diff --git a/README.md b/README.md index 85b5f9e..eaf53b6 100644 --- a/README.md +++ b/README.md @@ -13,9 +13,9 @@ coverage](https://codecov.io/gh/OUCRU-Modelling/serosv/branch/main/graph/badge.s `serosv` is an easy-to-use and efficient tool to estimate infectious diseases parameters (seroprevalence and force of infection) using -serological data. The current version is based on the book “Modeling -Infectious Disease Parameters Based on Serological and Social Contact -Data – A Modern Statistical Perspective” by [Hens et al., 2012 +serological data. The current version is mostly based on the book +“Modeling Infectious Disease Parameters Based on Serological and Social +Contact Data – A Modern Statistical Perspective” by [Hens et al., 2012 Springer](https://link.springer.com/book/10.1007/978-1-4614-4072-7). 
## Installation @@ -108,8 +108,10 @@ best_2d_mn #> Call: glm(formula = as.formula(formulate(p_cur)), family = binomial(link = link)) #> #> Coefficients: -#> (Intercept) I(age^-0.9) I(I(age^-0.9) * log(age)) -#> 4.342 -4.696 -9.845 +#> (Intercept) I(age^-0.9) +#> 4.342 -4.696 +#> I(I(age^-0.9) * log(age)) +#> -9.845 #> #> Degrees of Freedom: 43 Total (i.e. Null); 41 Residual #> Null Deviance: 1369 @@ -126,31 +128,41 @@ fpmd <- fp_model( plot(fpmd) ``` - + ### Fitting Parvo B19 data from Finland ``` r +library(dplyr) +#> Warning: package 'dplyr' was built under R version 4.3.1 +#> +#> Attaching package: 'dplyr' +#> The following objects are masked from 'package:stats': +#> +#> filter, lag +#> The following objects are masked from 'package:base': +#> +#> intersect, setdiff, setequal, union parvob19 <- parvob19_fi_1997_1998 # for linelisting data, either transform it to aggregated transform_data( parvob19$age, parvob19$seropositive, - stratum_col = "age") %>% - polynomial_model(type = "Muench") %>% + stratum_col = "age") |> + polynomial_model(type = "Muench") |> plot() ``` - + ``` r # or fit data as is -parvob19 %>% - rename(status = seropositive) %>% - polynomial_model(type = "Muench") %>% +parvob19 |> + rename(status = seropositive) |> + polynomial_model(type = "Muench") |> plot() ``` - + diff --git a/_pkgdown.yml b/_pkgdown.yml index 208fa7b..3e3609b 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -59,6 +59,9 @@ reference: - title: Plotting functions contents: + - plot_corrected_prev + - plot_standard_curve + - plot_titer_qc - plot_gcv - plot.polynomial_model - plot.farrington_model @@ -75,14 +78,17 @@ reference: - plot.estimate_from_mixture - plot.age_time_model - set_plot_style + - add_thresholds - title: Other utilities contents: - serosv-package - est_foi - pava + - to_titer - correct_prevalence - transform_data + - standardize_data - compare_models - compute_ci - compute_ci.fp_model @@ -91,6 +97,7 @@ reference: - compute_ci.penalized_spline_model - 
compute_ci.mixture_model - compute_ci.age_time_model + - compute_ci.hierarchical_bayesian_model - find_best_fp_powers articles: @@ -104,6 +111,7 @@ articles: - semiparametric_model - hierarchical_model - model_quantitative_data + - repeated_cross_sectional - title: Utilities navbar: Utilities contents: diff --git a/cran-comments.md b/cran-comments.md index a535e15..9bcdd0f 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,8 +1,6 @@ ## Minor release In this version I have * add correct_prevalence() and compare_models() functions -* update API to takes a data.frame as input -* resolve import warning ## R CMD check results checking installed package size ... NOTE diff --git a/docs/404.html b/docs/404.html index 4ac8e06..39e5eba 100644 --- a/docs/404.html +++ b/docs/404.html @@ -6,86 +6,72 @@ Page not found (404) • serosv - - - - - - + + + + + - - - + + + - Skip to contents - -