diff --git a/DESCRIPTION b/DESCRIPTION
index f87cb256..f2d7fb09 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -36,10 +36,6 @@ Authors@R: c(person(given = "Paul",
family = "Avraam",
role = c("aut"),
comment = c(ORCID = "0000-0001-8908-2441")),
- person(given = "Demetris",
- family = "Avraam",
- role = c("aut"),
- comment = c(ORCID = "0000-0001-8908-2441")),
person(given = "Yannick",
family = "Marcon",
role = c("aut"),
diff --git a/NAMESPACE b/NAMESPACE
index ec905eb6..40674573 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -73,6 +73,7 @@ export(ds.matrixDimnames)
export(ds.matrixInvert)
export(ds.matrixMult)
export(ds.matrixTranspose)
+export(ds.mdPattern)
export(ds.mean)
export(ds.meanByClass)
export(ds.meanSdGp)
diff --git a/R/ds.asFactor.R b/R/ds.asFactor.R
index 476f00f8..8e5fbd09 100644
--- a/R/ds.asFactor.R
+++ b/R/ds.asFactor.R
@@ -48,7 +48,7 @@
#' \code{baseline.level = 1} and \code{forced.factor.levels = c(1,2,3,4,5)}.
#' The input vector is converted to the following matrix of dummy variables:
#'
-#' \tabular{rrrrr}{
+#' \tabular{rrrr}{
#' \strong{DV2} \tab \strong{DV3} \tab \strong{DV4} \tab \strong{DV5} \cr
#' 0 \tab 0 \tab 0 \tab 0\cr
#' 1 \tab 0 \tab 0 \tab 0\cr
diff --git a/R/ds.mdPattern.R b/R/ds.mdPattern.R
new file mode 100644
index 00000000..af59498e
--- /dev/null
+++ b/R/ds.mdPattern.R
@@ -0,0 +1,305 @@
+#'
+#' @title Display missing data patterns with disclosure control
+#' @description This function is a client-side wrapper for the server-side mdPatternDS
+#' function. It generates a missing data pattern matrix similar to mice::md.pattern but
+#' with disclosure control applied to prevent revealing small cell counts.
+#' @details The function calls the server-side mdPatternDS function which uses
+#' mice::md.pattern to analyze missing data patterns. Patterns with counts below the
+#' disclosure threshold (default: nfilter.tab = 3) are suppressed to maintain privacy.
+#'
+#' \strong{Output Format:}
+#' - Each row represents a missing data pattern
+#' - Pattern counts are shown in row names (e.g., "150", "25")
+#' - Columns show 1 if the variable is observed, 0 if missing
+#' - Last column shows the total number of missing values per pattern
+#' - Last row shows the total number of missing values per variable
+#'
+#' \strong{Disclosure Control:}
+#'
+#' Suppressed patterns (count below threshold) are indicated by:
+#' - Row name: "suppressed(<N>)" where N is the threshold
+#' - All pattern values set to NA
+#' - Summary row also suppressed to prevent back-calculation
+#'
+#' \strong{Pooling Behavior (type='combine'):}
+#'
+#' When pooling across studies, the function uses a \emph{conservative approach}
+#' for disclosure control:
+#'
+#' 1. Identifies identical missing patterns across studies
+#' 2. \strong{EXCLUDES suppressed patterns from pooling} - patterns suppressed in
+#' ANY study are not included in the pooled count
+#' 3. Sums counts only for non-suppressed identical patterns
+#' 4. Re-validates pooled counts against disclosure threshold
+#'
+#' \strong{Important:} This conservative approach means:
+#' - Pooled counts may be \emph{underestimates} if some studies had suppressed patterns
+#' - This prevents disclosure through subtraction (e.g., if study A shows count=5
+#' and pool shows count=7, one could deduce study B has count=2, violating disclosure)
+#' - Different patterns across studies are preserved separately in the pooled result
+#'
+#' @param x a character string specifying the name of a data frame or matrix on the
+#' server-side containing the data to analyze.
+#' @param type a character string specifying the output type. If 'split' (default),
+#' returns separate patterns for each study. If 'combine', attempts to pool patterns
+#' across studies.
+#' @param datasources a list of \code{\link[DSI]{DSConnection-class}} objects obtained
+#' after login. If the \code{datasources} argument is not specified, the default set of
+#' connections will be used: see \code{\link[DSI]{datashield.connections_default}}.
+#' @return For type='split': A list with one element per study, each containing:
+#' \describe{
+#' \item{pattern}{The missing data pattern matrix for that study}
+#' \item{valid}{Logical indicating if all patterns meet disclosure requirements}
+#' \item{message}{A message describing the validity status}
+#' }
+#'
+#' For type='combine': A list containing:
+#' \describe{
+#' \item{pattern}{The pooled missing data pattern matrix across all studies}
+#' \item{valid}{Logical indicating if all pooled patterns meet disclosure requirements}
+#' \item{message}{A message describing the validity status}
+#' }
+#' @author Xavier Escribà Montagut for DataSHIELD Development Team
+#' @export
+#' @examples
+#' \dontrun{
+#' ## Version 6, for version 5 see the Wiki
+#'
+#' # Connecting to the Opal servers
+#'
+#' require('DSI')
+#' require('DSOpal')
+#' require('dsBaseClient')
+#'
+#' builder <- DSI::newDSLoginBuilder()
+#' builder$append(server = "study1",
+#' url = "http://192.168.56.100:8080/",
+#' user = "administrator", password = "datashield_test&",
+#' table = "CNSIM.CNSIM1", driver = "OpalDriver")
+#' builder$append(server = "study2",
+#' url = "http://192.168.56.100:8080/",
+#' user = "administrator", password = "datashield_test&",
+#' table = "CNSIM.CNSIM2", driver = "OpalDriver")
+#' logindata <- builder$build()
+#'
+#' connections <- DSI::datashield.login(logins = logindata, assign = TRUE, symbol = "D")
+#'
+#' # Get missing data patterns for each study separately
+#' patterns_split <- ds.mdPattern(x = "D", type = "split", datasources = connections)
+#'
+#' # View results for study1
+#' print(patterns_split$study1$pattern)
+#' # var1 var2 var3
+#' # 150 1 1 1 0 <- 150 obs complete
+#' # 25 0 1 1 1 <- 25 obs missing var1
+#' # 25 0 0 25 <- Summary: total missing per variable
+#'
+#' # Get pooled missing data patterns across studies
+#' patterns_pooled <- ds.mdPattern(x = "D", type = "combine", datasources = connections)
+#' print(patterns_pooled$pattern)
+#'
+#' # Example with suppressed patterns:
+#' # If study1 has a pattern with count=2 (suppressed) and study2 has same pattern
+#' # with count=5 (valid), the pooled result will show count=5 (conservative approach)
+#' # A warning will indicate: "Pooled counts may underestimate the true total"
+#'
+#' # Clear the Datashield R sessions and logout
+#' datashield.logout(connections)
+#' }
+#'
+ds.mdPattern <- function(x = NULL, type = 'split', datasources = NULL){
+
+ # Look for DS connections
+ if(is.null(datasources)){
+ datasources <- datashield.connections_find()
+ }
+
+ # Ensure datasources is a list of DSConnection-class
+ if(!(is.list(datasources) && all(unlist(lapply(datasources, function(d) {methods::is(d,"DSConnection")}))))){
+ stop("The 'datasources' were expected to be a list of DSConnection-class objects", call.=FALSE)
+ }
+
+ if(is.null(x)){
+ stop("Please provide the name of a data frame or matrix!", call.=FALSE)
+ }
+
+ # Get study names
+ study_names <- names(datasources)
+
+ # Call the server side function
+ cally <- call("mdPatternDS", x)
+ results <- DSI::datashield.aggregate(datasources, cally)
+
+ # Process results based on type
+ if(type == "split"){
+ # Return individual study results
+ return(results)
+
+ } else if(type == "combine"){
+ # Pool results across studies
+
+ # First check if any study has invalid patterns
+ any_invalid <- any(sapply(results, function(r) !r$valid))
+ invalid_studies <- names(results)[sapply(results, function(r) !r$valid)]
+
+ if(any_invalid){
+ warning(
+ "Disclosure control: Some studies have suppressed patterns (below threshold).\n",
+ " Studies with suppressed patterns: ", paste(invalid_studies, collapse=", "), "\n",
+ " These patterns are EXCLUDED from pooling to prevent disclosure.\n",
+ " Pooled counts may underestimate the true total.",
+ call. = FALSE
+ )
+ }
+
+ # Extract patterns from each study
+ patterns_list <- lapply(results, function(r) r$pattern)
+
+ # Check if all patterns have the same variables (columns)
+ n_vars <- sapply(patterns_list, ncol)
+ if(length(unique(n_vars)) > 1){
+ stop("Cannot pool patterns: studies have different numbers of variables", call.=FALSE)
+ }
+
+ var_names <- colnames(patterns_list[[1]])
+ if(length(patterns_list) > 1){
+ for(i in 2:length(patterns_list)){
+ if(!identical(colnames(patterns_list[[i]]), var_names)){
+ warning("Variable names differ across studies. Pooling by position.")
+ break
+ }
+ }
+ }
+
+ # Pool the patterns
+ pooled_pattern <- .pool_md_patterns(patterns_list, study_names)
+
+ # Check validity of pooled results
+ # Get threshold from first study's results or use a default check
+ nfilter.tab <- getOption("default.nfilter.tab")
+ if(is.null(nfilter.tab)) nfilter.tab <- 3
+
+ n_patterns <- nrow(pooled_pattern) - 1
+ pooled_valid <- TRUE
+
+ if(n_patterns > 0){
+ # Pattern counts are in row names
+ pattern_counts <- as.numeric(rownames(pooled_pattern)[1:n_patterns])
+ pattern_counts <- pattern_counts[!is.na(pattern_counts) & pattern_counts > 0]
+
+ if(any(pattern_counts < nfilter.tab)){
+ pooled_valid <- FALSE
+ }
+ }
+
+ pooled_message <- ifelse(pooled_valid,
+ "Valid: all pooled pattern counts meet disclosure requirements",
+ "Some pooled pattern counts may be below threshold")
+
+ return(list(
+ pattern = pooled_pattern,
+ valid = pooled_valid,
+ message = pooled_message,
+ studies = study_names
+ ))
+
+ } else {
+ stop("Argument 'type' must be either 'split' or 'combine'", call.=FALSE)
+ }
+}
+
+#' @title Pool missing data patterns across studies
+#' @description Internal function to pool md.pattern results from multiple studies
+#' @param patterns_list List of pattern matrices from each study
+#' @param study_names Names of the studies
+#' @return Pooled pattern matrix
+#' @keywords internal
+.pool_md_patterns <- function(patterns_list, study_names){
+
+ # Initialize with first study's pattern structure
+ pooled <- patterns_list[[1]]
+ n_vars <- ncol(pooled)
+ n_rows <- nrow(pooled) - 1 # Exclude summary row
+
+ # Create a list to store unique patterns
+ unique_patterns <- list()
+ pattern_counts <- list()
+
+ # Process each study
+ for(i in seq_along(patterns_list)){
+ pattern <- patterns_list[[i]]
+ study_n_patterns <- nrow(pattern) - 1
+
+ if(study_n_patterns > 0){
+ for(j in 1:study_n_patterns){
+ # Get pattern (columns show 1/0 for observed/missing)
+ pat_vector <- pattern[j, 1:(n_vars-1)]
+ # Pattern count is in row name
+ pat_count_str <- rownames(pattern)[j]
+ pat_count <- suppressWarnings(as.numeric(pat_count_str))
+
+ # Skip if suppressed (non-numeric row name like "suppressed(<3)")
+ if(is.na(pat_count)){
+ next
+ }
+
+ # Convert pattern to string for comparison
+ pat_string <- paste(pat_vector, collapse="_")
+
+ # Check if this pattern already exists
+ if(pat_string %in% names(unique_patterns)){
+ # Add to existing count
+ pattern_counts[[pat_string]] <- pattern_counts[[pat_string]] + pat_count
+ } else {
+ # New pattern
+ unique_patterns[[pat_string]] <- pat_vector
+ pattern_counts[[pat_string]] <- pat_count
+ }
+ }
+ }
+ }
+
+ # Build pooled pattern matrix
+ if(length(unique_patterns) == 0){
+ # No valid patterns
+ pooled[1:n_rows, ] <- NA
+ } else {
+ # Sort patterns by count (descending)
+ sorted_idx <- order(unlist(pattern_counts), decreasing = TRUE)
+ sorted_patterns <- unique_patterns[sorted_idx]
+ sorted_counts <- pattern_counts[sorted_idx]
+
+ # Create new pooled matrix
+ n_pooled_patterns <- length(sorted_patterns)
+ pooled <- matrix(NA, nrow = n_pooled_patterns + 1, ncol = n_vars)
+ colnames(pooled) <- colnames(patterns_list[[1]])
+
+ # Set row names (counts for patterns, empty for summary)
+ row_names <- c(as.character(unlist(sorted_counts)), "")
+ rownames(pooled) <- row_names
+
+ # Fill in patterns
+ for(i in 1:n_pooled_patterns){
+ pooled[i, 1:(n_vars-1)] <- sorted_patterns[[i]]
+ # Calculate number of missing for this pattern
+ pooled[i, n_vars] <- sum(sorted_patterns[[i]] == 0)
+ }
+ }
+
+ # Calculate summary row (total missing per variable)
+ # Sum across studies
+ summary_row <- rep(0, n_vars)
+ for(i in seq_along(patterns_list)){
+ study_summary <- patterns_list[[i]][nrow(patterns_list[[i]]), ]
+ # Only add if not suppressed
+ if(!all(is.na(study_summary))){
+ summary_row <- summary_row + ifelse(is.na(study_summary), 0, study_summary)
+ }
+ }
+
+ # Add summary row
+ pooled[nrow(pooled), ] <- summary_row
+
+ return(pooled)
+}
+
diff --git a/docker-compose_armadillo.yml b/docker-compose_armadillo.yml
index 26bd8b85..37c44cda 100644
--- a/docker-compose_armadillo.yml
+++ b/docker-compose_armadillo.yml
@@ -3,7 +3,7 @@ services:
hostname: armadillo
ports:
- 8080:8080
- image: datashield/armadillo_citest:5.9.4
+ image: datashield/armadillo_citest:5.11.0
environment:
LOGGING_CONFIG: 'classpath:logback-file.xml'
AUDIT_LOG_PATH: '/app/logs/audit.log'
@@ -16,6 +16,7 @@ services:
default:
hostname: default
- image: datashield/rock-omicron-karma-permissive:devel
+ image: datashield/rock-quebrada-lamda:latest
+# image: datashield/rserver-panda-lamda:devel
environment:
DEBUG: "FALSE"
diff --git a/docker-compose_opal.yml b/docker-compose_opal.yml
index 1a048f51..a62dec67 100644
--- a/docker-compose_opal.yml
+++ b/docker-compose_opal.yml
@@ -20,6 +20,6 @@ services:
- MONGO_INITDB_ROOT_USERNAME=root
- MONGO_INITDB_ROOT_PASSWORD=foobar
rock:
- image: datashield/rock-lemon-donkey-permissive:draft
+ image: datashield/rock-quebrada-lamda-permissive:latest
environment:
DEBUG: "FALSE"
diff --git a/docs/404.html b/docs/404.html
index 761ee0b9..76de734e 100644
--- a/docs/404.html
+++ b/docs/404.html
@@ -32,7 +32,7 @@
dsBaseClient
- 6.3.4
+ 6.3.5-9000
@@ -73,12 +73,12 @@
Burton P, Wilson R, Butters O, Ryser-Welch P, Westerberg A, Abarrategui L, Villegas-Diaz R, Avraam D, Avraam D, Marcon Y, Bishop T, Gaye A, Escribà-Montagut X, Wheater S (2025).
+
Burton P, Wilson R, Butters O, Ryser-Welch P, Westerberg A, Abarrategui L, Villegas-Diaz R, Avraam D, Marcon Y, Bishop T, Gaye A, Escribà-Montagut X, Wheater S (????).
dsBaseClient: 'DataSHIELD' Client Side Base Functions.
-R package version 6.3.4.
+R package version 6.3.5-9000.
@Manual{,
title = {dsBaseClient: 'DataSHIELD' Client Side Base Functions},
- author = {Paul Burton and Rebecca Wilson and Olly Butters and Patricia Ryser-Welch and Alex Westerberg and Leire Abarrategui and Roberto Villegas-Diaz and Demetris Avraam and Demetris Avraam and Yannick Marcon and Tom Bishop and Amadou Gaye and Xavier Escribà-Montagut and Stuart Wheater},
- note = {R package version 6.3.4},
+ author = {Paul Burton and Rebecca Wilson and Olly Butters and Patricia Ryser-Welch and Alex Westerberg and Leire Abarrategui and Roberto Villegas-Diaz and Demetris Avraam and Yannick Marcon and Tom Bishop and Amadou Gaye and Xavier Escribà-Montagut and Stuart Wheater},
+ note = {R package version 6.3.5-9000},
}
Gaye A, Marcon Y, Isaeva J, LaFlamme P, Turner A, Jones E, Minion J, Boyd A, Newby C, Nuotio M, Wilson R, Butters O, Murtagh B, Demir I, Doiron D, Giepmans L, Wallace S, Budin-Ljøsne I, Schmidt C, Boffetta P, Boniol M, Bota M, Carter K, deKlerk N, Dibben C, Francis R, Hiekkalinna T, Hveem K, Kvaløy K, Millar S, Perry I, Peters A, Phillips C, Popham F, Raab G, Reischl E, Sheehan N, Waldenberger M, Perola M, van den Heuvel E, Macleod J, Knoppers B, Stolk R, Fortier I, Harris J, Woffenbuttel B, Murtagh M, Ferretti V, Burton P (2014).
“DataSHIELD: taking the analysis to the data, not the data to the analysis.”
@@ -168,11 +164,11 @@
Installationinstall.packages("remotes")remotes::install_github("datashield/dsBaseClient", "<BRANCH>")
-# Install v6.3.4 with the following
-remotes::install_github("datashield/dsBaseClient", "6.3.4")
+# Install v6.3.5 with the following
+remotes::install_github("datashield/dsBaseClient", "6.3.5")
[1] Burton P, Wilson R, Butters O, Ryser-Welch P, Westerberg A, Abarrategui L, Villegas-Diaz R, Avraam D, Marcon Y, Bishop T, Gaye A, Escribà Montagut X, Wheater S (2025). dsBaseClient: ‘DataSHIELD’ Client Side Base Functions. R package version 6.3.4.
+
[1] Burton P, Wilson R, Butters O, Ryser-Welch P, Westerberg A, Abarrategui L, Villegas-Diaz R, Avraam D, Marcon Y, Bishop T, Gaye A, Escribà Montagut X, Wheater S (2025). dsBaseClient: ‘DataSHIELD’ Client Side Base Functions. R package version 6.3.5.
[2] Gaye A, Marcon Y, Isaeva J, LaFlamme P, Turner A, Jones E, Minion J, Boyd A, Newby C, Nuotio M, Wilson R, Butters O, Murtagh B, Demir I, Doiron D, Giepmans L, Wallace S, Budin-Ljøsne I, Oliver Schmidt C, Boffetta P, Boniol M, Bota M, Carter K, deKlerk N, Dibben C, Francis R, Hiekkalinna T, Hveem K, Kvaløy K, Millar S, Perry I, Peters A, Phillips C, Popham F, Raab G, Reischl E, Sheehan N, Waldenberger M, Perola M, van den Heuvel E, Macleod J, Knoppers B, Stolk R, Fortier I, Harris J, Woffenbuttel B, Murtagh M, Ferretti V, Burton P (2014). “DataSHIELD: taking the analysis to the data, not the data to the analysis.” International Journal of Epidemiology, 43(6), 1929-1944. https://doi.org/10.1093/ije/dyu188.
[3] Wilson R, W. Butters O, Avraam D, Baker J, Tedds J, Turner A, Murtagh M, R. Burton P (2017). “DataSHIELD – New Directions and Dimensions.” Data Science Journal, 16(21), 1-21. https://doi.org/10.5334/dsj-2017-021.
[4] Avraam D, Wilson R, Aguirre Chan N, Banerjee S, Bishop T, Butters O, Cadman T, Cederkvist L, Duijts L, Escribà Montagut X, Garner H, Gonçalves G, González J, Haakma S, Hartlev M, Hasenauer J, Huth M, Hyde E, Jaddoe V, Marcon Y, Mayrhofer M, Molnar-Gabor F, Morgan A, Murtagh M, Nestor M, Nybo Andersen A, Parker S, Pinot de Moira A, Schwarz F, Strandberg-Larsen K, Swertz M, Welten M, Wheater S, Burton P (2024). “DataSHIELD: mitigating disclosure risk in a multi-site federated analysis platform.” Bioinformatics Advances, 5(1), 1-21. https://doi.org/10.1093/bioadv/vbaf046.
If we set the argument fixed.dummy.vars = TRUE,
baseline.level = 1 and forced.factor.levels = c(1,2,3,4,5).
The input vector is converted to the following matrix of dummy variables:
-
DV2
DV3
DV4
DV5
0
0
0
0
1
0
0
0
0
0
0
0
0
1
0
0
0
0
1
0
0
0
1
0
0
0
0
0
0
1
0
0
0
0
1
0
For the same example if the baseline.level = 3 then the matrix is:
+
DV2
DV3
DV4
DV5
0
0
0
0
1
0
0
0
0
0
0
0
0
1
0
0
0
0
1
0
0
0
1
0
0
0
0
0
0
1
0
0
0
0
1
0
0
0
0
1
For the same example if the baseline.level = 3 then the matrix is:
DV1
DV2
DV4
DV5
1
0
0
0
0
1
0
0
1
0
0
0
0
0
0
0
0
0
1
0
0
0
1
0
1
0
0
0
0
0
0
0
0
0
1
0
0
0
0
1
In the first instance the first row of the matrix has zeros in all entries indicating
that the first data point belongs to level 1 (as the baseline level is equal to 1).
The second row has 1 at the first (DV2) column and zeros elsewhere,
@@ -229,11 +229,11 @@
Display missing data patterns with disclosure control
+
+
ds.mdPattern.Rd
+
+
+
+
This function is a client-side wrapper for the server-side mdPatternDS
+function. It generates a missing data pattern matrix similar to mice::md.pattern but
+with disclosure control applied to prevent revealing small cell counts.
+
+
+
+
ds.mdPattern(x =NULL, type ="split", datasources =NULL)
+
+
+
+
Arguments
+
+
+
x
+
a character string specifying the name of a data frame or matrix on the
+server-side containing the data to analyze.
+
+
+
type
+
a character string specifying the output type. If 'split' (default),
+returns separate patterns for each study. If 'combine', attempts to pool patterns
+across studies.
For type='split': A list with one element per study, each containing:
pattern
+
The missing data pattern matrix for that study
+
+
valid
+
Logical indicating if all patterns meet disclosure requirements
+
+
message
+
A message describing the validity status
+
+
+
For type='combine': A list containing:
pattern
+
The pooled missing data pattern matrix across all studies
+
+
valid
+
Logical indicating if all pooled patterns meet disclosure requirements
+
+
message
+
A message describing the validity status
+
+
+
+
+
Details
+
The function calls the server-side mdPatternDS function which uses
+mice::md.pattern to analyze missing data patterns. Patterns with counts below the
+disclosure threshold (default: nfilter.tab = 3) are suppressed to maintain privacy.
+
Output Format:
+- Each row represents a missing data pattern
+- Pattern counts are shown in row names (e.g., "150", "25")
+- Columns show 1 if the variable is observed, 0 if missing
+- Last column shows the total number of missing values per pattern
+- Last row shows the total number of missing values per variable
+
Disclosure Control:
+
Suppressed patterns (count below threshold) are indicated by:
+- Row name: "suppressed(<N>)" where N is the threshold
+- All pattern values set to NA
+- Summary row also suppressed to prevent back-calculation
+
Pooling Behavior (type='combine'):
+
When pooling across studies, the function uses a conservative approach
+for disclosure control:
+
1. Identifies identical missing patterns across studies
+2. EXCLUDES suppressed patterns from pooling - patterns suppressed in
+ ANY study are not included in the pooled count
+3. Sums counts only for non-suppressed identical patterns
+4. Re-validates pooled counts against disclosure threshold
+
Important: This conservative approach means:
+- Pooled counts may be underestimates if some studies had suppressed patterns
+- This prevents disclosure through subtraction (e.g., if study A shows count=5
+ and pool shows count=7, one could deduce study B has count=2, violating disclosure)
+- Different patterns across studies are preserved separately in the pooled result
+
+
+
Author
+
Xavier Escribà Montagut for DataSHIELD Development Team
+
+
+
+
Examples
+
if(FALSE){# \dontrun{
+## Version 6, for version 5 see the Wiki
+
+# Connecting to the Opal servers
+
+require('DSI')
+require('DSOpal')
+require('dsBaseClient')
+
+builder<-DSI::newDSLoginBuilder()
+builder$append(server ="study1",
+ url ="http://192.168.56.100:8080/",
+ user ="administrator", password ="datashield_test&",
+ table ="CNSIM.CNSIM1", driver ="OpalDriver")
+builder$append(server ="study2",
+ url ="http://192.168.56.100:8080/",
+ user ="administrator", password ="datashield_test&",
+ table ="CNSIM.CNSIM2", driver ="OpalDriver")
+logindata<-builder$build()
+
+connections<-DSI::datashield.login(logins =logindata, assign =TRUE, symbol ="D")
+
+# Get missing data patterns for each study separately
+patterns_split<-ds.mdPattern(x ="D", type ="split", datasources =connections)
+
+# View results for study1
+print(patterns_split$study1$pattern)
+# var1 var2 var3
+# 150 1 1 1 0 <- 150 obs complete
+# 25 0 1 1 1 <- 25 obs missing var1
+# 25 0 0 25 <- Summary: total missing per variable
+
+# Get pooled missing data patterns across studies
+patterns_pooled<-ds.mdPattern(x ="D", type ="combine", datasources =connections)
+print(patterns_pooled$pattern)
+
+# Example with suppressed patterns:
+# If study1 has a pattern with count=2 (suppressed) and study2 has same pattern
+# with count=5 (valid), the pooled result will show count=5 (conservative approach)
+# A warning will indicate: "Pooled counts may underestimate the true total"
+
+# Clear the Datashield R sessions and logout
+datashield.logout(connections)
+}# }
+
+
diff --git a/docs/sitemap.xml b/docs/sitemap.xml
index 39df9eef..fe21f864 100644
--- a/docs/sitemap.xml
+++ b/docs/sitemap.xml
@@ -6,6 +6,7 @@
/reference/checkClass.html/reference/colPercent.html/reference/computeWeightedMeans.html
+/reference/dot-pool_md_patterns.html/reference/ds.Boole.html/reference/ds.abs.html/reference/ds.asCharacter.html
@@ -84,6 +85,7 @@
/reference/ds.matrixInvert.html/reference/ds.matrixMult.html/reference/ds.matrixTranspose.html
+/reference/ds.mdPattern.html/reference/ds.mean.html/reference/ds.meanByClass.html/reference/ds.meanSdGp.html
diff --git a/man/dot-pool_md_patterns.Rd b/man/dot-pool_md_patterns.Rd
new file mode 100644
index 00000000..baabf3e9
--- /dev/null
+++ b/man/dot-pool_md_patterns.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ds.mdPattern.R
+\name{.pool_md_patterns}
+\alias{.pool_md_patterns}
+\title{Pool missing data patterns across studies}
+\usage{
+.pool_md_patterns(patterns_list, study_names)
+}
+\arguments{
+\item{patterns_list}{List of pattern matrices from each study}
+
+\item{study_names}{Names of the studies}
+}
+\value{
+Pooled pattern matrix
+}
+\description{
+Internal function to pool md.pattern results from multiple studies
+}
+\keyword{internal}
diff --git a/man/ds.asFactor.Rd b/man/ds.asFactor.Rd
index c412df38..24125632 100644
--- a/man/ds.asFactor.Rd
+++ b/man/ds.asFactor.Rd
@@ -95,7 +95,7 @@ If we set the argument \code{fixed.dummy.vars = TRUE},
\code{baseline.level = 1} and \code{forced.factor.levels = c(1,2,3,4,5)}.
The input vector is converted to the following matrix of dummy variables:
-\tabular{rrrrr}{
+\tabular{rrrr}{
\strong{DV2} \tab \strong{DV3} \tab \strong{DV4} \tab \strong{DV5} \cr
0 \tab 0 \tab 0 \tab 0\cr
1 \tab 0 \tab 0 \tab 0\cr
diff --git a/man/ds.colnames.Rd b/man/ds.colnames.Rd
index e7391081..9460a567 100644
--- a/man/ds.colnames.Rd
+++ b/man/ds.colnames.Rd
@@ -9,20 +9,20 @@ ds.colnames(x = NULL, datasources = NULL)
\arguments{
\item{x}{a character string providing the name of the input data frame or matrix.}
-\item{datasources}{a list of \code{\link[DSI]{DSConnection-class}} objects obtained after login.
+\item{datasources}{a list of \code{\link[DSI]{DSConnection-class}} objects obtained after login.
If the \code{datasources} argument is not specified
the default set of connections will be used: see \code{\link[DSI]{datashield.connections_default}}.}
}
\value{
-\code{ds.colnames} returns the column names of
+\code{ds.colnames} returns the column names of
the specified server-side data frame or matrix.
}
\description{
-Retrieves column names of an R object on the server-side.
+Retrieves column names of an R object on the server-side.
This function is similar to R function \code{colnames}.
}
\details{
-The input is restricted to the object of type \code{data.frame} or \code{matrix}.
+The input is restricted to the object of type \code{data.frame} or \code{matrix}.
Server function called: \code{colnamesDS}
}
@@ -37,28 +37,28 @@ Server function called: \code{colnamesDS}
require('dsBaseClient')
builder <- DSI::newDSLoginBuilder()
- builder$append(server = "study1",
- url = "http://192.168.56.100:8080/",
- user = "administrator", password = "datashield_test&",
+ builder$append(server = "study1",
+ url = "http://192.168.56.100:8080/",
+ user = "administrator", password = "datashield_test&",
table = "CNSIM.CNSIM1", driver = "OpalDriver")
- builder$append(server = "study2",
- url = "http://192.168.56.100:8080/",
- user = "administrator", password = "datashield_test&",
+ builder$append(server = "study2",
+ url = "http://192.168.56.100:8080/",
+ user = "administrator", password = "datashield_test&",
table = "CNSIM.CNSIM2", driver = "OpalDriver")
builder$append(server = "study3",
- url = "http://192.168.56.100:8080/",
- user = "administrator", password = "datashield_test&",
+ url = "http://192.168.56.100:8080/",
+ user = "administrator", password = "datashield_test&",
table = "CNSIM.CNSIM3", driver = "OpalDriver")
logindata <- builder$build()
-
+
# Log onto the remote Opal training servers
- connections <- DSI::datashield.login(logins = logindata, assign = TRUE, symbol = "D")
+ connections <- DSI::datashield.login(logins = logindata, assign = TRUE, symbol = "D")
# Getting column names of the R objects stored in the server-side
ds.colnames(x = "D",
datasources = connections[1]) #only the first server ("study1") is used
# Clear the Datashield R sessions and logout
- datashield.logout(connections)
+ datashield.logout(connections)
}
}
\seealso{
diff --git a/man/ds.mdPattern.Rd b/man/ds.mdPattern.Rd
new file mode 100644
index 00000000..b1bacc0b
--- /dev/null
+++ b/man/ds.mdPattern.Rd
@@ -0,0 +1,126 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ds.mdPattern.R
+\name{ds.mdPattern}
+\alias{ds.mdPattern}
+\title{Display missing data patterns with disclosure control}
+\usage{
+ds.mdPattern(x = NULL, type = "split", datasources = NULL)
+}
+\arguments{
+\item{x}{a character string specifying the name of a data frame or matrix on the
+server-side containing the data to analyze.}
+
+\item{type}{a character string specifying the output type. If 'split' (default),
+returns separate patterns for each study. If 'combine', attempts to pool patterns
+across studies.}
+
+\item{datasources}{a list of \code{\link[DSI]{DSConnection-class}} objects obtained
+after login. If the \code{datasources} argument is not specified, the default set of
+connections will be used: see \code{\link[DSI]{datashield.connections_default}}.}
+}
+\value{
+For type='split': A list with one element per study, each containing:
+\describe{
+ \item{pattern}{The missing data pattern matrix for that study}
+ \item{valid}{Logical indicating if all patterns meet disclosure requirements}
+ \item{message}{A message describing the validity status}
+}
+
+For type='combine': A list containing:
+\describe{
+ \item{pattern}{The pooled missing data pattern matrix across all studies}
+ \item{valid}{Logical indicating if all pooled patterns meet disclosure requirements}
+ \item{message}{A message describing the validity status}
+}
+}
+\description{
+This function is a client-side wrapper for the server-side mdPatternDS
+function. It generates a missing data pattern matrix similar to mice::md.pattern but
+with disclosure control applied to prevent revealing small cell counts.
+}
+\details{
+The function calls the server-side mdPatternDS function which uses
+mice::md.pattern to analyze missing data patterns. Patterns with counts below the
+disclosure threshold (default: nfilter.tab = 3) are suppressed to maintain privacy.
+
+\strong{Output Format:}
+- Each row represents a missing data pattern
+- Pattern counts are shown in row names (e.g., "150", "25")
+- Columns show 1 if the variable is observed, 0 if missing
+- Last column shows the total number of missing values per pattern
+- Last row shows the total number of missing values per variable
+
+\strong{Disclosure Control:}
+
+Suppressed patterns (count below threshold) are indicated by:
+- Row name: "suppressed(<N>)" where N is the threshold
+- All pattern values set to NA
+- Summary row also suppressed to prevent back-calculation
+
+\strong{Pooling Behavior (type='combine'):}
+
+When pooling across studies, the function uses a \emph{conservative approach}
+for disclosure control:
+
+1. Identifies identical missing patterns across studies
+2. \strong{EXCLUDES suppressed patterns from pooling} - patterns suppressed in
+ ANY study are not included in the pooled count
+3. Sums counts only for non-suppressed identical patterns
+4. Re-validates pooled counts against disclosure threshold
+
+\strong{Important:} This conservative approach means:
+- Pooled counts may be \emph{underestimates} if some studies had suppressed patterns
+- This prevents disclosure through subtraction (e.g., if study A shows count=5
+ and pool shows count=7, one could deduce study B has count=2, violating disclosure)
+- Different patterns across studies are preserved separately in the pooled result
+}
+\examples{
+\dontrun{
+ ## Version 6, for version 5 see the Wiki
+
+ # Connecting to the Opal servers
+
+ require('DSI')
+ require('DSOpal')
+ require('dsBaseClient')
+
+ builder <- DSI::newDSLoginBuilder()
+ builder$append(server = "study1",
+ url = "http://192.168.56.100:8080/",
+ user = "administrator", password = "datashield_test&",
+ table = "CNSIM.CNSIM1", driver = "OpalDriver")
+ builder$append(server = "study2",
+ url = "http://192.168.56.100:8080/",
+ user = "administrator", password = "datashield_test&",
+ table = "CNSIM.CNSIM2", driver = "OpalDriver")
+ logindata <- builder$build()
+
+ connections <- DSI::datashield.login(logins = logindata, assign = TRUE, symbol = "D")
+
+ # Get missing data patterns for each study separately
+ patterns_split <- ds.mdPattern(x = "D", type = "split", datasources = connections)
+
+ # View results for study1
+ print(patterns_split$study1$pattern)
+ # var1 var2 var3
+ # 150 1 1 1 0 <- 150 obs complete
+ # 25 0 1 1 1 <- 25 obs missing var1
+ # 25 0 0 25 <- Summary: total missing per variable
+
+ # Get pooled missing data patterns across studies
+ patterns_pooled <- ds.mdPattern(x = "D", type = "combine", datasources = connections)
+ print(patterns_pooled$pattern)
+
+ # Example with suppressed patterns:
+ # If study1 has a pattern with count=2 (suppressed) and study2 has same pattern
+ # with count=5 (valid), the pooled result will show count=5 (conservative approach)
+ # A warning will indicate: "Pooled counts may underestimate the true total"
+
+ # Clear the Datashield R sessions and logout
+ datashield.logout(connections)
+}
+
+}
+\author{
+Xavier Escribà Montagut for DataSHIELD Development Team
+}
diff --git a/tests/docker/armadillo/standard/config/application.yml b/tests/docker/armadillo/standard/config/application.yml
index 12b78ec8..54e90c36 100644
--- a/tests/docker/armadillo/standard/config/application.yml
+++ b/tests/docker/armadillo/standard/config/application.yml
@@ -14,17 +14,11 @@ armadillo:
# oidc-admin-user: user@yourdomain.org
profiles:
- name: default
- image: datashield/rock-omicron-karma:devel
+ image: datashield/rock-quebrada-lamda-permissive:latest
port: 8085
host: default
package-whitelist: # Packages for 'permissive'
- dsBase
- - dsMediation
- - dsMTLBase
- - dsSurvival
- - dsTidyverse
- - dsExposome
- - dsOmics
- resourcer
function-blacklist: [ ]
options:
diff --git a/tests/testthat.R b/tests/testthat.R
index 3e6bbe15..389ee66c 100644
--- a/tests/testthat.R
+++ b/tests/testthat.R
@@ -9,4 +9,5 @@
library(testthat)
library(dsBaseClient)
-test_check("dsBaseClient")
+if (identical(Sys.getenv("NOT_CRAN"), "true"))
+ test_check("dsBaseClient")
diff --git a/tests/testthat/perf_files/default_perf_profile.csv b/tests/testthat/perf_files/default_perf_profile.csv
index d75711a3..ead05698 100644
--- a/tests/testthat/perf_files/default_perf_profile.csv
+++ b/tests/testthat/perf_files/default_perf_profile.csv
@@ -6,7 +6,7 @@
"ds.asNumeric::perf:0","2.185","0.5","2"
"ds.assign::perf::0","5.490","0.5","2"
"ds.class::perf::combine:0","4.760","0.5","2"
-"ds.colnames::perf:0","4.159","0.5","2"
+"ds.colnames::perf:0","9.578","0.5","2"
"ds.exists::perf::combine:0","11.09","0.5","2"
"ds.length::perf::combine:0","9.479","0.5","2"
"ds.mean::perf::combine:0","9.650","0.5","2"
diff --git a/tests/testthat/test-arg-ds.foobar.R b/tests/testthat/test-arg-ds.foobar.R
index 36d5ac97..19f959f2 100644
--- a/tests/testthat/test-arg-ds.foobar.R
+++ b/tests/testthat/test-arg-ds.foobar.R
@@ -29,10 +29,9 @@ test_that("setup", {
test_that("NULL connections", {
calltext <- call("fooBarDS")
if (ds.test_env$driver == "ArmadilloDriver") {
- expect_error(datashield.aggregate(conns=NULL, expr=calltext), "no applicable method for `@` applied to an object of class \"NULL\"", fixed=TRUE)
-# expect_error(datashield.aggregate(conns=NULL, expr=calltext), "trying to get slot \"name\" from an object of a basic class (\"NULL\") with no slots", fixed=TRUE)
+ expect_error(datashield.aggregate(conns=NULL, expr=calltext), "unable to find an inherited method for function 'dsIsAsync' for signature 'conn = \"NULL\"'", fixed=TRUE)
} else if (ds.test_env$driver == "OpalDriver") {
- expect_error(datashield.aggregate(conns=NULL, expr=calltext), "no applicable method for `@` applied to an object of class \"NULL\"", fixed=TRUE)
+ expect_error(datashield.aggregate(conns=NULL, expr=calltext), "unable to find an inherited method for function 'dsIsAsync' for signature 'conn = \"NULL\"'", fixed=TRUE)
} else {
fail(message = "Unknown driver type", info = ds.test_env$driver)
}
@@ -70,10 +69,9 @@ test_that("non existent aggregate foobarDS", {
test_that("NULL connections", {
calltext <- call("fooBarDS")
if (ds.test_env$driver == "ArmadilloDriver") {
- expect_error(datashield.assign(conns=NULL, symbol="new_obj", value=calltext), "no applicable method for `@` applied to an object of class \"NULL\"", fixed=TRUE)
-# expect_error(datashield.assign(conns=NULL, symbol="new_obj", value=calltext), "trying to get slot \"name\" from an object of a basic class (\"NULL\") with no slots", fixed=TRUE)
+ expect_error(datashield.assign(conns=NULL, symbol="new_obj", value=calltext), "unable to find an inherited method for function 'dsIsAsync' for signature 'conn = \"NULL\"'", fixed=TRUE)
} else if (ds.test_env$driver == "OpalDriver") {
- expect_error(datashield.assign(conns=NULL, symbol="new_obj", value=calltext), "no applicable method for `@` applied to an object of class \"NULL\"", fixed=TRUE)
+ expect_error(datashield.assign(conns=NULL, symbol="new_obj", value=calltext), "unable to find an inherited method for function 'dsIsAsync' for signature 'conn = \"NULL\"'", fixed=TRUE)
} else {
fail(message = "Unknown driver type", info = ds.test_env$driver)
}
diff --git a/tests/testthat/test-smk-ds.colnames.R b/tests/testthat/test-smk-ds.colnames.R
index 0e9aaf35..ee98cc2e 100644
--- a/tests/testthat/test-smk-ds.colnames.R
+++ b/tests/testthat/test-smk-ds.colnames.R
@@ -25,7 +25,6 @@ test_that("setup", {
# Tests
#
-options(datashield.errors.print = TRUE)
# context("ds.colnames::smk")
test_that("simple colnames", {
myvectors <- c("D$LAB_TSC", "D$LAB_TRIG")
@@ -48,18 +47,21 @@ test_that("simple colnames", {
test_that("fails if the object does not exist", {
expect_error(
ds.colnames("non_existing_df"),
- regexp = "'non_existing_df' does not exist",
+ regexp = "There are some DataSHIELD errors, list them with datashield.error()",
ignore.case = TRUE
)
})
-test_that("fails if object is not a data frame or matrix", {
- expect_error(
- ds.colnames("D$LAB_TSC"),
- regexp = "must be of type data.frame or matrix",
- ignore.case = TRUE
- )
-})
+###########################################
+### Remote checks not performed ###
+###########################################
+# test_that("fails if object is not a data frame or matrix", {
+# expect_error(
+# ds.colnames("D$LAB_TSC"),
+# regexp = "must be of type data.frame or matrix",
+# ignore.case = TRUE
+# )
+# })
#
# Done