From 7492de9be6c1a1de18f0c87cd184f0491f2ea240 Mon Sep 17 00:00:00 2001 From: katehoffshutta <43797774+katehoffshutta@users.noreply.github.com> Date: Mon, 5 May 2025 19:08:37 -0400 Subject: [PATCH] added unit test for duplicate filtering --- R/NetworkDataCompanion.R | 17 +++++++-------- ...t_filterDuplicatesMethylationMissingness.R | 21 +++++++++++++++++++ .../testdata/duplicate_meth_uuids.csv | 6 ++++++ 3 files changed, 34 insertions(+), 10 deletions(-) create mode 100644 tests/testthat/test_filterDuplicatesMethylationMissingness.R create mode 100644 tests/testthat/testdata/duplicate_meth_uuids.csv diff --git a/R/NetworkDataCompanion.R b/R/NetworkDataCompanion.R index fc56b9a..dfa8340 100644 --- a/R/NetworkDataCompanion.R +++ b/R/NetworkDataCompanion.R @@ -521,17 +521,14 @@ NetworkDataCompanion=setRefClass("NetworkDataCompanion", missing_df = data.frame("uuid"=names(methylation_betas[,-1]), "prop_miss"=apply(methylation_betas[,-1],2, function(x){sum(is.na(x))/length(x)})) - tcga_barcodes = ndc$mapUUIDtoTCGA(missing_df$uuid) - keep_barcodes = missing_df %>% inner_join(tcga_barcodes, by=c("uuid"="file_id")) %>% - dplyr::rename("TCGA_barcode"=submitter_id) %>% - mutate(TCGA_sample_and_type = ndc$extractSampleAndType(TCGA_barcode)) %>% + tcga_barcodes = mapUUIDtoTCGA(missing_df$uuid) + missing_df %>% inner_join(tcga_barcodes, by=c("uuid"="file_id")) %>% + dplyr::rename("TCGA_barcode" = submitter_id) %>% + mutate(TCGA_sample_and_type = extractSampleAndType(TCGA_barcode)) %>% group_by(TCGA_sample_and_type) %>% - summarize("TCGA_barcode_min_prop_miss"=TCGA_barcode[which.min(prop_miss)]) %>% - pull(TCGA_barcode_min_prop_miss) - - keep_uuids = tcga_barcodes %>% dplyr::filter(submitter_id %in% keep_barcodes) %>% - pull(file_id) - return(keep_uuids) + summarize("uuid_min_prop_miss"=uuid[which.min(prop_miss)]) %>% + pull(uuid_min_prop_miss) %>% + return() }, ## Filter samples indicated by *TCGA_barcodes* based on the method *method* and threshold *threshold* diff --git 
a/tests/testthat/test_filterDuplicatesMethylationMissingness.R b/tests/testthat/test_filterDuplicatesMethylationMissingness.R
new file mode 100644
index 0000000..d428d22
--- /dev/null
+++ b/tests/testthat/test_filterDuplicatesMethylationMissingness.R
@@ -0,0 +1,21 @@
+context("[NetworkDataCompanion] Testing filterDuplicatesMethylationMissingness function ... ")
+
+test_that("Testing filterDuplicatesMethylationMissingness",{
+  # samples loaded from quickstart, Harvard dataverse archive
+  # data_download/methylation/tcga_coad_cms1_methylations.txt
+  id_map = read.csv("testdata/duplicate_meth_uuids.csv", row.names=1)
+  # fake beta values; matrix() fills with NA, so unset columns 2 and 6 are fully missing
+  beta_vals = matrix(nrow=3,ncol=6)
+  beta_vals[,3] = rep(1,3) # no NAs: becomes file_id[2], expected to be kept
+  beta_vals[,4] = c(NA,rep(1,2)) # one NA: becomes file_id[3]
+  beta_vals[,5] = rep(1,3) # no NAs: becomes file_id[4], expected to be kept
+  beta_df = data.frame(beta_vals)
+  beta_df[,1] = c("cg00000001",
+                  "cg00000002",
+                  "cg00000003")
+  names(beta_df) =c("probeID",id_map$file_id)
+
+  my_friend = NetworkDataCompanion::CreateNetworkDataCompanionObject()
+  keep_uuids = my_friend$filterDuplicatesMethylationMissingness(beta_df)
+  expect_equal(keep_uuids, c(id_map$file_id[2],id_map$file_id[4]))
+})
diff --git a/tests/testthat/testdata/duplicate_meth_uuids.csv b/tests/testthat/testdata/duplicate_meth_uuids.csv
new file mode 100644
index 0000000..ab69d85
--- /dev/null
+++ b/tests/testthat/testdata/duplicate_meth_uuids.csv
@@ -0,0 +1,6 @@
+"","file_id","submitter_id"
+"ff44ff4e-15e9-46b6-986d-484dfd08bb16","ff44ff4e-15e9-46b6-986d-484dfd08bb16","TCGA-A6-6780-01A"
+"6852b9a2-de41-45a4-a63b-a4a415b5cac4","6852b9a2-de41-45a4-a63b-a4a415b5cac4","TCGA-A6-3809-01A"
+"8b6e4521-ed23-4ddd-96a4-941d36065418","8b6e4521-ed23-4ddd-96a4-941d36065418","TCGA-A6-3809-01B"
+"d0730cce-7c73-4125-9022-5dc1069920b6","d0730cce-7c73-4125-9022-5dc1069920b6","TCGA-A6-6780-01A"
+"391664d3-d617-4b6f-9d81-78cf83f61520","391664d3-d617-4b6f-9d81-78cf83f61520","TCGA-A6-6780-01B"