diff --git a/R/NetworkDataCompanion.R b/R/NetworkDataCompanion.R index fc56b9a..dfa8340 100644 --- a/R/NetworkDataCompanion.R +++ b/R/NetworkDataCompanion.R @@ -521,17 +521,14 @@ NetworkDataCompanion=setRefClass("NetworkDataCompanion", missing_df = data.frame("uuid"=names(methylation_betas[,-1]), "prop_miss"=apply(methylation_betas[,-1],2, function(x){sum(is.na(x))/length(x)})) - tcga_barcodes = ndc$mapUUIDtoTCGA(missing_df$uuid) - keep_barcodes = missing_df %>% inner_join(tcga_barcodes, by=c("uuid"="file_id")) %>% - dplyr::rename("TCGA_barcode"=submitter_id) %>% - mutate(TCGA_sample_and_type = ndc$extractSampleAndType(TCGA_barcode)) %>% + tcga_barcodes = mapUUIDtoTCGA(missing_df$uuid) + missing_df %>% inner_join(tcga_barcodes, by=c("uuid"="file_id")) %>% + dplyr::rename("TCGA_barcode" = submitter_id) %>% + mutate(TCGA_sample_and_type = extractSampleAndType(TCGA_barcode)) %>% group_by(TCGA_sample_and_type) %>% - summarize("TCGA_barcode_min_prop_miss"=TCGA_barcode[which.min(prop_miss)]) %>% - pull(TCGA_barcode_min_prop_miss) - - keep_uuids = tcga_barcodes %>% dplyr::filter(submitter_id %in% keep_barcodes) %>% - pull(file_id) - return(keep_uuids) + summarize("uuid_min_prop_miss"=uuid[which.min(prop_miss)]) %>% + pull(uuid_min_prop_miss) %>% + return() }, ## Filter samples indicated by *TCGA_barcodes* based on the method *method* and threshold *threshold* diff --git a/tests/testthat/test_filterDuplicatesMethylationMissingness.R b/tests/testthat/test_filterDuplicatesMethylationMissingness.R new file mode 100644 index 0000000..d428d22 --- /dev/null +++ b/tests/testthat/test_filterDuplicatesMethylationMissingness.R @@ -0,0 +1,21 @@ +context("[NetworkDataCompanion] Testing filterDuplicatesMethylationMissingness function ... ") + +test_that("Testing filterDuplicatesMethylationMissingness",{ + # samples loaded from quickstart, Harvard dataverse archive + # data_download/methylation/tcga_coad_cms1_methylations.txt + id_map = read.csv("testdata/duplicate_meth_uuids.csv", row.names=1) + # fake methylation data with various missingness + beta_vals = matrix(nrow=3,ncol=6) + beta_vals[,3] = rep(1,3) # this should be picked + beta_vals[,4] = c(NA,rep(1,2)) + beta_vals[,5] = rep(1,3) + beta_df = data.frame(beta_vals) + beta_df[,1] = c("cg00000001", + "cg00000002", + "cg00000003") + names(beta_df) =c("probeID",id_map$file_id) + + my_friend = NetworkDataCompanion::CreateNetworkDataCompanionObject() + keep_uuids = my_friend$filterDuplicatesMethylationMissingness(beta_df) + expect_equal(keep_uuids, c(id_map$file_id[2],id_map$file_id[4])) +}) diff --git a/tests/testthat/testdata/duplicate_meth_uuids.csv b/tests/testthat/testdata/duplicate_meth_uuids.csv new file mode 100644 index 0000000..ab69d85 --- /dev/null +++ b/tests/testthat/testdata/duplicate_meth_uuids.csv @@ -0,0 +1,6 @@ +"","file_id","submitter_id" +"ff44ff4e-15e9-46b6-986d-484dfd08bb16","ff44ff4e-15e9-46b6-986d-484dfd08bb16","TCGA-A6-6780-01A" +"6852b9a2-de41-45a4-a63b-a4a415b5cac4","6852b9a2-de41-45a4-a63b-a4a415b5cac4","TCGA-A6-3809-01A" +"8b6e4521-ed23-4ddd-96a4-941d36065418","8b6e4521-ed23-4ddd-96a4-941d36065418","TCGA-A6-3809-01B" +"d0730cce-7c73-4125-9022-5dc1069920b6","d0730cce-7c73-4125-9022-5dc1069920b6","TCGA-A6-6780-01A" +"391664d3-d617-4b6f-9d81-78cf83f61520","391664d3-d617-4b6f-9d81-78cf83f61520","TCGA-A6-6780-01B"