From 6567a2ffcf4e497800040dabcd0fdaa3fdc2a9be Mon Sep 17 00:00:00 2001 From: katehoffshutta <43797774+katehoffshutta@users.noreply.github.com> Date: Mon, 5 May 2025 11:49:31 -0400 Subject: [PATCH 1/2] outline additional duplicate handling functions --- R/NetworkDataCompanion.R | 59 +++++++++++++++++++++++++++++++++------- 1 file changed, 49 insertions(+), 10 deletions(-) diff --git a/R/NetworkDataCompanion.R b/R/NetworkDataCompanion.R index 641ecee..b0a9616 100644 --- a/R/NetworkDataCompanion.R +++ b/R/NetworkDataCompanion.R @@ -115,9 +115,6 @@ NetworkDataCompanion=setRefClass("NetworkDataCompanion", logCPM=log(cpm_vals+1))) }, - #### more methods go here - - # maybe have this presaved in class extractSampleOnly = function(TCGA_barcodes){ return(sapply(TCGA_barcodes, substr, 1, 12)) }, @@ -240,7 +237,6 @@ NetworkDataCompanion=setRefClass("NetworkDataCompanion", colnames(mymap) = c("probeID","geneNames","ensemblID","distToTSS") mymap = as.data.frame(mymap) - # iterate through map with for loop # please feel free to vectorize this etc for(i in 1:nrow(smallManifest)) @@ -271,7 +267,9 @@ NetworkDataCompanion=setRefClass("NetworkDataCompanion", if(!mapToNearest) + { return(mymap) + } if(mapToNearest) { processRow = function(x) # x is one row of mymap @@ -479,7 +477,49 @@ NetworkDataCompanion=setRefClass("NetworkDataCompanion", return(which(!duplicate_throwout)) }, - ## Filter out all duplicates based on sequencing depth, take random one if no info on seq depth for all vials + ## filter duplicates based on tumor purity + ## returns a list of TCGA barcodes to keep + ## there may still be some duplicates, where purity info is not available + filterDuplicatesPurity = function(TCGA_barcodes,method="ESTIMATE") + { + if (!(method %in% c("ESTIMATE", + "ABSOLUTE", "LUMP", "IHC", "CPE"))) + { + stop("Error: Expected method name should be ESTIMATE, ABSOLUTE, LUMP, IHC, CPE") + } + + out_df = TCGA_purities %>% dplyr::select(all_of(c("TCGA_barcode",method))) %>% + na.omit() %>% + inner_join(data.frame("TCGA_barcode"=TCGA_barcodes),by="TCGA_barcode") %>% + mutate("TCGA_sample_and_type" = extractSampleAndType(TCGA_barcode)) %>% + rename("purity"=method) %>% + group_by(TCGA_sample_and_type) %>% + summarize("TCGA_barcode_max_purity"=which.max(purity)) %>% + pull(TCGA_barcode_max_purity) %>% + return() + }, + + filterDuplicatesRandom = function(TCGA_barcodes,seed = 1989){ + set.seed(seed) + out_df = data.frame("TCGA_barcode"=TCGA_barcodes) %>% + mutate(TCGA_sample_and_type = extractSampleAndType(TCGA_barcodes)) + + permuted_df = out_df[sample(1:nrow(out_df)),] + permuted_df %>% + dplyr::filter(!duplicated(TCGA_sample_and_type)) %>% + pull(TCGA_barcode) %>% + return() + }, + + # filter methylation duplicates based on less overall missingness + # in measured beta values + filterDuplicatesMethylationMissingness = function(x) + { + + } + + ## Filter out all duplicates based on sequencing depth, + ## take random one if no info on seq depth for all vials ## Returns indices in given tcga barcodes to KEEP filterDuplicatesSeqDepthOther = function(expression_count_matrix, tcga_barcodes){ sample_vials_ge <- extractSampleAndTypeAndVial(colnames(expression_count_matrix)) @@ -515,7 +555,8 @@ NetworkDataCompanion=setRefClass("NetworkDataCompanion", } return(which(!duplicate_throwout)) }, - + + ## Filter samples indicated by *TCGA_barcodes* based on the method *method* and threshold *threshold* ## Returns a list of indices indicating which samples should be kept filterPurity = function(TCGA_barcodes, method="ESTIMATE", threshold=.6){ @@ -813,11 +854,8 @@ NetworkDataCompanion=setRefClass("NetworkDataCompanion", #' @export CreateNetworkDataCompanionObject CreateNetworkDataCompanionObject <- function(clinical_patient_file=NULL, project_name="default_project"){ - ## Load purities for purity package + ## Load purities from purity package obj <- CreateTCGAPurityFilteringObject() - - #this is an easy hack for not breaking, but something smarter would be great - #TODO skip purityfiltering completely and do it here instead with_purity = c("ACC","BLCA","BRCA","CESC","COAD","GBM", "HNSC","KIRC","KIRP","KICH","LGG","LIHC", "LUAD","LUSC","OV","PRAD","READ","SKCM", @@ -826,6 +864,7 @@ CreateNetworkDataCompanionObject <- function(clinical_patient_file=NULL, project if(project_name %in% with_purity){ purities <- obj$get_tissue_purities(cancer_type = project_name) + purities$TCGA_barcode = row.names(purities) } ## Load patient's clinical data From f1f2115c7a2d2e53b5e9b713f7a399f052afaa9d Mon Sep 17 00:00:00 2001 From: katehoffshutta <43797774+katehoffshutta@users.noreply.github.com> Date: Mon, 5 May 2025 13:24:09 -0400 Subject: [PATCH 2/2] update duplicate filtering --- R/NetworkDataCompanion.R | 63 ++++++------------- tests/testthat/test_filterDuplicatesRandom.R | 19 ++++++ .../test_filterDuplicatesSeqDepthOther.R | 17 ----- 3 files changed, 39 insertions(+), 60 deletions(-) create mode 100644 tests/testthat/test_filterDuplicatesRandom.R delete mode 100644 tests/testthat/test_filterDuplicatesSeqDepthOther.R diff --git a/R/NetworkDataCompanion.R b/R/NetworkDataCompanion.R index b0a9616..fc56b9a 100644 --- a/R/NetworkDataCompanion.R +++ b/R/NetworkDataCompanion.R @@ -494,7 +494,7 @@ NetworkDataCompanion=setRefClass("NetworkDataCompanion", mutate("TCGA_sample_and_type" = extractSampleAndType(TCGA_barcode)) %>% rename("purity"=method) %>% group_by(TCGA_sample_and_type) %>% - summarize("TCGA_barcode_max_purity"=which.max(purity)) %>% + summarize("TCGA_barcode_max_purity"=TCGA_barcode[which.max(purity)]) %>% pull(TCGA_barcode_max_purity) %>% return() }, @@ -511,52 +511,29 @@ NetworkDataCompanion=setRefClass("NetworkDataCompanion", return() }, - # filter methylation duplicates based on less overall missingness - # in measured beta values - filterDuplicatesMethylationMissingness = function(x) + # filter methylation duplicates based on less + # overall missingness in the measured beta values + # methylation_betas should be a data frame with + # probes in rows and UUIDs in columns, with the + # exception of the first column which is probeID + filterDuplicatesMethylationMissingness = function(methylation_betas) { + missing_df = data.frame("uuid"=names(methylation_betas[,-1]), + "prop_miss"=apply(methylation_betas[,-1],2, + function(x){sum(is.na(x))/length(x)})) + tcga_barcodes = ndc$mapUUIDtoTCGA(missing_df$uuid) + keep_barcodes = missing_df %>% inner_join(tcga_barcodes, by=c("uuid"="file_id")) %>% + dplyr::rename("TCGA_barcode"=submitter_id) %>% + mutate(TCGA_sample_and_type = ndc$extractSampleAndType(TCGA_barcode)) %>% + group_by(TCGA_sample_and_type) %>% + summarize("TCGA_barcode_min_prop_miss"=TCGA_barcode[which.min(prop_miss)]) %>% + pull(TCGA_barcode_min_prop_miss) - } - - ## Filter out all duplicates based on sequencing depth, - ## take random one if no info on seq depth for all vials - ## Returns indices in given tcga barcodes to KEEP - filterDuplicatesSeqDepthOther = function(expression_count_matrix, tcga_barcodes){ - sample_vials_ge <- extractSampleAndTypeAndVial(colnames(expression_count_matrix)) - seq_depth <- colSums(expression_count_matrix) - duplicate_throwout <- rep(NA, length(tcga_barcodes)) - for (idx in 1:length(tcga_barcodes)) - { - if (is.na(duplicate_throwout[idx])) - { - ## find all vials and replicates of current barcode - rep_idcs <- which(extractSampleOnly(tcga_barcodes[idx]) == extractSampleOnly(tcga_barcodes)) - rep_vials <- extractSampleAndTypeAndVial(tcga_barcodes[rep_idcs]) - ## match with vials in expression matrix - mIdx <- match(rep_vials, sample_vials_ge) - ## get matched vial with highest seqdepth, just take first one if no match at all - max_sample_idx <- 1 - curr_max <- 0 - for (j in 1:length(mIdx)) - { - if (!is.na(mIdx[j])) - { - if (seq_depth[mIdx[j]] > curr_max) - { - curr_max <- seq_depth[mIdx[j]] - max_sample_idx <- rep_idcs[j] - } - } - } - ## throw out all but maximum one - duplicate_throwout[rep_idcs] <- T - duplicate_throwout[max_sample_idx] <- F - } - } - return(which(!duplicate_throwout)) + keep_uuids = tcga_barcodes %>% dplyr::filter(submitter_id %in% keep_barcodes) %>% + pull(file_id) + return(keep_uuids) }, - ## Filter samples indicated by *TCGA_barcodes* based on the method *method* and threshold *threshold* ## Returns a list of indices indicating which samples should be kept filterPurity = function(TCGA_barcodes, method="ESTIMATE", threshold=.6){ diff --git a/tests/testthat/test_filterDuplicatesRandom.R b/tests/testthat/test_filterDuplicatesRandom.R new file mode 100644 index 0000000..afc79a1 --- /dev/null +++ b/tests/testthat/test_filterDuplicatesRandom.R @@ -0,0 +1,19 @@ +context("[NetworkDataCompanion] Testing filterDuplicatesRandom function ... ") + +test_that("Testing filterDuplicatesRandom",{ + + my_friend = NetworkDataCompanion::CreateNetworkDataCompanionObject() + barcode1 = "TCGA-A1-1234-01A" + barcode2 = "TCGA-A1-1234-11A" + barcode3 = "TCGA-A1-1234-21A" + barcode4 = "TCGA-A1-1234-01B" + barcode5 = "TCGA-A1-1234-21B" + barcode6 = "TCGA-A1-1234-21C" + barcodes = c(barcode1, barcode2, barcode3, barcode4, barcode5, barcode6) + set.seed(1989) + permuted_barcodes = sample(my_friend$extractSampleAndType(barcodes)) + keep_barcodes_manual = names(permuted_barcodes[!duplicated(permuted_barcodes)]) + keep_barcodes_function = my_friend$filterDuplicatesRandom(barcodes,seed=1989) + expect_equal(keep_barcodes_manual, keep_barcodes_function) + +}) diff --git a/tests/testthat/test_filterDuplicatesSeqDepthOther.R b/tests/testthat/test_filterDuplicatesSeqDepthOther.R deleted file mode 100644 index ee2bf47..0000000 --- a/tests/testthat/test_filterDuplicatesSeqDepthOther.R +++ /dev/null @@ -1,17 +0,0 @@ -context("[NetworkDataCompanion] Testing filterDuplicatesSeqDepthOther function ... ") - -test_that("Testing filterDuplicatesSeqDepthOther",{ - - my_friend = NetworkDataCompanion::CreateNetworkDataCompanionObject() - barcode1 = "TCGA-A1-1234-01A" - barcode2 = "TCGA-A1-1234-11A" - barcode3 = "TCGA-A1-1234-21A" - barcode4 = "TCGA-A1-1234-01B" - barcode5 = "TCGA-A1-1234-21B" - barcode6 = "TCGA-A1-1234-21C" - barcodes = c(barcode1, barcode2, barcode3, barcode4, barcode5, barcode6) - - expr <- data.frame(rbind(c(1, 2, 3, 4, 5, 6), c(rep(1, 6)))) - colnames(expr) <- barcodes - expect_equal(my_friend$filterDuplicatesSeqDepthOther(expr,barcodes), c(6)) -})