QuackenbushLab · katehoffshutta · May 5, 2025 · May 5, 2025 · May 5, 2025
diff --git a/R/NetworkDataCompanion.R b/R/NetworkDataCompanion.R
@@ -115,9 +115,6 @@ NetworkDataCompanion=setRefClass("NetworkDataCompanion",
                          logCPM=log(cpm_vals+1)))
            },
 
-           #### more methods go here
-
-           # maybe have this presaved in class
            extractSampleOnly = function(TCGA_barcodes){
              return(sapply(TCGA_barcodes, substr, 1, 12))
            },
@@ -240,7 +237,6 @@ NetworkDataCompanion=setRefClass("NetworkDataCompanion",
              colnames(mymap) = c("probeID","geneNames","ensemblID","distToTSS")
 
              mymap = as.data.frame(mymap)
-
              # iterate through map with for loop
              # please feel free to vectorize this etc
              for(i in 1:nrow(smallManifest))
@@ -271,7 +267,9 @@ NetworkDataCompanion=setRefClass("NetworkDataCompanion",
 
 
              if(!mapToNearest)
+             {
               return(mymap)
+             }
              if(mapToNearest)
              {
                processRow = function(x) # x is one row of mymap
@@ -479,43 +477,63 @@ NetworkDataCompanion=setRefClass("NetworkDataCompanion",
             return(which(!duplicate_throwout))
            },
 
-           ## Filter out all duplicates based on sequencing depth, take random one if no info on seq depth for all vials
-           ## Returns indices in given tcga barcodes to KEEP
-	         filterDuplicatesSeqDepthOther = function(expression_count_matrix, tcga_barcodes){
-             sample_vials_ge <- extractSampleAndTypeAndVial(colnames(expression_count_matrix))
-             seq_depth <- colSums(expression_count_matrix)
-             duplicate_throwout <- rep(NA, length(tcga_barcodes))
-             for (idx in 1:length(tcga_barcodes))
+           ## filter duplicates based on tumor purity
+           ## returns a list of TCGA barcodes to keep
+           ## there may still be some duplicates, where purity info is not available
+           filterDuplicatesPurity = function(TCGA_barcodes,method="ESTIMATE")
+           {
+             if (!(method %in% c("ESTIMATE",
+                                 "ABSOLUTE", "LUMP", "IHC", "CPE")))
              {
-               if (is.na(duplicate_throwout[idx]))
-               {
-                 ## find all vials and replicates of current barcode
-                 rep_idcs <- which(extractSampleOnly(tcga_barcodes[idx]) == extractSampleOnly(tcga_barcodes))
-                 rep_vials <- extractSampleAndTypeAndVial(tcga_barcodes[rep_idcs])
-                 ## match with vials in expression matrix
-                 mIdx <- match(rep_vials, sample_vials_ge)
-                 ## get matched vial with highest seqdepth, just take first one if no match at all
-                 max_sample_idx <- 1
-                 curr_max <- 0
-                 for (j in 1:length(mIdx))
-                 {
-                   if (!is.na(mIdx[j]))
-                   {
-                     if (seq_depth[mIdx[j]] > curr_max)
-                     {
-                       curr_max <- seq_depth[mIdx[j]]
-                       max_sample_idx <- rep_idcs[j]
-                     }
-                   }
-                 }
-                 ## throw out all but maximum one
-                 duplicate_throwout[rep_idcs] <- T
-                 duplicate_throwout[max_sample_idx] <- F
-               }
+               stop("Error: Expected method name should be ESTIMATE, ABSOLUTE, LUMP, IHC, CPE")
              }
-             return(which(!duplicate_throwout))
+
+             out_df = TCGA_purities %>% dplyr::select(all_of(c("TCGA_barcode",method))) %>%
+               na.omit() %>%
+               inner_join(data.frame("TCGA_barcode"=TCGA_barcodes),by="TCGA_barcode") %>%
+               mutate("TCGA_sample_and_type" = extractSampleAndType(TCGA_barcode)) %>%
+               rename("purity"=method) %>%
+               group_by(TCGA_sample_and_type) %>%
+               summarize("TCGA_barcode_max_purity"=TCGA_barcode[which.max(purity)]) %>%
+               pull(TCGA_barcode_max_purity) %>%
+               return()
            },
-
+
+           filterDuplicatesRandom = function(TCGA_barcodes,seed = 1989){
+             set.seed(seed)
+             out_df = data.frame("TCGA_barcode"=TCGA_barcodes) %>%
+               mutate(TCGA_sample_and_type = extractSampleAndType(TCGA_barcodes))
+
+             permuted_df = out_df[sample(1:nrow(out_df)),]
+             permuted_df %>% 
+               dplyr::filter(!duplicated(TCGA_sample_and_type)) %>%
+               pull(TCGA_barcode) %>%
+               return()
+           },
+
+           # filter methylation duplicates based on less 
+           # overall missingness in the measured beta values
+           # methylation_betas should be a data frame with 
+           # probes in rows and UUIDs in columns, with the 
+           # exception of the first column which is probeID
+           filterDuplicatesMethylationMissingness = function(methylation_betas)
+           {
+             missing_df = data.frame("uuid"=names(methylation_betas[,-1]),
+                                     "prop_miss"=apply(methylation_betas[,-1],2,
+                                                       function(x){sum(is.na(x))/length(x)}))
+             tcga_barcodes = ndc$mapUUIDtoTCGA(missing_df$uuid)
+             keep_barcodes = missing_df %>% inner_join(tcga_barcodes, by=c("uuid"="file_id")) %>%
+               dplyr::rename("TCGA_barcode"=submitter_id) %>% 
+               mutate(TCGA_sample_and_type =  ndc$extractSampleAndType(TCGA_barcode)) %>%
+               group_by(TCGA_sample_and_type) %>%
+               summarize("TCGA_barcode_min_prop_miss"=TCGA_barcode[which.min(prop_miss)]) %>%
+               pull(TCGA_barcode_min_prop_miss) 
+
+             keep_uuids = tcga_barcodes %>% dplyr::filter(submitter_id %in% keep_barcodes) %>%
+               pull(file_id)
+             return(keep_uuids)
+           },
+
            ## Filter samples indicated by *TCGA_barcodes* based on the method *method* and threshold *threshold*
            ## Returns a list of indices indicating which samples should be kept
 	         filterPurity = function(TCGA_barcodes, method="ESTIMATE", threshold=.6){
@@ -813,11 +831,8 @@ NetworkDataCompanion=setRefClass("NetworkDataCompanion",
 #' @export CreateNetworkDataCompanionObject
 CreateNetworkDataCompanionObject <- function(clinical_patient_file=NULL, project_name="default_project"){
 
-  ## Load purities for purity package
+  ## Load purities from purity package
   obj <- CreateTCGAPurityFilteringObject()
-
-  #this is an easy hack for not breaking, but something smarter would be great
-  #TODO skip purityfiltering completely and do it here instead
   with_purity = c("ACC","BLCA","BRCA","CESC","COAD","GBM",
                   "HNSC","KIRC","KIRP","KICH","LGG","LIHC",
                   "LUAD","LUSC","OV","PRAD","READ","SKCM",
@@ -826,6 +841,7 @@ CreateNetworkDataCompanionObject <- function(clinical_patient_file=NULL, project
 
   if(project_name %in%  with_purity){
     purities <- obj$get_tissue_purities(cancer_type = project_name)
+    purities$TCGA_barcode = row.names(purities)
   }
 
   ## Load patient's clinical data

diff --git a/tests/testthat/test_filterDuplicatesRandom.R b/tests/testthat/test_filterDuplicatesRandom.R
@@ -0,0 +1,19 @@
+context("[NetworkDataCompanion] Testing filterDuplicatesRandom function ... ")
+
+test_that("Testing filterDuplicatesRandom",{
+
+  my_friend = NetworkDataCompanion::CreateNetworkDataCompanionObject()
+  barcode1 = "TCGA-A1-1234-01A"
+  barcode2 = "TCGA-A1-1234-11A"
+  barcode3 = "TCGA-A1-1234-21A"
+  barcode4 = "TCGA-A1-1234-01B"
+  barcode5 = "TCGA-A1-1234-21B"
+  barcode6 = "TCGA-A1-1234-21C"
+  barcodes = c(barcode1, barcode2, barcode3, barcode4, barcode5, barcode6)
+  set.seed(1989)
+  permuted_barcodes = sample(my_friend$extractSampleAndType(barcodes))
+  keep_barcodes_manual = names(permuted_barcodes[!duplicated(permuted_barcodes)])
+  keep_barcodes_function = my_friend$filterDuplicatesRandom(barcodes,seed=1989)
+  expect_equal(keep_barcodes_manual, keep_barcodes_function)
+
+})
diff --git a/tests/testthat/test_filterDuplicatesSeqDepthOther.R b/tests/testthat/test_filterDuplicatesSeqDepthOther.R