Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 58 additions & 42 deletions R/NetworkDataCompanion.R
Original file line number Diff line number Diff line change
Expand Up @@ -115,9 +115,6 @@ NetworkDataCompanion=setRefClass("NetworkDataCompanion",
logCPM=log(cpm_vals+1)))
},

#### more methods go here

# maybe have this presaved in class
extractSampleOnly = function(TCGA_barcodes){
return(sapply(TCGA_barcodes, substr, 1, 12))
},
Expand Down Expand Up @@ -240,7 +237,6 @@ NetworkDataCompanion=setRefClass("NetworkDataCompanion",
colnames(mymap) = c("probeID","geneNames","ensemblID","distToTSS")

mymap = as.data.frame(mymap)

# iterate through map with for loop
# please feel free to vectorize this etc
for(i in 1:nrow(smallManifest))
Expand Down Expand Up @@ -271,7 +267,9 @@ NetworkDataCompanion=setRefClass("NetworkDataCompanion",


if(!mapToNearest)
{
return(mymap)
}
if(mapToNearest)
{
processRow = function(x) # x is one row of mymap
Expand Down Expand Up @@ -479,43 +477,63 @@ NetworkDataCompanion=setRefClass("NetworkDataCompanion",
return(which(!duplicate_throwout))
},

## Filter out all duplicates based on sequencing depth, take random one if no info on seq depth for all vials
## Returns indices in given tcga barcodes to KEEP
filterDuplicatesSeqDepthOther = function(expression_count_matrix, tcga_barcodes){
sample_vials_ge <- extractSampleAndTypeAndVial(colnames(expression_count_matrix))
seq_depth <- colSums(expression_count_matrix)
duplicate_throwout <- rep(NA, length(tcga_barcodes))
for (idx in 1:length(tcga_barcodes))
## filter duplicates based on tumor purity
## returns a list of TCGA barcodes to keep
## there may still be some duplicates, where purity info is not available
filterDuplicatesPurity = function(TCGA_barcodes,method="ESTIMATE")
{
if (!(method %in% c("ESTIMATE",
"ABSOLUTE", "LUMP", "IHC", "CPE")))
{
if (is.na(duplicate_throwout[idx]))
{
## find all vials and replicates of current barcode
rep_idcs <- which(extractSampleOnly(tcga_barcodes[idx]) == extractSampleOnly(tcga_barcodes))
rep_vials <- extractSampleAndTypeAndVial(tcga_barcodes[rep_idcs])
## match with vials in expression matrix
mIdx <- match(rep_vials, sample_vials_ge)
## get matched vial with highest seqdepth, just take first one if no match at all
max_sample_idx <- 1
curr_max <- 0
for (j in 1:length(mIdx))
{
if (!is.na(mIdx[j]))
{
if (seq_depth[mIdx[j]] > curr_max)
{
curr_max <- seq_depth[mIdx[j]]
max_sample_idx <- rep_idcs[j]
}
}
}
## throw out all but maximum one
duplicate_throwout[rep_idcs] <- T
duplicate_throwout[max_sample_idx] <- F
}
stop("Error: Expected method name should be ESTIMATE, ABSOLUTE, LUMP, IHC, CPE")
}
return(which(!duplicate_throwout))

out_df = TCGA_purities %>% dplyr::select(all_of(c("TCGA_barcode",method))) %>%
na.omit() %>%
inner_join(data.frame("TCGA_barcode"=TCGA_barcodes),by="TCGA_barcode") %>%
mutate("TCGA_sample_and_type" = extractSampleAndType(TCGA_barcode)) %>%
rename("purity"=method) %>%
group_by(TCGA_sample_and_type) %>%
summarize("TCGA_barcode_max_purity"=TCGA_barcode[which.max(purity)]) %>%
pull(TCGA_barcode_max_purity) %>%
return()
},


filterDuplicatesRandom = function(TCGA_barcodes,seed = 1989){
set.seed(seed)
out_df = data.frame("TCGA_barcode"=TCGA_barcodes) %>%
mutate(TCGA_sample_and_type = extractSampleAndType(TCGA_barcodes))

permuted_df = out_df[sample(1:nrow(out_df)),]
permuted_df %>%
dplyr::filter(!duplicated(TCGA_sample_and_type)) %>%
pull(TCGA_barcode) %>%
return()
},

# filter methylation duplicates based on less
# overall missingness in the measured beta values
# methylation_betas should be a data frame with
# probes in rows and UUIDs in columns, with the
# exception of the first column which is probeID
filterDuplicatesMethylationMissingness = function(methylation_betas)
{
missing_df = data.frame("uuid"=names(methylation_betas[,-1]),
"prop_miss"=apply(methylation_betas[,-1],2,
function(x){sum(is.na(x))/length(x)}))
tcga_barcodes = ndc$mapUUIDtoTCGA(missing_df$uuid)
keep_barcodes = missing_df %>% inner_join(tcga_barcodes, by=c("uuid"="file_id")) %>%
dplyr::rename("TCGA_barcode"=submitter_id) %>%
mutate(TCGA_sample_and_type = ndc$extractSampleAndType(TCGA_barcode)) %>%
group_by(TCGA_sample_and_type) %>%
summarize("TCGA_barcode_min_prop_miss"=TCGA_barcode[which.min(prop_miss)]) %>%
pull(TCGA_barcode_min_prop_miss)

keep_uuids = tcga_barcodes %>% dplyr::filter(submitter_id %in% keep_barcodes) %>%
pull(file_id)
return(keep_uuids)
},

## Filter samples indicated by *TCGA_barcodes* based on the method *method* and threshold *threshold*
## Returns a list of indices indicating which samples should be kept
filterPurity = function(TCGA_barcodes, method="ESTIMATE", threshold=.6){
Expand Down Expand Up @@ -813,11 +831,8 @@ NetworkDataCompanion=setRefClass("NetworkDataCompanion",
#' @export CreateNetworkDataCompanionObject
CreateNetworkDataCompanionObject <- function(clinical_patient_file=NULL, project_name="default_project"){

## Load purities for purity package
## Load purities from purity package
obj <- CreateTCGAPurityFilteringObject()

#this is an easy hack for not breaking, but something smarter would be great
#TODO skip purityfiltering completely and do it here instead
with_purity = c("ACC","BLCA","BRCA","CESC","COAD","GBM",
"HNSC","KIRC","KIRP","KICH","LGG","LIHC",
"LUAD","LUSC","OV","PRAD","READ","SKCM",
Expand All @@ -826,6 +841,7 @@ CreateNetworkDataCompanionObject <- function(clinical_patient_file=NULL, project

if(project_name %in% with_purity){
purities <- obj$get_tissue_purities(cancer_type = project_name)
purities$TCGA_barcode = row.names(purities)
}

## Load patient's clinical data
Expand Down
19 changes: 19 additions & 0 deletions tests/testthat/test_filterDuplicatesRandom.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
context("[NetworkDataCompanion] Testing filterDuplicatesRandom function ... ")

test_that("Testing filterDuplicatesRandom",{

my_friend = NetworkDataCompanion::CreateNetworkDataCompanionObject()
barcode1 = "TCGA-A1-1234-01A"
barcode2 = "TCGA-A1-1234-11A"
barcode3 = "TCGA-A1-1234-21A"
barcode4 = "TCGA-A1-1234-01B"
barcode5 = "TCGA-A1-1234-21B"
barcode6 = "TCGA-A1-1234-21C"
barcodes = c(barcode1, barcode2, barcode3, barcode4, barcode5, barcode6)
set.seed(1989)
permuted_barcodes = sample(my_friend$extractSampleAndType(barcodes))
keep_barcodes_manual = names(permuted_barcodes[!duplicated(permuted_barcodes)])
keep_barcodes_function = my_friend$filterDuplicatesRandom(barcodes,seed=1989)
expect_equal(keep_barcodes_manual, keep_barcodes_function)

})
17 changes: 0 additions & 17 deletions tests/testthat/test_filterDuplicatesSeqDepthOther.R

This file was deleted.

Loading