diff --git a/ase/fdr.R b/ase/fdr.R index 440eff7..cd4bfbc 100644 --- a/ase/fdr.R +++ b/ase/fdr.R @@ -1,73 +1,73 @@ - - +sys = 'kalk' sys = 'local' + if(sys == 'kalk'){ + syntdir = '/proj/b2012046/edsgard/ase/sim/data/synt/ase' + ase.method.resdir = 'FILLMEIN' } if(sys == 'local'){ - simdir = '~/Dropbox/postdoc/projects/ase/data/sim/synt' - snpdir = '~/Documents/postdoc/ase/sim/snps' - exprdir = file.path(simdir, 'expr') - asedir = file.path(simdir, 'ase') - number2sample.map.file = file.path(simdir, 'customref2enstcounts.filemapping.tab') - snp2count.file = file.path(simdir, 'snps', 'snp2count.RData') - snp2count.uniq.file = file.path(simdir, 'snps', 'snp2count.uniq.RData') - ase.res.file = file.path(asedir, 'ase.res.RData') - annovardir = '~/Dropbox/postdoc/projects/ase/data/sim/synt/annovar' - #annovar.tab.file = file.path(annovardir, 'synt.snps.tab.variant_function') - #annovar.file = file.path(annovardir, 'vars.refgene.annot.RData') - annovar.file = file.path(annovardir, 'synt.snps.geneannot.RData') - vars.annot.list.file = file.path(asedir, 'ase.res.annot.RData') - genes.pass.file = file.path(asedir, 'multisnp.genes.pass.RData') - nonuniq.genes.file = file.path(asedir, 'nonuniq.genes.RData') - induced.ase.res.file = file.path(asedir, 'induced.ase.res.RData') - induced.ase.sig.file = file.path(asedir, 'induced.ase.sig.RData') + syntdir = '~/Dropbox/postdoc/projects/ase/data/sim/synt/ase' + ase.method.resdir = '~/Documents/postdoc/ase/asebenchmark/novercontrol/data/gsnap/ase' } +#Variant-based analysis +synt.sig.ase.res.file = file.path(syntdir, 'ase.noninduced.vars.RData') +sig.ase.res.file = file.path(ase.method.resdir, 'sig.ase.res.RData') + +#Gene-based analysis +synt.genes.pass.file = file.path(syntdir, 'multisnp.genes.pass.RData') +method.genes.pass.file = file.path(ase.method.resdir, 'multisnp.genes.pass.RData') + +#Cond-dep ASE +induced.ase.sig.file = file.path(syntdir, 'induced.ase.sig.RData') + + main <- function(){ + ###### #1. Condition-INDEPENDENT ASE ###### - ### #1.1 Variants ### - #Without test for sig diff number of treat vs untreat - ase.noninduced.vars.file = file.path(asedir, 'ase.noninduced.vars.RData') - load(ase.noninduced.vars.file) #alt.sig.vars + + #Load "TRUTH" + load(synt.sig.ase.res.file) #sig.vars, alt.sig.vars + true.sig.vars = unique(unlist(lapply(sig.vars, '[[', 'chrpos'))) true.alt.sig.vars = alt.sig.vars - resdir = '~/Dropbox/postdoc/projects/ase/res/sim' - ase.cond.diff.file = file.path(resdir, 'ase.cond.diff.tab') - load(ase.cond.diff.file) #alt.sig.vars + + #Load significant variants after application of ASE method + load(sig.ase.res.file) #sig.vars, alt.sig.vars + sig.vars = unique(unlist(lapply(sig.vars, rownames))) + sig.vars = gsub('^chr', '', sig.vars) + alt.sig.vars = gsub('^chr', '', alt.sig.vars) - #get fdr + #get fdr, alt allele direction filtered: NO + fdr = get.fdr(sig.vars, true.sig.vars) + print(fdr) #7.4% + + #get fdr, alt allele direction filtered: YES fdr = get.fdr(alt.sig.vars, true.alt.sig.vars) - print(fdr) #29%, 6049, 21092 - - #With test for sig diff treat vs untreat - ase.noninduced.vars.file = file.path(asedir, 'ase.noninduced.vars.RData') - load(ase.noninduced.vars.file) - true.sig.alt.ase = sig.alt.ase - resdir = '~/Dropbox/postdoc/projects/ase/res/sim' - ase.cond.diff.file = file.path(resdir, 'ase.cond.diff.tab') - load(ase.cond.diff.file) #sig.alt.ase - #get fdr - fdr = get.fdr(rownames(sig.alt.ase), rownames(true.sig.alt.ase)) - print(fdr) #96%, 25, 26 + print(fdr) #6.3% + #get sensitivity + sens = get.sense(alt.sig.vars, true.alt.sig.vars) + print(sens) ### #1.2 Genes ### - #load data - load(genes.pass.file) + + #Load "TRUTH" + load(synt.genes.pass.file) #genes.pass, gene2nsamples true.genes.pass = genes.pass true.gene2nsamples = gene2nsamples - run.ase.datadir = '~/Dropbox/postdoc/projects/ase/data/sim/ase' - genes.pass.file = file.path(run.ase.datadir, 'multisnp.genes.pass.RData') - load(genes.pass.file) #genes.pass + #Load significant variants after application of ASE method + load(method.genes.pass.file) #genes.pass, gene2nsamples + #get fdr, n.samples = 2 fdr = get.fdr(genes.pass, true.genes.pass) print(fdr) #39%, 146, 373 @@ -87,8 +87,7 @@ main <- function(){ true.genes.pass = genes.pass true.sig.vars.nsamples1 = sig.vars.nsamples1 true.genes.pass.nsamples1 = genes.pass.nsamples1 - ase.datadir = '~/Dropbox/postdoc/projects/ase/data/sim/ase' - induced.ase.sig.file = file.path(ase.datadir, 'induced.ase.sig.RData') + induced.ase.sig.file = file.path(ase.method.resdir, 'induced.ase.sig.RData') load(induced.ase.sig.file) #sig.vars, genes.pass ## @@ -181,6 +180,17 @@ get.fdr <- function(feat.pass, true.feat.pass){ return(fdr) } +get.sens <- function(feat.pass, true.feat.pass){ + tp = intersect(feat.pass, true.feat.pass) + fn = setdiff(true.feat.pass, feat.pass) + n.tp = length(tp) + n.fn = length(fn) + sens = n.tp / (n.tp + n.fn) + + sens = c(sens, n.tp, n.fn) + return(sens) +} + filter.altallele <- function(vars, frac.thr = 0.5, gene.col = 'gene annot', frac.col = 'frac', pval.col = 'mod.padj', min.alt = 1, alpha = 0.05){ sig.alt.vars = vars[which(vars[, pval.col] < alpha & vars[, frac.col] > frac.thr), ] diff --git a/ase/snv.ase.R b/ase/snv.ase.R index 5132ffb..1eedcad 100644 --- a/ase/snv.ase.R +++ b/ase/snv.ase.R @@ -4,12 +4,14 @@ sys = 'local' if(sys == 'kalkyl'){ } -if(sys == 'local'){ - varcalls.datadir = '~/Dropbox/postdoc/projects/ase/data/sim/varcalls' - basecount.datadir='~/Dropbox/postdoc/projects/ase/data/sim/basecount' - ase.datadir = '~/Dropbox/postdoc/projects/ase/data/sim/ase' - resdir = '~/Dropbox/postdoc/projects/ase/res/sim' - annovar.datadir = '~/Dropbox/postdoc/projects/ase/data/sim/annovar' +if(sys == 'local'){ + basedir = '~/Documents/postdoc/ase/asebenchmark/novercontrol/data/gsnap' + varcalls.datadir = file.path(basedir, 'varcalls') + basecount.datadir = file.path(basedir, 'basecount') + annovar.datadir = file.path(basedir, 'annovar') + ase.datadir = file.path(basedir, 'ase') + resdir = '~/Dropbox/postdoc/projects/ase_msc_benchmarking/res/gsnap/ase' + dbsnpdir = '~/Documents/postdoc/ase/asebenchmark/novercontrol/data/annot/dbsnp' } #mpileup calls @@ -21,14 +23,16 @@ vars.regex = '.*hetvars$' #basecounts bc.regex = '.*basecount$' -#annovar +#annot annovar.file = file.path(annovar.datadir, 'annovar_all_variants.RData') +dbsnp.file = file.path(dbsnpdir, 'dbsnp.v135.common.pos.txt') + #Res bc.file = file.path(basecount.datadir, 'bc.RData') +bc.dbsnp.file = file.path(basecount.datadir, 'bc.dbsnp.RData') ase.res.file = file.path(ase.datadir, 'ase.res.RData') -ase.sample.cond.diff.file = file.path(ase.datadir, 'induced.ase.res.RData') -induced.ase.sig.file = file.path(ase.datadir, 'induced.ase.sig.RData') +sig.ase.res.file = file.path(ase.datadir, 'sig.ase.res.RData') main <- function(){ @@ -53,7 +57,7 @@ main <- function(){ #Dump save(basecount.list, file = bc.file) - + ### #Filter @@ -66,7 +70,20 @@ main <- function(){ bc.filt = lapply(bc.filt, function(bc){bc[which(bc[, 'site.dp'] >= min.dp), ]}) #total number of vars - length(unique(unlist(lapply(bc.filt, '[[', 'site')))) #112756 + length(unique(unlist(lapply(bc.filt, '[[', 'site')))) #97628 + + #Filter on dbSNP presence + dbsnp.vars = read.table(dbsnp.file, stringsAsFactors = FALSE) + colnames(dbsnp.vars) = 'site' + bc.filt = lapply(bc.filt, function(jsample.vars, dbsnp.vars){merge(jsample.vars, dbsnp.vars, by = 'site')}, dbsnp.vars = dbsnp.vars) + + #total number of vars + length(unique(unlist(lapply(bc.filt, '[[', 'site')))) #77969 + + #Dump + save(bc.filt, file = bc.dbsnp.file) + + ### #DP dist @@ -104,7 +121,7 @@ main <- function(){ #Dump save(ase.res, file = ase.res.file) - + ### #Sig hits stats @@ -115,7 +132,14 @@ main <- function(){ #Get sig hits sig.vars = lapply(ase.res, function(sample.vars, alpha, pval.col){sample.vars = sample.vars[which(sample.vars[, pval.col] <= alpha), ]; return(sample.vars);}, alpha = alpha, pval.col = pval.col) - sig.vars.mat = get.sig.mat(ase.res, alpha, pval.col = pval.col) + + #Filter on alternative allele direction + alt.vars = lapply(sig.vars, alt.filter) + alt.sig.vars = unique(unlist(alt.vars)) + length(alt.sig.vars) #17832 + + #Dump + save(sig.vars, alt.sig.vars, file = sig.ase.res.file) ### @@ -129,14 +153,7 @@ main <- function(){ #number of uniq sig.vars length(unique(unlist(lapply(sig.vars, '[[', 'site')))) #45952 - - - - #((see varstats.R for inspiration, or its enough with the stuff in multiplesnsps2gene.R)) - #gene annot - #DE - ### #Gene based analysis. diff --git a/bam/bedtoolshist2pdf.R b/bam/bedtoolshist2pdf.R index 9ec9e85..3a6375a 100644 --- a/bam/bedtoolshist2pdf.R +++ b/bam/bedtoolshist2pdf.R @@ -42,8 +42,8 @@ main <- function(histfile2id, tab.outfile, hist.pdf.file){ hist.tabs = read.files(histfile2id) dp2frac = get.dp2frac(hist.tabs) dp2bases = get.dp2bases(hist.tabs) - plot.dp2frac(dp2frac, ltypes = histfile2id[names(dp2frac), 'ltypes'], colors = histfile2id[names(dp2frac), 'colors']) - plot.dp2bases(dp2bases) + plot.dp2y(dp2frac, ltypes = histfile2id[names(dp2frac), 'ltypes'], colors = histfile2id[names(dp2frac), 'colors']) + plot.dp2y(dp2bases) #Plot @@ -66,11 +66,11 @@ main <- function(histfile2id, tab.outfile, hist.pdf.file){ par(mfrow = c(1, 2)) unstim.ind = grep('unstim', names(dp2frac)) dp2frac.unstim = dp2frac[unstim.ind] - plot.dp2frac(dp2frac.unstim, ltypes = histfile2id[names(dp2frac.unstim), 'ltypes'], colors = histfile2id[names(dp2frac.unstim), 'colors']) + plot.dp2y(dp2frac.unstim, ltypes = histfile2id[names(dp2frac.unstim), 'ltypes'], colors = histfile2id[names(dp2frac.unstim), 'colors']) lps.ind = grep('LPS', names(dp2frac)) dp2frac.lps = dp2frac[lps.ind] - plot.dp2frac(dp2frac.lps, ltypes = histfile2id[names(dp2frac.lps), 'ltypes'], colors = histfile2id[names(dp2frac.lps), 'colors']) + plot.dp2y(dp2frac.lps, ltypes = histfile2id[names(dp2frac.lps), 'ltypes'], colors = histfile2id[names(dp2frac.lps), 'colors']) dev.off() } @@ -145,7 +145,7 @@ plot.dp2y <- function(dp2frac, max.plot.dp = 100, ylab = 'fraction covered', lty n.hist = length(dp2frac) ylim = range(unlist(dp2frac)) - ylim = c(0, 5e8) + #ylim = c(0, 5e8) jhist = 1 plot(dp2frac[[jhist]][min.plot.dp:max.plot.dp], xlab = 'depth', ylab = ylab, col=colors[jhist], type = 'l', ylim = ylim, lty = ltypes[jhist]) for(jhist in 2:n.hist){