From 0d36cc37ba716197a3228459f562ef41ee8cfbaa Mon Sep 17 00:00:00 2001 From: shengq1 Date: Tue, 7 Feb 2017 14:26:14 -0600 Subject: [PATCH 1/6] change absolute path to option of main function --- enhancerPromoter.py | 68 ++++++++++++++++----------------------------- 1 file changed, 24 insertions(+), 44 deletions(-) diff --git a/enhancerPromoter.py b/enhancerPromoter.py index de18fa0..59f7efc 100644 --- a/enhancerPromoter.py +++ b/enhancerPromoter.py @@ -45,23 +45,21 @@ print "Using python version %s" % sys.version +import os +whereAmI = os.path.dirname(os.path.realpath(__file__)) #importing utils package -sys.path.append('/home/chazlin/src/pipeline/') +sys.path.append(whereAmI) import argparse import cPickle import utils -import pipeline_dfci -import subprocess -import os import string import tempfile import zlib import numpy import re import time -from distutils.spawn import find_executable from collections import defaultdict @@ -70,21 +68,9 @@ #================================================================================ #add locations of files and global parameters in this section -whereAmI = os.path.dirname(os.path.realpath(__file__)) bamliquidator_path = 'bamliquidator_batch.py' -#using a paramater dictionary in liue of a yaml or json for now - - - -paramDict = {'cpgPath': '/grail/projects/mycn_cyl/beds/cpg_islands.bed', - - - } - - - #================================================================================ #===================================CLASSES====================================== #================================================================================ @@ -110,16 +96,8 @@ def loadAnnotFile(genome,window,geneList=[],skip_cache=False): 'RN6': 'annotation/rn6_refseq.ucsc', } - genomeDirectoryDict = { - 'HG19':'/grail/genomes/Homo_sapiens/UCSC/hg19/Sequence/Chromosomes/', - 'RN6':'/grail/genomes/Rattus_norvegicus/UCSC/rn6/Sequence/Chromosomes/', - } - - genomeDirectory = genomeDirectoryDict[string.upper(genome)] - annotFile = whereAmI + '/' + genomeDict[string.upper(genome)] - if not skip_cache: # Try loading from a cache, if the crc32 matches annotPathHash = zlib.crc32(annotFile) & 0xFFFFFFFF # hash the entire location of this script @@ -158,7 +136,7 @@ def loadAnnotFile(genome,window,geneList=[],skip_cache=False): with open(cache_file_path, 'wb') as cache_fh: cPickle.dump((startDict, tssCollection), cache_fh, cPickle.HIGHEST_PROTOCOL) - return startDict, tssCollection, genomeDirectory + return startDict, tssCollection @@ -168,7 +146,6 @@ def splitRegions(inputGFF,tssCollection): #if even a single coordinate is shared with the +/-1kb splitGFF = [] - debugCount = 0 for line in inputGFF: chrom = line[0] @@ -341,7 +318,7 @@ def makeAverageTable(outputFolder,analysisName,useBackground = False): -def makePeakTable(paramDict,splitGFFPath,averageTablePath,startDict,geneList,genomeDirectory,tads_path=''): +def makePeakTable(bedFile,splitGFFPath,averageTablePath,startDict,geneList,genomeDirectory,tads_path=''): ''' makes the final peak table with ebox info @@ -357,7 +334,7 @@ def makePeakTable(paramDict,splitGFFPath,averageTablePath,startDict,geneList,gen signalTable = utils.parseTable(averageTablePath,'\t') print('LOADING CPGS ISLANDS') - cpgBed = utils.parseTable(paramDict['cpgPath'],'\t') + cpgBed = utils.parseTable(bedFile,'\t') cpgLoci = [] for line in cpgBed: cpgLoci.append(utils.Locus(line[0],line[1],line[2],'.',line[-1])) @@ -617,13 +594,13 @@ def callRWaterfall(geneTablePath,outputFolder,analysisName,top): sys.exit() -def callGSEA(outputFolder,analysisName,top): +def callGSEA(gseaPath, gmxPath, outputFolder,analysisName,top): ''' runs C2 GSEA ''' - gseaPath = '/usr/local/bin/gsea/gsea2-2.2.2.jar' - gmxPath = '/grail/annotations/gsea/c2.all.v5.1.symbols.gmt' #C2 set + #gseaPath = '/usr/local/bin/gsea/gsea2-2.2.2.jar' + #gmxPath = '/grail/annotations/gsea/c2.all.v5.1.symbols.gmt' #C2 set gseaBashFilePath = '%s%s_GSEA_cmd.sh' % (outputFolder,analysisName) @@ -749,8 +726,6 @@ def main(): help="Enter .gff or .bed file of regions to analyze", required=True) parser.add_argument("-g", "--genome", dest="genome", type=str, help="specify a genome, HG18,HG19,MM8,MM9,MM10,RN6 are currently supported", required=True) - - # output flag parser.add_argument("-o", "--output", dest="output", type=str, help="Enter the output folder.", required=True) @@ -776,19 +751,24 @@ def main(): parser.add_argument("--tads", dest="tads", type=str, help="Include a .bed of tad regions to restrict enhancer/gene association", required=False,default=None) - + #add by Quanhu Sheng + parser.add_argument("--genomeDirectory", dest="genomeDirectory", type=str, + help="Enter the folder contains chromosome sequence in fasta format", required=True) + #gseaPath = '/usr/local/bin/gsea/gsea2-2.2.2.jar' + #gmxPath = '/grail/annotations/gsea/c2.all.v5.1.symbols.gmt' #C2 set + parser.add_argument("--gseaPath", dest="gseaPath", type=str, help="Enter GSEA jar file location", required=True) + parser.add_argument("--gmxPath", dest="gmxPath", type=str, help="Enter GSEA gmt file location, such as c2.all.v5.1.symbols.gmt", required=True) + parser.add_argument("--cpgPath", dest="cpgPath", type=str, help="Enter cpg coordinates in bed format", required=True) args = parser.parse_args() print(args) #minimum arguments needed to proceed - if args.bam and args.input and args.genome and args.output: - + if args.bam and args.input and args.genome and args.genomeDirectory and args.output and args.gseaPath and args.gmxPath and args.cpgPath: #top analysis subset top = args.top - #input genome genome = args.genome print('PERFORMING ANALYSIS ON %s GENOME BUILD' % (genome)) @@ -840,16 +820,16 @@ def main(): #check if tads are being invoked if args.tads: print('LOADING TAD LOCATIONS FROM %s' % (args.tads)) - use_tads = True tads_path = args.tads else: - use_tads = False tads_path = '' print('LOADING ANNOTATION DATA FOR GENOME %s' % (genome)) + genomeDirectory=args.genomeDirectory + #important here to define the window - startDict,tssCollection,genomeDirectory = loadAnnotFile(genome,window,geneList,True) + startDict,tssCollection = loadAnnotFile(genome,window,geneList,True) print(len(startDict)) #now we need to split the input region @@ -892,7 +872,7 @@ def main(): print('PEAK TABLE OUTPUT ALREADY EXISTS') peakTable = utils.parseTable(peakTablePath,'\t') else: - peakTable = makePeakTable(paramDict,splitGFFPath,averageTablePath,startDict,geneList,genomeDirectory,tads_path) + peakTable = makePeakTable(args.cpgPath,splitGFFPath,averageTablePath,startDict,geneList,genomeDirectory,tads_path) utils.unParseTable(peakTable,peakTablePath,'\t') geneTable = makeGeneTable(peakTable,analysisName) @@ -906,7 +886,7 @@ def main(): #now let's call gsea print('RUNNING GSEA ON C2') - callGSEA(outputFolder,analysisName,top) + callGSEA(args.gseaPath, args.gmxPath, outputFolder,analysisName,top) print('DETECTING GSEA OUTPUT FOR TOP %s GENES' % (top)) @@ -919,7 +899,7 @@ def main(): print('DETECTING GSEA OUTPUT FOR ALL GENES') #for top - all_promoterTablePath,all_distalTablePath = detectGSEAOutput(analysisName,outputFolder,'all') + detectGSEAOutput(analysisName,outputFolder,'all') print('MAKING NES PLOTS FOR ALL GENES') callR_GSEA(top_promoterTablePath,top_distalTablePath,outputFolder,analysisName,'all') From e63f9fd4496d4dbf872c9fb8f1c67a63facd297f Mon Sep 17 00:00:00 2001 From: shengq1 Date: Fri, 10 Feb 2017 11:41:08 -0600 Subject: [PATCH 2/6] for refgeneid in activity file which not in the annotation database, print warning message and ignore it. --- enhancerPromoter.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/enhancerPromoter.py b/enhancerPromoter.py index 59f7efc..15b3fe9 100644 --- a/enhancerPromoter.py +++ b/enhancerPromoter.py @@ -127,7 +127,10 @@ def loadAnnotFile(genome,window,geneList=[],skip_cache=False): startDict = utils.makeStartDict(annotFile, geneList) tssLoci =[] for gene in geneList: - tssLoci.append(utils.makeTSSLocus(gene,startDict,window,window)) + if gene in startDict: + tssLoci.append(utils.makeTSSLocus(gene,startDict,window,window)) + else: + print('\tWARNING: gene %s not in annotation database. Ignoring.' % gene) tssCollection = utils.LocusCollection(tssLoci,50) From 5ccfc19d713ca34953127a40f787195c54381b2d Mon Sep 17 00:00:00 2001 From: shengq1 Date: Fri, 10 Feb 2017 11:43:44 -0600 Subject: [PATCH 3/6] ignore the refseqid not in annotation database --- enhancerPromoter.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/enhancerPromoter.py b/enhancerPromoter.py index 15b3fe9..f2405b8 100644 --- a/enhancerPromoter.py +++ b/enhancerPromoter.py @@ -350,8 +350,9 @@ def makePeakTable(bedFile,splitGFFPath,averageTablePath,startDict,geneList,genom tss_1kb_loci = [] tss_50kb_loci = [] for refID in geneList: - tss_1kb_loci.append(utils.makeTSSLocus(refID,startDict,1000,1000)) - tss_50kb_loci.append(utils.makeTSSLocus(refID,startDict,50000,50000)) + if refID in startDict: + tss_1kb_loci.append(utils.makeTSSLocus(refID,startDict,1000,1000)) + tss_50kb_loci.append(utils.makeTSSLocus(refID,startDict,50000,50000)) #make a 1kb flanking and 50kb flanking collection tss_1kb_collection = utils.LocusCollection(tss_1kb_loci,50) From 8f164220294dc399fbc44e8dbcffac8b1e326717 Mon Sep 17 00:00:00 2001 From: shengq1 Date: Mon, 13 Feb 2017 13:07:12 -0600 Subject: [PATCH 4/6] bugfix --- enhancerPromoter_waterfall.R | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/enhancerPromoter_waterfall.R b/enhancerPromoter_waterfall.R index 803c7bc..d36ee6c 100644 --- a/enhancerPromoter_waterfall.R +++ b/enhancerPromoter_waterfall.R @@ -44,8 +44,7 @@ plotContribution <- function(geneTable,analysisName,outputFolder,top=0,nBins=100 if(top == 0){ top = nrow(geneTable) topString = 'all' - } - else if(top > nrow(geneTable)){ + }else if(top > nrow(geneTable)){ top = nrow(geneTable) topString = 'all' }else{ @@ -127,11 +126,11 @@ runWaterfall <- function(geneTable,analysisName,outputFolder,top=0,geneList = c( if(top == 0){ top = nrow(geneTable) topString = 'all' - }else if(top > nrow(geneTable)){ - top = nrow(geneTable) - topString = 'all' - }else{ - topString = as.character(top) + }else { + topString = as.character(top) + if(top > nrow(geneTable)){ + top = nrow(geneTable) + } } #get the signal for tss, distal and total From c2f7e5c76516b88842c8454bd4ae28f0123c7609 Mon Sep 17 00:00:00 2001 From: shengq1 Date: Mon, 13 Feb 2017 15:25:41 -0600 Subject: [PATCH 5/6] compatible with top gene less than user assigned value (or default 5000). Remove previous gsea folder before running gsea analysis. --- enhancerPromoter.py | 56 ++++++++++++++++++++---------------- enhancerPromoter_waterfall.R | 11 +++---- 2 files changed, 38 insertions(+), 29 deletions(-) diff --git a/enhancerPromoter.py b/enhancerPromoter.py index f2405b8..a1d01f5 100644 --- a/enhancerPromoter.py +++ b/enhancerPromoter.py @@ -85,6 +85,8 @@ def loadAnnotFile(genome,window,geneList=[],skip_cache=False): """ load in the annotation and create a startDict and tss collection for a set of refseq IDs a given genome + 20170213, add by Quanhu Sheng + return validGenes """ genomeDict = { 'HG18': 'annotation/hg18_refseq.ucsc', @@ -126,9 +128,11 @@ def loadAnnotFile(genome,window,geneList=[],skip_cache=False): startDict = utils.makeStartDict(annotFile, geneList) tssLoci =[] + validGenes = [] for gene in geneList: if gene in startDict: tssLoci.append(utils.makeTSSLocus(gene,startDict,window,window)) + validGenes.append(gene) else: print('\tWARNING: gene %s not in annotation database. Ignoring.' % gene) @@ -139,7 +143,7 @@ def loadAnnotFile(genome,window,geneList=[],skip_cache=False): with open(cache_file_path, 'wb') as cache_fh: cPickle.dump((startDict, tssCollection), cache_fh, cPickle.HIGHEST_PROTOCOL) - return startDict, tssCollection + return startDict, tssCollection, validGenes @@ -621,21 +625,24 @@ def callGSEA(gseaPath, gmxPath, outputFolder,analysisName,top): gseaOutputFolder = utils.formatFolder('%sgsea_top_all_c2' % (outputFolder),True) rptLabel = '%s_top_all' % (analysisName) + gseaBashFile.write('rm -rf %s/%s.Gsea* \n' % (gseaOutputFolder, rptLabel)) gseaCmd_all = 'java -Xmx4000m -cp %s xtools.gsea.Gsea -res %s -cls %s#PROMOTER_versus_DISTAL -gmx %s -collapse false -mode Max_probe -norm meandiv -nperm 1000 -permute gene_set -rnd_type no_balance -scoring_scheme weighted -rpt_label %s -metric Diff_of_Classes -sort real -order descending -include_only_symbols true -make_sets true -median false -num 100 -plot_top_x 20 -rnd_seed timestamp -save_rnd_lists false -set_max 500 -set_min 15 -zip_report false -out %s -gui false' % (gseaPath,gctPath,clsPath,gmxPath,rptLabel,gseaOutputFolder) gseaBashFile.write(gseaCmd_all) gseaBashFile.write('\n') + + if top != 'all': + #for top N + gctPath = '%s%s_top_%s.gct' % (outputFolder,analysisName,top) + clsPath = '%s%s_top_%s.cls' % (outputFolder,analysisName,top) + gseaOutputFolder = utils.formatFolder('%sgsea_top_%s_c2' % (outputFolder,top),True) + rptLabel = '%s_top_%s' % (analysisName,top) - #for top N - gctPath = '%s%s_top_%s.gct' % (outputFolder,analysisName,top) - clsPath = '%s%s_top_%s.cls' % (outputFolder,analysisName,top) - gseaOutputFolder = utils.formatFolder('%sgsea_top_%s_c2' % (outputFolder,top),True) - rptLabel = '%s_top_%s' % (analysisName,top) - - gseaCmd_top = 'java -Xmx4000m -cp %s xtools.gsea.Gsea -res %s -cls %s#PROMOTER_versus_DISTAL -gmx %s -collapse false -mode Max_probe -norm meandiv -nperm 1000 -permute gene_set -rnd_type no_balance -scoring_scheme weighted -rpt_label %s -metric Diff_of_Classes -sort real -order descending -include_only_symbols true -make_sets true -median false -num 100 -plot_top_x 20 -rnd_seed timestamp -save_rnd_lists false -set_max 500 -set_min 15 -zip_report false -out %s -gui false' % (gseaPath,gctPath,clsPath,gmxPath,rptLabel,gseaOutputFolder) + gseaBashFile.write('rm -rf %s/%s.Gsea* \n' % (gseaOutputFolder, rptLabel)) + gseaCmd_top = 'java -Xmx4000m -cp %s xtools.gsea.Gsea -res %s -cls %s#PROMOTER_versus_DISTAL -gmx %s -collapse false -mode Max_probe -norm meandiv -nperm 1000 -permute gene_set -rnd_type no_balance -scoring_scheme weighted -rpt_label %s -metric Diff_of_Classes -sort real -order descending -include_only_symbols true -make_sets true -median false -num 100 -plot_top_x 20 -rnd_seed timestamp -save_rnd_lists false -set_max 500 -set_min 15 -zip_report false -out %s -gui false' % (gseaPath,gctPath,clsPath,gmxPath,rptLabel,gseaOutputFolder) - gseaBashFile.write(gseaCmd_top) - gseaBashFile.write('\n') + gseaBashFile.write(gseaCmd_top) + gseaBashFile.write('\n') gseaBashFile.close() os.system('bash %s' % (gseaBashFilePath)) @@ -646,7 +653,7 @@ def detectGSEAOutput(analysisName,outputFolder,top): ''' tries to detect the .xls files that show up when GSEA is done running ''' - + #first figure out the friggin output folder gseaParentFolder = '%sgsea_top_%s_c2/' % (outputFolder,top) @@ -656,7 +663,7 @@ def detectGSEAOutput(analysisName,outputFolder,top): candidateFolderList = [folder for folder in folderList if folder.count('%s_top_%s.Gsea' % (analysisName,top)) == 1] if len(candidateFolderList) > 1: - print('ERROR: MULTIPLE GSEA OUTPUT FOLDERS DETECTED FOR %s WITH TOP %s GENES' % (analysisName,string.upper(str(top)))) + print('ERROR: MULTIPLE GSEA OUTPUT FOLDERS DETECTED FOR %s WITH TOP %s GENES' % (analysisName,string.upper(top))) sys.exit() elif len(candidateFolderList) == 0: time.sleep(10) @@ -833,7 +840,8 @@ def main(): genomeDirectory=args.genomeDirectory #important here to define the window - startDict,tssCollection = loadAnnotFile(genome,window,geneList,True) + startDict,tssCollection,geneList = loadAnnotFile(genome,window,geneList,True) + print('IDENTIFIED %s valid ACTIVE GENES' % (len(geneList))) print(len(startDict)) #now we need to split the input region @@ -883,27 +891,27 @@ def main(): geneTablePath = '%s%s_GENE_TABLE.txt' % (outputFolder,analysisName) utils.unParseTable(geneTable,geneTablePath,'\t') - + + if(top > len(geneTable)): + top = 'all' + #now call the R code print('CALLING R PLOTTING SCRIPTS') callRWaterfall(geneTablePath,outputFolder,analysisName,top) #now let's call gsea print('RUNNING GSEA ON C2') - callGSEA(args.gseaPath, args.gmxPath, outputFolder,analysisName,top) - + callGSEA(args.gseaPath, args.gmxPath, outputFolder,analysisName,top) - print('DETECTING GSEA OUTPUT FOR TOP %s GENES' % (top)) - #for top - top_promoterTablePath,top_distalTablePath = detectGSEAOutput(analysisName,outputFolder,top) - - print('MAKING NES PLOTS FOR TOP %s GENES' % (top)) - callR_GSEA(top_promoterTablePath,top_distalTablePath,outputFolder,analysisName,top) + if top != 'all': + print('DETECTING GSEA OUTPUT FOR TOP %s GENES' % (top)) + top_promoterTablePath,top_distalTablePath = detectGSEAOutput(analysisName,outputFolder,top) + print('MAKING NES PLOTS FOR TOP %s GENES' % (top)) + callR_GSEA(top_promoterTablePath,top_distalTablePath,outputFolder,analysisName,top) print('DETECTING GSEA OUTPUT FOR ALL GENES') - #for top - detectGSEAOutput(analysisName,outputFolder,'all') + top_promoterTablePath,top_distalTablePath = detectGSEAOutput(analysisName,outputFolder,'all') print('MAKING NES PLOTS FOR ALL GENES') callR_GSEA(top_promoterTablePath,top_distalTablePath,outputFolder,analysisName,'all') diff --git a/enhancerPromoter_waterfall.R b/enhancerPromoter_waterfall.R index d36ee6c..67fe9fb 100644 --- a/enhancerPromoter_waterfall.R +++ b/enhancerPromoter_waterfall.R @@ -12,7 +12,7 @@ print(args) geneTablePath = args[3] outputFolder = args[4] analysisName = args[5] -top = as.numeric(args[6]) +top = args[6] #========================================================= #===========================DEBUG SECTION================= @@ -311,8 +311,9 @@ geneTable = read.delim(geneTablePath,sep='\t') plotContribution(geneTable,analysisName,outputFolder) runWaterfall(geneTable,analysisName,outputFolder) - -#top N -plotContribution(geneTable,analysisName,outputFolder,as.numeric(top)) -runWaterfall(geneTable,analysisName,outputFolder,as.numeric(top)) +if(top != 'all'){ + #top N + plotContribution(geneTable,analysisName,outputFolder,as.numeric(top)) + runWaterfall(geneTable,analysisName,outputFolder,as.numeric(top)) +} From dbd49bb699ffc46f9221c7d44580d924bf4e005b Mon Sep 17 00:00:00 2001 From: Quanhu Sheng Date: Wed, 12 Jul 2023 09:29:06 -0500 Subject: [PATCH 6/6] use genomeDirectory from arguments --- enhancerPromoter.py | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/enhancerPromoter.py b/enhancerPromoter.py index c591a8d..faff2bd 100755 --- a/enhancerPromoter.py +++ b/enhancerPromoter.py @@ -101,7 +101,7 @@ #================================================================================ -def loadAnnotFile(genome,tss_window,geneList=[],skip_cache=False): +def loadAnnotFile(genome,genomeDirectory,tss_window,geneList=[],skip_cache=False): """ load in the annotation and create a startDict and tss collection for a set of refseq IDs a given genome 20170213, add by Quanhu Sheng @@ -118,14 +118,6 @@ def loadAnnotFile(genome,tss_window,geneList=[],skip_cache=False): 'HG38': 'annotation/hg38_refseq.ucsc', } - genomeDirectoryDict = { - 'HG19':'/storage/cylin/grail/genomes/Homo_sapiens/UCSC/hg19/Sequence/Chromosomes/', - 'RN6':'/storage/cylin/grail/genomes/Rattus_norvegicus/UCSC/rn6/Sequence/Chromosomes/', - 'MM9':'/storage/cylin/grail/genomes/Mus_musculus/UCSC/mm9/Sequence/Chromosomes/', - 'MM10':'/storage/cylin/grail/genomes/Mus_musculus/UCSC/mm10/Sequence/Chromosomes/', - 'HG38': '/storage/cylin/grail/genomes/Homo_sapiens/UCSC/hg38/Sequence/Chromosomes/', - } - mouse_convert_file = '%s/annotation/HMD_HumanPhenotype.rpt' % (whereAmI) #making a dictionary for mouse to human conversion @@ -135,9 +127,6 @@ def loadAnnotFile(genome,tss_window,geneList=[],skip_cache=False): for line in mouse_convert_table: mouse_convert_dict[line[4]] = line[0] - genomeDirectory = genomeDirectoryDict[string.upper(genome)] - - #making a chrom_dict that is a list of all chroms with sequence chrom_list = utils.uniquify([name.split('.')[0] for name in os.listdir(genomeDirectory) if len(name) >0]) @@ -950,7 +939,7 @@ def main(): #Quanhu Sheng, assign valid_genes to geneList, 20230712 #important here to define the window - startDict,tssCollection,genomeDirectory,chrom_list,mouse_convert_dict,geneList = loadAnnotFile(genome,tss_window,geneList,True) + startDict,tssCollection,genomeDirectory,chrom_list,mouse_convert_dict,geneList = loadAnnotFile(genome,genomeDirectory,tss_window,geneList,True) #print(tssCollection.getOverlap(utils.Locus('chr5',171387630,171388066,'.'))) #sys.exit()