diff --git a/enhancerPromoter.py b/enhancerPromoter.py index e58b563..faff2bd 100755 --- a/enhancerPromoter.py +++ b/enhancerPromoter.py @@ -43,8 +43,7 @@ import sys import os -print "Using python version %s" % sys.version - +whereAmI = os.path.dirname(os.path.realpath(__file__)) #importing utils package #add locations of files and global parameters in this section @@ -66,7 +65,6 @@ import numpy import re import time -from distutils.spawn import find_executable from collections import defaultdict @@ -75,7 +73,6 @@ #================================================================================ #add locations of files and global parameters in this section -whereAmI = os.path.dirname(os.path.realpath(__file__)) pipeline_dir = whereAmI + '/' @@ -104,9 +101,11 @@ #================================================================================ -def loadAnnotFile(genome,tss_window,geneList=[],skip_cache=False): +def loadAnnotFile(genome,genomeDirectory,tss_window,geneList=[],skip_cache=False): """ load in the annotation and create a startDict and tss collection for a set of refseq IDs a given genome + 20170213, add by Quanhu Sheng + return validGenes """ genomeDict = { 'HG18': 'annotation/hg18_refseq.ucsc', @@ -119,14 +118,6 @@ def loadAnnotFile(genome,tss_window,geneList=[],skip_cache=False): 'HG38': 'annotation/hg38_refseq.ucsc', } - genomeDirectoryDict = { - 'HG19':'/storage/cylin/grail/genomes/Homo_sapiens/UCSC/hg19/Sequence/Chromosomes/', - 'RN6':'/storage/cylin/grail/genomes/Rattus_norvegicus/UCSC/rn6/Sequence/Chromosomes/', - 'MM9':'/storage/cylin/grail/genomes/Mus_musculus/UCSC/mm9/Sequence/Chromosomes/', - 'MM10':'/storage/cylin/grail/genomes/Mus_musculus/UCSC/mm10/Sequence/Chromosomes/', - 'HG38': '/storage/cylin/grail/genomes/Homo_sapiens/UCSC/hg38/Sequence/Chromosomes/', - } - mouse_convert_file = '%s/annotation/HMD_HumanPhenotype.rpt' % (whereAmI) #making a dictionary for mouse to human conversion @@ -136,15 +127,11 @@ def loadAnnotFile(genome,tss_window,geneList=[],skip_cache=False): for line in mouse_convert_table: mouse_convert_dict[line[4]] = line[0] - genomeDirectory = genomeDirectoryDict[string.upper(genome)] - - #making a chrom_dict that is a list of all chroms with sequence chrom_list = utils.uniquify([name.split('.')[0] for name in os.listdir(genomeDirectory) if len(name) >0]) annotFile = whereAmI + '/' + genomeDict[string.upper(genome)] - if not skip_cache: # Try loading from a cache, if the crc32 matches annotPathHash = zlib.crc32(annotFile) & 0xFFFFFFFF # hash the entire location of this script @@ -175,8 +162,13 @@ def loadAnnotFile(genome,tss_window,geneList=[],skip_cache=False): tssLoci =[] if geneList==[]: geneList = startDict.keys() + validGenes = [] for gene in geneList: - tssLoci.append(utils.makeTSSLocus(gene,startDict,tss_window,tss_window)) + if gene in startDict: + tssLoci.append(utils.makeTSSLocus(gene,startDict,window,window)) + validGenes.append(gene) + else: + print('\tWARNING: gene %s not in annotation database. Ignoring.' % gene) tssCollection = utils.LocusCollection(tssLoci,50) @@ -185,7 +177,7 @@ def loadAnnotFile(genome,tss_window,geneList=[],skip_cache=False): with open(cache_file_path, 'wb') as cache_fh: cPickle.dump((startDict, tssCollection), cache_fh, cPickle.HIGHEST_PROTOCOL) - return startDict, tssCollection, genomeDirectory, chrom_list, mouse_convert_dict + return startDict, tssCollection, genomeDirectory, chrom_list, mouse_convert_dict, validGenes @@ -195,7 +187,6 @@ def splitRegions(inputGFF,tssCollection): #if even a single coordinate is shared with the +/-1kb splitGFF = [] - debugCount = 0 for line in inputGFF: chrom = line[0] @@ -384,7 +375,7 @@ def makePeakTable(paramDict,splitGFFPath,averageTablePath,startDict,geneList,gen signalTable = utils.parseTable(averageTablePath,'\t') print('LOADING CPGS ISLANDS') - cpgBed = utils.parseTable(paramDict['cpgPath'],'\t') + cpgBed = utils.parseTable(bedFile,'\t') cpgLoci = [] for line in cpgBed: cpgLoci.append(utils.Locus(line[0],line[1],line[2],'.',line[-1])) @@ -397,8 +388,9 @@ def makePeakTable(paramDict,splitGFFPath,averageTablePath,startDict,geneList,gen tss_prox_loci = [] tss_distal_loci = [] for refID in geneList: - tss_prox_loci.append(utils.makeTSSLocus(refID,startDict,tss_window,tss_window)) - tss_distal_loci.append(utils.makeTSSLocus(refID,startDict,distal_window,distal_window)) + if refID in startDict: + tss_prox_loci.append(utils.makeTSSLocus(refID,startDict,tss_window,tss_window)) + tss_distal_loci.append(utils.makeTSSLocus(refID,startDict,distal_window,distal_window)) #make a 1kb flanking and 50kb flanking collection tss_prox_collection = utils.LocusCollection(tss_prox_loci,50) @@ -680,10 +672,18 @@ def callGSEA(outputFolder,analysisName,top,analysis_type ='enhancer_vs_promoter' gseaOutputFolder = utils.formatFolder('%sgsea_top_all_c2%s' % (outputFolder,suffix),True) rptLabel = '%s_top_all%s' % (analysisName,suffix) + gseaBashFile.write('rm -rf %s/%s.Gsea* \n' % (gseaOutputFolder, rptLabel)) gseaCmd_all = 'java -Xmx4000m -cp %s xtools.gsea.Gsea -res %s -cls %s%s -gmx %s -collapse false -mode Max_probe -norm meandiv -nperm 1000 -permute gene_set -rnd_type no_balance -scoring_scheme weighted -rpt_label %s -metric Diff_of_Classes -sort real -order descending -include_only_symbols true -make_sets true -median false -num 100 -plot_top_x 20 -rnd_seed timestamp -save_rnd_lists false -set_max 500 -set_min 15 -zip_report false -out %s -gui false' % (gseaPath,gctPath,clsPath,analysis_dict[analysis_type][1],gmxPath,rptLabel,gseaOutputFolder) gseaBashFile.write(gseaCmd_all) gseaBashFile.write('\n') + + if top != 'all': + #for top N + gctPath = '%s%s_top_%s.gct' % (outputFolder,analysisName,top) + clsPath = '%s%s_top_%s.cls' % (outputFolder,analysisName,top) + gseaOutputFolder = utils.formatFolder('%sgsea_top_%s_c2' % (outputFolder,top),True) + rptLabel = '%s_top_%s' % (analysisName,top) if use_top: #for top N @@ -692,6 +692,7 @@ def callGSEA(outputFolder,analysisName,top,analysis_type ='enhancer_vs_promoter' gseaOutputFolder = utils.formatFolder('%sgsea_top_%s_c2%s' % (outputFolder,top,suffix),True) rptLabel = '%s_top_%s%s' % (analysisName,top,suffix) + gseaBashFile.write('rm -rf %s/%s.Gsea* \n' % (gseaOutputFolder, rptLabel)) gseaCmd_top = 'java -Xmx4000m -cp %s xtools.gsea.Gsea -res %s -cls %s%s -gmx %s -collapse false -mode Max_probe -norm meandiv -nperm 1000 -permute gene_set -rnd_type no_balance -scoring_scheme weighted -rpt_label %s -metric Diff_of_Classes -sort real -order descending -include_only_symbols true -make_sets true -median false -num 100 -plot_top_x 20 -rnd_seed timestamp -save_rnd_lists false -set_max 500 -set_min 15 -zip_report false -out %s -gui false' % (gseaPath,gctPath,clsPath,analysis_dict[analysis_type][1],gmxPath,rptLabel,gseaOutputFolder) gseaBashFile.write(gseaCmd_top) @@ -728,7 +729,7 @@ def detectGSEAOutput(analysisName,outputFolder,top,analysis_type ='enhancer_vs_p candidateFolderList = [folder for folder in folderList if folder.count('%s_top_%s%s.Gsea' % (analysisName,top,suffix)) == 1] if len(candidateFolderList) > 1: - print('ERROR: MULTIPLE GSEA OUTPUT FOLDERS DETECTED FOR %s WITH TOP %s GENES' % (analysisName,string.upper(str(top)))) + print('ERROR: MULTIPLE GSEA OUTPUT FOLDERS DETECTED FOR %s WITH TOP %s GENES' % (analysisName,string.upper(top))) sys.exit() elif len(candidateFolderList) == 0: time.sleep(10) @@ -838,7 +839,14 @@ def main(): parser.add_argument("--tads", dest="tads", type=str, help="Include a .bed of tad regions to restrict enhancer/gene association", required=False,default=None) - + #add by Quanhu Sheng + parser.add_argument("--genomeDirectory", dest="genomeDirectory", type=str, + help="Enter the folder contains chromosome sequence in fasta format", required=True) + #gseaPath = '/usr/local/bin/gsea/gsea2-2.2.2.jar' + #gmxPath = '/grail/annotations/gsea/c2.all.v5.1.symbols.gmt' #C2 set + parser.add_argument("--gseaPath", dest="gseaPath", type=str, help="Enter GSEA jar file location", required=True) + parser.add_argument("--gmxPath", dest="gmxPath", type=str, help="Enter GSEA gmt file location, such as c2.all.v5.1.symbols.gmt", required=True) + parser.add_argument("--cpgPath", dest="cpgPath", type=str, help="Enter cpg coordinates in bed format", required=True) @@ -858,7 +866,6 @@ def main(): #top analysis subset top = args.top - #input genome genome = args.genome.upper() print('PERFORMING ANALYSIS ON %s GENOME BUILD' % (genome)) @@ -922,16 +929,17 @@ def main(): #check if tads are being invoked if args.tads: print('LOADING TAD LOCATIONS FROM %s' % (args.tads)) - use_tads = True tads_path = args.tads else: - use_tads = False tads_path = '' print('LOADING ANNOTATION DATA FOR GENOME %s' % (genome)) + genomeDirectory=args.genomeDirectory + + #Quanhu Sheng, assign valid_genes to geneList, 20230712 #important here to define the window - startDict,tssCollection,genomeDirectory,chrom_list,mouse_convert_dict = loadAnnotFile(genome,tss_window,geneList,True) + startDict,tssCollection,genomeDirectory,chrom_list,mouse_convert_dict,geneList = loadAnnotFile(genome,genomeDirectory,tss_window,geneList,True) #print(tssCollection.getOverlap(utils.Locus('chr5',171387630,171388066,'.'))) #sys.exit() @@ -1054,10 +1062,11 @@ def main(): callR_GSEA(top_promoterTablePath,top_distalTablePath,outputFolder,analysisName+'_enhancer_vs_promoter',top) callR_GSEA(top_signalTablePath,top_backgroundTablePath,outputFolder,analysisName+'_total_contribution',top) + print('MAKING NES PLOTS FOR TOP %s GENES' % (top)) + callR_GSEA(top_promoterTablePath,top_distalTablePath,outputFolder,analysisName,top) print('DETECTING GSEA OUTPUT FOR ALL GENES') - #for top - all_promoterTablePath,all_distalTablePath = detectGSEAOutput(analysisName,outputFolder,'all') + top_promoterTablePath,top_distalTablePath = detectGSEAOutput(analysisName,outputFolder,'all') print('MAKING NES PLOTS FOR ALL GENES') callR_GSEA(all_promoterTablePath,all_distalTablePath,outputFolder,analysisName,'all') diff --git a/enhancerPromoter_waterfall.R b/enhancerPromoter_waterfall.R index 064a7fb..1d4390e 100755 --- a/enhancerPromoter_waterfall.R +++ b/enhancerPromoter_waterfall.R @@ -44,8 +44,7 @@ plotContribution <- function(geneTable,analysisName,outputFolder,top=0,nBins=100 if(top == 0){ top = nrow(geneTable) topString = 'all' - } - else if(top > nrow(geneTable)){ + }else if(top > nrow(geneTable)){ top = nrow(geneTable) topString = 'all' }else{ @@ -127,11 +126,11 @@ runWaterfall <- function(geneTable,analysisName,outputFolder,top=0,geneList = c( if(top == 0){ top = nrow(geneTable) topString = 'all' - }else if(top > nrow(geneTable)){ - top = nrow(geneTable) - topString = 'all' - }else{ - topString = as.character(top) + }else { + topString = as.character(top) + if(top > nrow(geneTable)){ + top = nrow(geneTable) + } } #get the signal for tss, distal and total promoterSignal =geneTable[,2] @@ -342,9 +341,9 @@ geneTable = read.delim(geneTablePath,sep='\t') plotContribution(geneTable,analysisName,outputFolder) runWaterfall(geneTable,analysisName,outputFolder) - #top N print('working on top genes') print(top) -plotContribution(geneTable,analysisName,outputFolder,as.numeric(top)) -runWaterfall(geneTable,analysisName,outputFolder,as.numeric(top)) +plotContribution(geneTable,analysisName,outputFolder,top) +runWaterfall(geneTable,analysisName,outputFolder,top) +