linlabcode · shengqh · Feb 7, 2017 · Feb 10, 2017 · Feb 10, 2017 · Feb 13, 2017
diff --git a/enhancerPromoter.py b/enhancerPromoter.py
@@ -43,8 +43,7 @@
 
 import sys
 import os
-print "Using python version %s" % sys.version
-
+whereAmI = os.path.dirname(os.path.realpath(__file__))
 
 #importing utils package
 #add locations of files and global parameters in this section
@@ -66,7 +65,6 @@
 import numpy
 import re
 import time
-from distutils.spawn import find_executable
 from collections import defaultdict
 
 
@@ -75,7 +73,6 @@
 #================================================================================
 
 #add locations of files and global parameters in this section
-whereAmI = os.path.dirname(os.path.realpath(__file__))
 
 pipeline_dir = whereAmI + '/'
 
@@ -104,9 +101,11 @@
 #================================================================================
 
 
-def loadAnnotFile(genome,tss_window,geneList=[],skip_cache=False):
+def loadAnnotFile(genome,genomeDirectory,tss_window,geneList=[],skip_cache=False):
     """
     load in the annotation and create a startDict and tss collection for a set of refseq IDs a given genome
+    20170213, add by Quanhu Sheng
+    return validGenes
     """
     genomeDict = {
         'HG18': 'annotation/hg18_refseq.ucsc',
@@ -119,14 +118,6 @@ def loadAnnotFile(genome,tss_window,geneList=[],skip_cache=False):
         'HG38': 'annotation/hg38_refseq.ucsc',
         }
 
-    genomeDirectoryDict = {
-        'HG19':'/storage/cylin/grail/genomes/Homo_sapiens/UCSC/hg19/Sequence/Chromosomes/',
-        'RN6':'/storage/cylin/grail/genomes/Rattus_norvegicus/UCSC/rn6/Sequence/Chromosomes/',
-        'MM9':'/storage/cylin/grail/genomes/Mus_musculus/UCSC/mm9/Sequence/Chromosomes/',
-        'MM10':'/storage/cylin/grail/genomes/Mus_musculus/UCSC/mm10/Sequence/Chromosomes/',
-        'HG38': '/storage/cylin/grail/genomes/Homo_sapiens/UCSC/hg38/Sequence/Chromosomes/',
-        }
-
     mouse_convert_file = '%s/annotation/HMD_HumanPhenotype.rpt' % (whereAmI)
 
     #making a dictionary for mouse to human conversion
@@ -136,15 +127,11 @@ def loadAnnotFile(genome,tss_window,geneList=[],skip_cache=False):
     for line in mouse_convert_table:
         mouse_convert_dict[line[4]] = line[0]
 
-    genomeDirectory = genomeDirectoryDict[string.upper(genome)]
-
-
     #making a chrom_dict that is a list of all chroms with sequence
     chrom_list = utils.uniquify([name.split('.')[0] for name in os.listdir(genomeDirectory) if len(name) >0])
 
     annotFile = whereAmI + '/' + genomeDict[string.upper(genome)]
 
-
     if not skip_cache:
         # Try loading from a cache, if the crc32 matches
         annotPathHash = zlib.crc32(annotFile) & 0xFFFFFFFF  # hash the entire location of this script
@@ -175,8 +162,13 @@ def loadAnnotFile(genome,tss_window,geneList=[],skip_cache=False):
     tssLoci =[]
     if geneList==[]:
         geneList = startDict.keys()
+    validGenes = []
     for gene in geneList:
-        tssLoci.append(utils.makeTSSLocus(gene,startDict,tss_window,tss_window))
+        if gene in startDict:
+            tssLoci.append(utils.makeTSSLocus(gene,startDict,window,window))
+            validGenes.append(gene)
+        else:
+            print('\tWARNING: gene %s not in annotation database. Ignoring.' % gene)
 
     tssCollection = utils.LocusCollection(tssLoci,50)
 
@@ -185,7 +177,7 @@ def loadAnnotFile(genome,tss_window,geneList=[],skip_cache=False):
         with open(cache_file_path, 'wb') as cache_fh:
             cPickle.dump((startDict, tssCollection), cache_fh, cPickle.HIGHEST_PROTOCOL)
 
-    return startDict, tssCollection, genomeDirectory, chrom_list, mouse_convert_dict
+    return startDict, tssCollection, genomeDirectory, chrom_list, mouse_convert_dict, validGenes
 
 
 
@@ -195,7 +187,6 @@ def splitRegions(inputGFF,tssCollection):
 
     #if even a single coordinate is shared with the +/-1kb 
     splitGFF = []
-    debugCount = 0
     for line in inputGFF:
 
         chrom = line[0]
@@ -384,7 +375,7 @@ def makePeakTable(paramDict,splitGFFPath,averageTablePath,startDict,geneList,gen
     signalTable = utils.parseTable(averageTablePath,'\t')
 
     print('LOADING CPGS ISLANDS')
-    cpgBed = utils.parseTable(paramDict['cpgPath'],'\t')
+    cpgBed = utils.parseTable(bedFile,'\t')
     cpgLoci = []
     for line in cpgBed:
         cpgLoci.append(utils.Locus(line[0],line[1],line[2],'.',line[-1]))
@@ -397,8 +388,9 @@ def makePeakTable(paramDict,splitGFFPath,averageTablePath,startDict,geneList,gen
     tss_prox_loci = []
     tss_distal_loci = []
     for refID in geneList:
-        tss_prox_loci.append(utils.makeTSSLocus(refID,startDict,tss_window,tss_window))
-        tss_distal_loci.append(utils.makeTSSLocus(refID,startDict,distal_window,distal_window))
+        if refID in startDict:
+          tss_prox_loci.append(utils.makeTSSLocus(refID,startDict,tss_window,tss_window))
+          tss_distal_loci.append(utils.makeTSSLocus(refID,startDict,distal_window,distal_window))
 
     #make a 1kb flanking and 50kb flanking collection
     tss_prox_collection = utils.LocusCollection(tss_prox_loci,50)
@@ -680,10 +672,18 @@ def callGSEA(outputFolder,analysisName,top,analysis_type ='enhancer_vs_promoter'
     gseaOutputFolder = utils.formatFolder('%sgsea_top_all_c2%s' % (outputFolder,suffix),True)
     rptLabel = '%s_top_all%s' % (analysisName,suffix)
 
+    gseaBashFile.write('rm -rf %s/%s.Gsea* \n' % (gseaOutputFolder, rptLabel))
     gseaCmd_all = 'java -Xmx4000m -cp %s xtools.gsea.Gsea -res %s -cls %s%s -gmx %s -collapse false -mode Max_probe -norm meandiv -nperm 1000 -permute gene_set -rnd_type no_balance -scoring_scheme weighted -rpt_label %s -metric Diff_of_Classes -sort real -order descending -include_only_symbols true -make_sets true -median false -num 100 -plot_top_x 20 -rnd_seed timestamp -save_rnd_lists false -set_max 500 -set_min 15 -zip_report false -out %s -gui false' % (gseaPath,gctPath,clsPath,analysis_dict[analysis_type][1],gmxPath,rptLabel,gseaOutputFolder)
 
     gseaBashFile.write(gseaCmd_all)
     gseaBashFile.write('\n')
+
+    if top != 'all':
+      #for top N
+      gctPath = '%s%s_top_%s.gct' % (outputFolder,analysisName,top)
+      clsPath = '%s%s_top_%s.cls' % (outputFolder,analysisName,top)
+      gseaOutputFolder = utils.formatFolder('%sgsea_top_%s_c2' % (outputFolder,top),True)
+      rptLabel = '%s_top_%s' % (analysisName,top)
 
     if use_top:
         #for top N
@@ -692,6 +692,7 @@ def callGSEA(outputFolder,analysisName,top,analysis_type ='enhancer_vs_promoter'
         gseaOutputFolder = utils.formatFolder('%sgsea_top_%s_c2%s' % (outputFolder,top,suffix),True)
         rptLabel = '%s_top_%s%s' % (analysisName,top,suffix)
 
+        gseaBashFile.write('rm -rf %s/%s.Gsea* \n' % (gseaOutputFolder, rptLabel))
         gseaCmd_top = 'java -Xmx4000m -cp %s xtools.gsea.Gsea -res %s -cls %s%s -gmx %s -collapse false -mode Max_probe -norm meandiv -nperm 1000 -permute gene_set -rnd_type no_balance -scoring_scheme weighted -rpt_label %s -metric Diff_of_Classes -sort real -order descending -include_only_symbols true -make_sets true -median false -num 100 -plot_top_x 20 -rnd_seed timestamp -save_rnd_lists false -set_max 500 -set_min 15 -zip_report false -out %s -gui false' % (gseaPath,gctPath,clsPath,analysis_dict[analysis_type][1],gmxPath,rptLabel,gseaOutputFolder)
 
         gseaBashFile.write(gseaCmd_top)
@@ -728,7 +729,7 @@ def detectGSEAOutput(analysisName,outputFolder,top,analysis_type ='enhancer_vs_p
 
         candidateFolderList = [folder for folder in folderList if folder.count('%s_top_%s%s.Gsea' % (analysisName,top,suffix)) == 1]
         if len(candidateFolderList) > 1:
-            print('ERROR: MULTIPLE GSEA OUTPUT FOLDERS DETECTED FOR %s WITH TOP %s GENES' % (analysisName,string.upper(str(top))))
+            print('ERROR: MULTIPLE GSEA OUTPUT FOLDERS DETECTED FOR %s WITH TOP %s GENES' % (analysisName,string.upper(top)))
             sys.exit()
         elif len(candidateFolderList) == 0:
             time.sleep(10)
@@ -838,7 +839,14 @@ def main():
     parser.add_argument("--tads", dest="tads", type=str,
                         help="Include a .bed of tad regions to restrict enhancer/gene association", required=False,default=None)
 
-
+    #add by Quanhu Sheng
+    parser.add_argument("--genomeDirectory", dest="genomeDirectory", type=str,
+                        help="Enter the folder contains chromosome sequence in fasta format", required=True)
+    #gseaPath = '/usr/local/bin/gsea/gsea2-2.2.2.jar'
+    #gmxPath = '/grail/annotations/gsea/c2.all.v5.1.symbols.gmt' #C2 set
+    parser.add_argument("--gseaPath", dest="gseaPath", type=str, help="Enter GSEA jar file location", required=True)
+    parser.add_argument("--gmxPath", dest="gmxPath", type=str, help="Enter GSEA gmt file location, such as c2.all.v5.1.symbols.gmt", required=True)
+    parser.add_argument("--cpgPath", dest="cpgPath", type=str, help="Enter cpg coordinates in bed format", required=True)
 
 
 
@@ -858,7 +866,6 @@ def main():
         #top analysis subset
         top = args.top
 
-
         #input genome
         genome = args.genome.upper()
         print('PERFORMING ANALYSIS ON %s GENOME BUILD' % (genome))
@@ -922,16 +929,17 @@ def main():
         #check if tads are being invoked
         if args.tads:
             print('LOADING TAD LOCATIONS FROM %s' % (args.tads))
-            use_tads = True
             tads_path = args.tads
         else:
-            use_tads = False
             tads_path = ''
 
         print('LOADING ANNOTATION DATA FOR GENOME %s' % (genome))
 
+        genomeDirectory=args.genomeDirectory
+
+        #Quanhu Sheng, assign valid_genes to geneList, 20230712
         #important here to define the window
-        startDict,tssCollection,genomeDirectory,chrom_list,mouse_convert_dict = loadAnnotFile(genome,tss_window,geneList,True)
+        startDict,tssCollection,genomeDirectory,chrom_list,mouse_convert_dict,geneList = loadAnnotFile(genome,genomeDirectory,tss_window,geneList,True)
         #print(tssCollection.getOverlap(utils.Locus('chr5',171387630,171388066,'.')))
         #sys.exit()
 
@@ -1054,10 +1062,11 @@ def main():
             callR_GSEA(top_promoterTablePath,top_distalTablePath,outputFolder,analysisName+'_enhancer_vs_promoter',top)
             callR_GSEA(top_signalTablePath,top_backgroundTablePath,outputFolder,analysisName+'_total_contribution',top)
 
+          print('MAKING NES PLOTS FOR TOP %s GENES' % (top))
+          callR_GSEA(top_promoterTablePath,top_distalTablePath,outputFolder,analysisName,top)
 
         print('DETECTING GSEA OUTPUT FOR ALL GENES')
-        #for top 
-        all_promoterTablePath,all_distalTablePath = detectGSEAOutput(analysisName,outputFolder,'all')
+        top_promoterTablePath,top_distalTablePath = detectGSEAOutput(analysisName,outputFolder,'all')
 
         print('MAKING NES PLOTS FOR ALL GENES')
         callR_GSEA(all_promoterTablePath,all_distalTablePath,outputFolder,analysisName,'all')

diff --git a/enhancerPromoter_waterfall.R b/enhancerPromoter_waterfall.R
@@ -44,8 +44,7 @@ plotContribution <- function(geneTable,analysisName,outputFolder,top=0,nBins=100
 	if(top == 0){
 		top = nrow(geneTable)
 		topString = 'all'
-	}
-	else if(top > nrow(geneTable)){
+	}else if(top > nrow(geneTable)){
 		top = nrow(geneTable)
 		topString = 'all'			
 	}else{
@@ -127,11 +126,11 @@ runWaterfall <- function(geneTable,analysisName,outputFolder,top=0,geneList = c(
 	if(top == 0){
 		top = nrow(geneTable)
 		topString = 'all'
-	}else if(top > nrow(geneTable)){
-		top = nrow(geneTable)
-		topString = 'all'			
-	}else{
-		topString = as.character(top)
+	}else {
+    topString = as.character(top)
+	  if(top > nrow(geneTable)){
+	    top = nrow(geneTable)
+	  }
 	}
 	#get the signal for tss, distal and total
 	promoterSignal =geneTable[,2]
@@ -342,9 +341,9 @@ geneTable = read.delim(geneTablePath,sep='\t')
 plotContribution(geneTable,analysisName,outputFolder)
 runWaterfall(geneTable,analysisName,outputFolder)
 
-
 #top N
 print('working on top genes')
 print(top)
-plotContribution(geneTable,analysisName,outputFolder,as.numeric(top)) 
-runWaterfall(geneTable,analysisName,outputFolder,as.numeric(top))
+plotContribution(geneTable,analysisName,outputFolder,top)
+runWaterfall(geneTable,analysisName,outputFolder,top)
+