From 0d36cc37ba716197a3228459f562ef41ee8cfbaa Mon Sep 17 00:00:00 2001
From: shengq1 <quanhu.sheng@vanderbilt.edu>
Date: Tue, 7 Feb 2017 14:26:14 -0600
Subject: [PATCH 1/6] change absolute path to option of main function

---
 enhancerPromoter.py | 68 ++++++++++++++++-----------------------------
 1 file changed, 24 insertions(+), 44 deletions(-)

diff --git a/enhancerPromoter.py b/enhancerPromoter.py
index de18fa0..59f7efc 100644
--- a/enhancerPromoter.py
+++ b/enhancerPromoter.py
@@ -45,23 +45,21 @@
 
 print "Using python version %s" % sys.version
 
+import os
+whereAmI = os.path.dirname(os.path.realpath(__file__))
 
 #importing utils package
-sys.path.append('/home/chazlin/src/pipeline/')
+sys.path.append(whereAmI)
 
 import argparse
 import cPickle
 import utils
-import pipeline_dfci
-import subprocess
-import os
 import string
 import tempfile
 import zlib
 import numpy
 import re
 import time
-from distutils.spawn import find_executable
 from collections import defaultdict
 
 
@@ -70,21 +68,9 @@
 #================================================================================
 
 #add locations of files and global parameters in this section
-whereAmI = os.path.dirname(os.path.realpath(__file__))
 
 bamliquidator_path = 'bamliquidator_batch.py'
 
-#using a paramater dictionary in liue of a yaml or json for now
-
-
-
-paramDict = {'cpgPath': '/grail/projects/mycn_cyl/beds/cpg_islands.bed',
-
-             
-             }
-
-
-
 #================================================================================
 #===================================CLASSES======================================
 #================================================================================
@@ -110,16 +96,8 @@ def loadAnnotFile(genome,window,geneList=[],skip_cache=False):
         'RN6': 'annotation/rn6_refseq.ucsc',
         }
 
-    genomeDirectoryDict = {
-        'HG19':'/grail/genomes/Homo_sapiens/UCSC/hg19/Sequence/Chromosomes/',
-        'RN6':'/grail/genomes/Rattus_norvegicus/UCSC/rn6/Sequence/Chromosomes/',
-        }
-        
-    genomeDirectory = genomeDirectoryDict[string.upper(genome)]
-
     annotFile = whereAmI + '/' + genomeDict[string.upper(genome)]
 
-
     if not skip_cache:
         # Try loading from a cache, if the crc32 matches
         annotPathHash = zlib.crc32(annotFile) & 0xFFFFFFFF  # hash the entire location of this script
@@ -158,7 +136,7 @@ def loadAnnotFile(genome,window,geneList=[],skip_cache=False):
         with open(cache_file_path, 'wb') as cache_fh:
             cPickle.dump((startDict, tssCollection), cache_fh, cPickle.HIGHEST_PROTOCOL)
 
-    return startDict, tssCollection, genomeDirectory
+    return startDict, tssCollection
 
 
 
@@ -168,7 +146,6 @@ def splitRegions(inputGFF,tssCollection):
 
     #if even a single coordinate is shared with the +/-1kb 
     splitGFF = []
-    debugCount = 0
     for line in inputGFF:
 
         chrom = line[0]
@@ -341,7 +318,7 @@ def makeAverageTable(outputFolder,analysisName,useBackground = False):
 
 
 
-def makePeakTable(paramDict,splitGFFPath,averageTablePath,startDict,geneList,genomeDirectory,tads_path=''):
+def makePeakTable(bedFile,splitGFFPath,averageTablePath,startDict,geneList,genomeDirectory,tads_path=''):
     
     '''
     makes the final peak table with ebox info
@@ -357,7 +334,7 @@ def makePeakTable(paramDict,splitGFFPath,averageTablePath,startDict,geneList,gen
     signalTable = utils.parseTable(averageTablePath,'\t')
 
     print('LOADING CPGS ISLANDS')
-    cpgBed = utils.parseTable(paramDict['cpgPath'],'\t')
+    cpgBed = utils.parseTable(bedFile,'\t')
     cpgLoci = []
     for line in cpgBed:
         cpgLoci.append(utils.Locus(line[0],line[1],line[2],'.',line[-1]))
@@ -617,13 +594,13 @@ def callRWaterfall(geneTablePath,outputFolder,analysisName,top):
         sys.exit()
 
 
-def callGSEA(outputFolder,analysisName,top):
+def callGSEA(gseaPath, gmxPath, outputFolder,analysisName,top):
 
     '''
     runs C2 GSEA
     '''
-    gseaPath = '/usr/local/bin/gsea/gsea2-2.2.2.jar'
-    gmxPath = '/grail/annotations/gsea/c2.all.v5.1.symbols.gmt' #C2 set
+    #gseaPath = '/usr/local/bin/gsea/gsea2-2.2.2.jar'
+    #gmxPath = '/grail/annotations/gsea/c2.all.v5.1.symbols.gmt' #C2 set
 
 
     gseaBashFilePath = '%s%s_GSEA_cmd.sh' % (outputFolder,analysisName)
@@ -749,8 +726,6 @@ def main():
                         help="Enter .gff or .bed file of regions to analyze", required=True)
     parser.add_argument("-g", "--genome", dest="genome", type=str,
                         help="specify a genome, HG18,HG19,MM8,MM9,MM10,RN6 are currently supported", required=True)
-    
-
     # output flag
     parser.add_argument("-o", "--output", dest="output", type=str,
                         help="Enter the output folder.", required=True)
@@ -776,19 +751,24 @@ def main():
     parser.add_argument("--tads", dest="tads", type=str,
                         help="Include a .bed of tad regions to restrict enhancer/gene association", required=False,default=None)
 
-
+    #add by Quanhu Sheng
+    parser.add_argument("--genomeDirectory", dest="genomeDirectory", type=str,
+                        help="Enter the folder contains chromosome sequence in fasta format", required=True)
+    #gseaPath = '/usr/local/bin/gsea/gsea2-2.2.2.jar'
+    #gmxPath = '/grail/annotations/gsea/c2.all.v5.1.symbols.gmt' #C2 set
+    parser.add_argument("--gseaPath", dest="gseaPath", type=str, help="Enter GSEA jar file location", required=True)
+    parser.add_argument("--gmxPath", dest="gmxPath", type=str, help="Enter GSEA gmt file location, such as c2.all.v5.1.symbols.gmt", required=True)
+    parser.add_argument("--cpgPath", dest="cpgPath", type=str, help="Enter cpg coordinates in bed format", required=True)
 
     args = parser.parse_args()
 
     print(args)
 
     #minimum arguments needed to proceed
-    if args.bam and args.input and args.genome and args.output:
-
+    if args.bam and args.input and args.genome and args.genomeDirectory and args.output and args.gseaPath and args.gmxPath and args.cpgPath:
         #top analysis subset
         top = args.top
 
-
         #input genome
         genome = args.genome
         print('PERFORMING ANALYSIS ON %s GENOME BUILD' % (genome))
@@ -840,16 +820,16 @@ def main():
         #check if tads are being invoked
         if args.tads:
             print('LOADING TAD LOCATIONS FROM %s' % (args.tads))
-            use_tads = True
             tads_path = args.tads
         else:
-            use_tads = False
             tads_path = ''
 
         print('LOADING ANNOTATION DATA FOR GENOME %s' % (genome))
         
+        genomeDirectory=args.genomeDirectory
+        
         #important here to define the window
-        startDict,tssCollection,genomeDirectory = loadAnnotFile(genome,window,geneList,True)
+        startDict,tssCollection = loadAnnotFile(genome,window,geneList,True)
         print(len(startDict))
 
         #now we need to split the input region 
@@ -892,7 +872,7 @@ def main():
             print('PEAK TABLE OUTPUT ALREADY EXISTS')
             peakTable = utils.parseTable(peakTablePath,'\t')
         else:
-            peakTable = makePeakTable(paramDict,splitGFFPath,averageTablePath,startDict,geneList,genomeDirectory,tads_path)        
+            peakTable = makePeakTable(args.cpgPath,splitGFFPath,averageTablePath,startDict,geneList,genomeDirectory,tads_path)        
             utils.unParseTable(peakTable,peakTablePath,'\t')
 
         geneTable = makeGeneTable(peakTable,analysisName)        
@@ -906,7 +886,7 @@ def main():
 
         #now let's call gsea
         print('RUNNING GSEA ON C2')
-        callGSEA(outputFolder,analysisName,top)        
+        callGSEA(args.gseaPath, args.gmxPath, outputFolder,analysisName,top)        
 
         
         print('DETECTING GSEA OUTPUT FOR TOP %s GENES' % (top))
@@ -919,7 +899,7 @@ def main():
 
         print('DETECTING GSEA OUTPUT FOR ALL GENES')
         #for top 
-        all_promoterTablePath,all_distalTablePath = detectGSEAOutput(analysisName,outputFolder,'all')
+        detectGSEAOutput(analysisName,outputFolder,'all')
 
         print('MAKING NES PLOTS FOR ALL GENES')
         callR_GSEA(top_promoterTablePath,top_distalTablePath,outputFolder,analysisName,'all')

From e63f9fd4496d4dbf872c9fb8f1c67a63facd297f Mon Sep 17 00:00:00 2001
From: shengq1 <quanhu.sheng@vanderbilt.edu>
Date: Fri, 10 Feb 2017 11:41:08 -0600
Subject: [PATCH 2/6] for refgeneid in activity file which not in the
 annotation database, print warning message and ignore it.

---
 enhancerPromoter.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/enhancerPromoter.py b/enhancerPromoter.py
index 59f7efc..15b3fe9 100644
--- a/enhancerPromoter.py
+++ b/enhancerPromoter.py
@@ -127,7 +127,10 @@ def loadAnnotFile(genome,window,geneList=[],skip_cache=False):
     startDict = utils.makeStartDict(annotFile, geneList)
     tssLoci =[]
     for gene in geneList:
-        tssLoci.append(utils.makeTSSLocus(gene,startDict,window,window))
+        if gene in startDict:
+            tssLoci.append(utils.makeTSSLocus(gene,startDict,window,window))
+        else:
+            print('\tWARNING: gene %s not in annotation database. Ignoring.' % gene)
 
     tssCollection = utils.LocusCollection(tssLoci,50)
 

From 5ccfc19d713ca34953127a40f787195c54381b2d Mon Sep 17 00:00:00 2001
From: shengq1 <quanhu.sheng@vanderbilt.edu>
Date: Fri, 10 Feb 2017 11:43:44 -0600
Subject: [PATCH 3/6] ignore the refseqid not in annotation database

---
 enhancerPromoter.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/enhancerPromoter.py b/enhancerPromoter.py
index 15b3fe9..f2405b8 100644
--- a/enhancerPromoter.py
+++ b/enhancerPromoter.py
@@ -350,8 +350,9 @@ def makePeakTable(bedFile,splitGFFPath,averageTablePath,startDict,geneList,genom
     tss_1kb_loci = []
     tss_50kb_loci = []
     for refID in geneList:
-        tss_1kb_loci.append(utils.makeTSSLocus(refID,startDict,1000,1000))
-        tss_50kb_loci.append(utils.makeTSSLocus(refID,startDict,50000,50000))
+        if refID in startDict:
+            tss_1kb_loci.append(utils.makeTSSLocus(refID,startDict,1000,1000))
+            tss_50kb_loci.append(utils.makeTSSLocus(refID,startDict,50000,50000))
 
     #make a 1kb flanking and 50kb flanking collection
     tss_1kb_collection = utils.LocusCollection(tss_1kb_loci,50)

From 8f164220294dc399fbc44e8dbcffac8b1e326717 Mon Sep 17 00:00:00 2001
From: shengq1 <quanhu.sheng@vanderbilt.edu>
Date: Mon, 13 Feb 2017 13:07:12 -0600
Subject: [PATCH 4/6] bugfix

---
 enhancerPromoter_waterfall.R | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/enhancerPromoter_waterfall.R b/enhancerPromoter_waterfall.R
index 803c7bc..d36ee6c 100644
--- a/enhancerPromoter_waterfall.R
+++ b/enhancerPromoter_waterfall.R
@@ -44,8 +44,7 @@ plotContribution <- function(geneTable,analysisName,outputFolder,top=0,nBins=100
 	if(top == 0){
 		top = nrow(geneTable)
 		topString = 'all'
-	}
-	else if(top > nrow(geneTable)){
+	}else if(top > nrow(geneTable)){
 		top = nrow(geneTable)
 		topString = 'all'			
 	}else{
@@ -127,11 +126,11 @@ runWaterfall <- function(geneTable,analysisName,outputFolder,top=0,geneList = c(
 	if(top == 0){
 		top = nrow(geneTable)
 		topString = 'all'
-	}else if(top > nrow(geneTable)){
-		top = nrow(geneTable)
-		topString = 'all'			
-	}else{
-		topString = as.character(top)
+	}else {
+    topString = as.character(top)
+	  if(top > nrow(geneTable)){
+	    top = nrow(geneTable)
+	  }
 	}
 
 	#get the signal for tss, distal and total

From c2f7e5c76516b88842c8454bd4ae28f0123c7609 Mon Sep 17 00:00:00 2001
From: shengq1 <quanhu.sheng@vanderbilt.edu>
Date: Mon, 13 Feb 2017 15:25:41 -0600
Subject: [PATCH 5/6] compatible with top gene less than user assigned value
 (or default 5000). Remove previous gsea folder before running gsea analysis.

---
 enhancerPromoter.py          | 56 ++++++++++++++++++++----------------
 enhancerPromoter_waterfall.R | 11 +++----
 2 files changed, 38 insertions(+), 29 deletions(-)

diff --git a/enhancerPromoter.py b/enhancerPromoter.py
index f2405b8..a1d01f5 100644
--- a/enhancerPromoter.py
+++ b/enhancerPromoter.py
@@ -85,6 +85,8 @@
 def loadAnnotFile(genome,window,geneList=[],skip_cache=False):
     """
     load in the annotation and create a startDict and tss collection for a set of refseq IDs a given genome
+    20170213, add by Quanhu Sheng
+    return validGenes
     """
     genomeDict = {
         'HG18': 'annotation/hg18_refseq.ucsc',
@@ -126,9 +128,11 @@ def loadAnnotFile(genome,window,geneList=[],skip_cache=False):
 
     startDict = utils.makeStartDict(annotFile, geneList)
     tssLoci =[]
+    validGenes = []
     for gene in geneList:
         if gene in startDict:
             tssLoci.append(utils.makeTSSLocus(gene,startDict,window,window))
+            validGenes.append(gene)
         else:
             print('\tWARNING: gene %s not in annotation database. Ignoring.' % gene)
 
@@ -139,7 +143,7 @@ def loadAnnotFile(genome,window,geneList=[],skip_cache=False):
         with open(cache_file_path, 'wb') as cache_fh:
             cPickle.dump((startDict, tssCollection), cache_fh, cPickle.HIGHEST_PROTOCOL)
 
-    return startDict, tssCollection
+    return startDict, tssCollection, validGenes
 
 
 
@@ -621,21 +625,24 @@ def callGSEA(gseaPath, gmxPath, outputFolder,analysisName,top):
     gseaOutputFolder = utils.formatFolder('%sgsea_top_all_c2' % (outputFolder),True)
     rptLabel = '%s_top_all' % (analysisName)
 
+    gseaBashFile.write('rm -rf %s/%s.Gsea* \n' % (gseaOutputFolder, rptLabel))
     gseaCmd_all = 'java -Xmx4000m -cp %s xtools.gsea.Gsea -res %s -cls %s#PROMOTER_versus_DISTAL -gmx %s -collapse false -mode Max_probe -norm meandiv -nperm 1000 -permute gene_set -rnd_type no_balance -scoring_scheme weighted -rpt_label %s -metric Diff_of_Classes -sort real -order descending -include_only_symbols true -make_sets true -median false -num 100 -plot_top_x 20 -rnd_seed timestamp -save_rnd_lists false -set_max 500 -set_min 15 -zip_report false -out %s -gui false' % (gseaPath,gctPath,clsPath,gmxPath,rptLabel,gseaOutputFolder)
 
     gseaBashFile.write(gseaCmd_all)
     gseaBashFile.write('\n')
+    
+    if top != 'all':
+      #for top N
+      gctPath = '%s%s_top_%s.gct' % (outputFolder,analysisName,top)
+      clsPath = '%s%s_top_%s.cls' % (outputFolder,analysisName,top)
+      gseaOutputFolder = utils.formatFolder('%sgsea_top_%s_c2' % (outputFolder,top),True)
+      rptLabel = '%s_top_%s' % (analysisName,top)
 
-    #for top N
-    gctPath = '%s%s_top_%s.gct' % (outputFolder,analysisName,top)
-    clsPath = '%s%s_top_%s.cls' % (outputFolder,analysisName,top)
-    gseaOutputFolder = utils.formatFolder('%sgsea_top_%s_c2' % (outputFolder,top),True)
-    rptLabel = '%s_top_%s' % (analysisName,top)
-
-    gseaCmd_top = 'java -Xmx4000m -cp %s xtools.gsea.Gsea -res %s -cls %s#PROMOTER_versus_DISTAL -gmx %s -collapse false -mode Max_probe -norm meandiv -nperm 1000 -permute gene_set -rnd_type no_balance -scoring_scheme weighted -rpt_label %s -metric Diff_of_Classes -sort real -order descending -include_only_symbols true -make_sets true -median false -num 100 -plot_top_x 20 -rnd_seed timestamp -save_rnd_lists false -set_max 500 -set_min 15 -zip_report false -out %s -gui false' % (gseaPath,gctPath,clsPath,gmxPath,rptLabel,gseaOutputFolder)
+      gseaBashFile.write('rm -rf %s/%s.Gsea* \n' % (gseaOutputFolder, rptLabel))
+      gseaCmd_top = 'java -Xmx4000m -cp %s xtools.gsea.Gsea -res %s -cls %s#PROMOTER_versus_DISTAL -gmx %s -collapse false -mode Max_probe -norm meandiv -nperm 1000 -permute gene_set -rnd_type no_balance -scoring_scheme weighted -rpt_label %s -metric Diff_of_Classes -sort real -order descending -include_only_symbols true -make_sets true -median false -num 100 -plot_top_x 20 -rnd_seed timestamp -save_rnd_lists false -set_max 500 -set_min 15 -zip_report false -out %s -gui false' % (gseaPath,gctPath,clsPath,gmxPath,rptLabel,gseaOutputFolder)
 
-    gseaBashFile.write(gseaCmd_top)
-    gseaBashFile.write('\n')
+      gseaBashFile.write(gseaCmd_top)
+      gseaBashFile.write('\n')
 
     gseaBashFile.close()
     os.system('bash %s' % (gseaBashFilePath))
@@ -646,7 +653,7 @@ def detectGSEAOutput(analysisName,outputFolder,top):
     '''
     tries to detect the .xls files that show up when GSEA is done running
     '''
-
+    
     #first figure out the friggin output folder 
     gseaParentFolder = '%sgsea_top_%s_c2/' % (outputFolder,top)
 
@@ -656,7 +663,7 @@ def detectGSEAOutput(analysisName,outputFolder,top):
         
         candidateFolderList = [folder for folder in folderList if folder.count('%s_top_%s.Gsea' % (analysisName,top)) == 1]
         if len(candidateFolderList) > 1:
-            print('ERROR: MULTIPLE GSEA OUTPUT FOLDERS DETECTED FOR %s WITH TOP %s GENES' % (analysisName,string.upper(str(top))))
+            print('ERROR: MULTIPLE GSEA OUTPUT FOLDERS DETECTED FOR %s WITH TOP %s GENES' % (analysisName,string.upper(top)))
             sys.exit()
         elif len(candidateFolderList) == 0:
             time.sleep(10)
@@ -833,7 +840,8 @@ def main():
         genomeDirectory=args.genomeDirectory
         
         #important here to define the window
-        startDict,tssCollection = loadAnnotFile(genome,window,geneList,True)
+        startDict,tssCollection,geneList = loadAnnotFile(genome,window,geneList,True)
+        print('IDENTIFIED %s valid ACTIVE GENES' % (len(geneList)))
         print(len(startDict))
 
         #now we need to split the input region 
@@ -883,27 +891,27 @@ def main():
 
         geneTablePath = '%s%s_GENE_TABLE.txt' % (outputFolder,analysisName)
         utils.unParseTable(geneTable,geneTablePath,'\t')
-
+        
+        if(top > len(geneTable)):
+          top = 'all'
+        
         #now call the R code
         print('CALLING R PLOTTING SCRIPTS')
         callRWaterfall(geneTablePath,outputFolder,analysisName,top)
 
         #now let's call gsea
         print('RUNNING GSEA ON C2')
-        callGSEA(args.gseaPath, args.gmxPath, outputFolder,analysisName,top)        
-
+        callGSEA(args.gseaPath, args.gmxPath, outputFolder,analysisName,top)
         
-        print('DETECTING GSEA OUTPUT FOR TOP %s GENES' % (top))
-        #for top 
-        top_promoterTablePath,top_distalTablePath = detectGSEAOutput(analysisName,outputFolder,top)
-
-        print('MAKING NES PLOTS FOR TOP %s GENES' % (top))
-        callR_GSEA(top_promoterTablePath,top_distalTablePath,outputFolder,analysisName,top)
+        if top != 'all':
+          print('DETECTING GSEA OUTPUT FOR TOP %s GENES' % (top))
+          top_promoterTablePath,top_distalTablePath = detectGSEAOutput(analysisName,outputFolder,top)
 
+          print('MAKING NES PLOTS FOR TOP %s GENES' % (top))
+          callR_GSEA(top_promoterTablePath,top_distalTablePath,outputFolder,analysisName,top)
 
         print('DETECTING GSEA OUTPUT FOR ALL GENES')
-        #for top 
-        detectGSEAOutput(analysisName,outputFolder,'all')
+        top_promoterTablePath,top_distalTablePath = detectGSEAOutput(analysisName,outputFolder,'all')
 
         print('MAKING NES PLOTS FOR ALL GENES')
         callR_GSEA(top_promoterTablePath,top_distalTablePath,outputFolder,analysisName,'all')
diff --git a/enhancerPromoter_waterfall.R b/enhancerPromoter_waterfall.R
index d36ee6c..67fe9fb 100644
--- a/enhancerPromoter_waterfall.R
+++ b/enhancerPromoter_waterfall.R
@@ -12,7 +12,7 @@ print(args)
 geneTablePath = args[3]
 outputFolder = args[4]
 analysisName = args[5]
-top = as.numeric(args[6])
+top = args[6]
 
 #=========================================================
 #===========================DEBUG SECTION=================
@@ -311,8 +311,9 @@ geneTable = read.delim(geneTablePath,sep='\t')
 plotContribution(geneTable,analysisName,outputFolder)
 runWaterfall(geneTable,analysisName,outputFolder)
 
-
-#top N
-plotContribution(geneTable,analysisName,outputFolder,as.numeric(top)) 
-runWaterfall(geneTable,analysisName,outputFolder,as.numeric(top))
+if(top != 'all'){
+  #top N
+  plotContribution(geneTable,analysisName,outputFolder,as.numeric(top)) 
+  runWaterfall(geneTable,analysisName,outputFolder,as.numeric(top))
+}
 

From dbd49bb699ffc46f9221c7d44580d924bf4e005b Mon Sep 17 00:00:00 2001
From: Quanhu Sheng <shengqh@gmail.com>
Date: Wed, 12 Jul 2023 09:29:06 -0500
Subject: [PATCH 6/6] use genomeDirectory from arguments

---
 enhancerPromoter.py | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/enhancerPromoter.py b/enhancerPromoter.py
index c591a8d..faff2bd 100755
--- a/enhancerPromoter.py
+++ b/enhancerPromoter.py
@@ -101,7 +101,7 @@
 #================================================================================
 
 
-def loadAnnotFile(genome,tss_window,geneList=[],skip_cache=False):
+def loadAnnotFile(genome,genomeDirectory,tss_window,geneList=[],skip_cache=False):
     """
     load in the annotation and create a startDict and tss collection for a set of refseq IDs a given genome
     20170213, add by Quanhu Sheng
@@ -118,14 +118,6 @@ def loadAnnotFile(genome,tss_window,geneList=[],skip_cache=False):
         'HG38': 'annotation/hg38_refseq.ucsc',
         }
 
-    genomeDirectoryDict = {
-        'HG19':'/storage/cylin/grail/genomes/Homo_sapiens/UCSC/hg19/Sequence/Chromosomes/',
-        'RN6':'/storage/cylin/grail/genomes/Rattus_norvegicus/UCSC/rn6/Sequence/Chromosomes/',
-        'MM9':'/storage/cylin/grail/genomes/Mus_musculus/UCSC/mm9/Sequence/Chromosomes/',
-        'MM10':'/storage/cylin/grail/genomes/Mus_musculus/UCSC/mm10/Sequence/Chromosomes/',
-        'HG38': '/storage/cylin/grail/genomes/Homo_sapiens/UCSC/hg38/Sequence/Chromosomes/',
-        }
-
     mouse_convert_file = '%s/annotation/HMD_HumanPhenotype.rpt' % (whereAmI)
 
     #making a dictionary for mouse to human conversion
@@ -135,9 +127,6 @@ def loadAnnotFile(genome,tss_window,geneList=[],skip_cache=False):
     for line in mouse_convert_table:
         mouse_convert_dict[line[4]] = line[0]
             
-    genomeDirectory = genomeDirectoryDict[string.upper(genome)]
-
-
     #making a chrom_dict that is a list of all chroms with sequence
     chrom_list = utils.uniquify([name.split('.')[0] for name in os.listdir(genomeDirectory) if len(name) >0])
     
@@ -950,7 +939,7 @@ def main():
         
         #Quanhu Sheng, assign valid_genes to geneList, 20230712
         #important here to define the window
-        startDict,tssCollection,genomeDirectory,chrom_list,mouse_convert_dict,geneList = loadAnnotFile(genome,tss_window,geneList,True)
+        startDict,tssCollection,genomeDirectory,chrom_list,mouse_convert_dict,geneList = loadAnnotFile(genome,genomeDirectory,tss_window,geneList,True)
         #print(tssCollection.getOverlap(utils.Locus('chr5',171387630,171388066,'.')))
         #sys.exit()