Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 40 additions & 31 deletions enhancerPromoter.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,7 @@

import sys
import os
print "Using python version %s" % sys.version

whereAmI = os.path.dirname(os.path.realpath(__file__))

#importing utils package
#add locations of files and global parameters in this section
Expand All @@ -66,7 +65,6 @@
import numpy
import re
import time
from distutils.spawn import find_executable
from collections import defaultdict


Expand All @@ -75,7 +73,6 @@
#================================================================================

#add locations of files and global parameters in this section
whereAmI = os.path.dirname(os.path.realpath(__file__))

pipeline_dir = whereAmI + '/'

Expand Down Expand Up @@ -104,9 +101,11 @@
#================================================================================


def loadAnnotFile(genome,tss_window,geneList=[],skip_cache=False):
def loadAnnotFile(genome,genomeDirectory,tss_window,geneList=[],skip_cache=False):
"""
load in the annotation and create a startDict and tss collection for a set of refseq IDs a given genome
20170213, add by Quanhu Sheng
return validGenes
"""
genomeDict = {
'HG18': 'annotation/hg18_refseq.ucsc',
Expand All @@ -119,14 +118,6 @@ def loadAnnotFile(genome,tss_window,geneList=[],skip_cache=False):
'HG38': 'annotation/hg38_refseq.ucsc',
}

genomeDirectoryDict = {
'HG19':'/storage/cylin/grail/genomes/Homo_sapiens/UCSC/hg19/Sequence/Chromosomes/',
'RN6':'/storage/cylin/grail/genomes/Rattus_norvegicus/UCSC/rn6/Sequence/Chromosomes/',
'MM9':'/storage/cylin/grail/genomes/Mus_musculus/UCSC/mm9/Sequence/Chromosomes/',
'MM10':'/storage/cylin/grail/genomes/Mus_musculus/UCSC/mm10/Sequence/Chromosomes/',
'HG38': '/storage/cylin/grail/genomes/Homo_sapiens/UCSC/hg38/Sequence/Chromosomes/',
}

mouse_convert_file = '%s/annotation/HMD_HumanPhenotype.rpt' % (whereAmI)

#making a dictionary for mouse to human conversion
Expand All @@ -136,15 +127,11 @@ def loadAnnotFile(genome,tss_window,geneList=[],skip_cache=False):
for line in mouse_convert_table:
mouse_convert_dict[line[4]] = line[0]

genomeDirectory = genomeDirectoryDict[string.upper(genome)]


#making a chrom_dict that is a list of all chroms with sequence
chrom_list = utils.uniquify([name.split('.')[0] for name in os.listdir(genomeDirectory) if len(name) >0])

annotFile = whereAmI + '/' + genomeDict[string.upper(genome)]


if not skip_cache:
# Try loading from a cache, if the crc32 matches
annotPathHash = zlib.crc32(annotFile) & 0xFFFFFFFF # hash the entire location of this script
Expand Down Expand Up @@ -175,8 +162,13 @@ def loadAnnotFile(genome,tss_window,geneList=[],skip_cache=False):
tssLoci =[]
if geneList==[]:
geneList = startDict.keys()
validGenes = []
for gene in geneList:
tssLoci.append(utils.makeTSSLocus(gene,startDict,tss_window,tss_window))
if gene in startDict:
tssLoci.append(utils.makeTSSLocus(gene,startDict,window,window))
validGenes.append(gene)
else:
print('\tWARNING: gene %s not in annotation database. Ignoring.' % gene)

tssCollection = utils.LocusCollection(tssLoci,50)

Expand All @@ -185,7 +177,7 @@ def loadAnnotFile(genome,tss_window,geneList=[],skip_cache=False):
with open(cache_file_path, 'wb') as cache_fh:
cPickle.dump((startDict, tssCollection), cache_fh, cPickle.HIGHEST_PROTOCOL)

return startDict, tssCollection, genomeDirectory, chrom_list, mouse_convert_dict
return startDict, tssCollection, genomeDirectory, chrom_list, mouse_convert_dict, validGenes



Expand All @@ -195,7 +187,6 @@ def splitRegions(inputGFF,tssCollection):

#if even a single coordinate is shared with the +/-1kb
splitGFF = []
debugCount = 0
for line in inputGFF:

chrom = line[0]
Expand Down Expand Up @@ -384,7 +375,7 @@ def makePeakTable(paramDict,splitGFFPath,averageTablePath,startDict,geneList,gen
signalTable = utils.parseTable(averageTablePath,'\t')

print('LOADING CPGS ISLANDS')
cpgBed = utils.parseTable(paramDict['cpgPath'],'\t')
cpgBed = utils.parseTable(bedFile,'\t')
cpgLoci = []
for line in cpgBed:
cpgLoci.append(utils.Locus(line[0],line[1],line[2],'.',line[-1]))
Expand All @@ -397,8 +388,9 @@ def makePeakTable(paramDict,splitGFFPath,averageTablePath,startDict,geneList,gen
tss_prox_loci = []
tss_distal_loci = []
for refID in geneList:
tss_prox_loci.append(utils.makeTSSLocus(refID,startDict,tss_window,tss_window))
tss_distal_loci.append(utils.makeTSSLocus(refID,startDict,distal_window,distal_window))
if refID in startDict:
tss_prox_loci.append(utils.makeTSSLocus(refID,startDict,tss_window,tss_window))
tss_distal_loci.append(utils.makeTSSLocus(refID,startDict,distal_window,distal_window))

#make a 1kb flanking and 50kb flanking collection
tss_prox_collection = utils.LocusCollection(tss_prox_loci,50)
Expand Down Expand Up @@ -680,10 +672,18 @@ def callGSEA(outputFolder,analysisName,top,analysis_type ='enhancer_vs_promoter'
gseaOutputFolder = utils.formatFolder('%sgsea_top_all_c2%s' % (outputFolder,suffix),True)
rptLabel = '%s_top_all%s' % (analysisName,suffix)

gseaBashFile.write('rm -rf %s/%s.Gsea* \n' % (gseaOutputFolder, rptLabel))
gseaCmd_all = 'java -Xmx4000m -cp %s xtools.gsea.Gsea -res %s -cls %s%s -gmx %s -collapse false -mode Max_probe -norm meandiv -nperm 1000 -permute gene_set -rnd_type no_balance -scoring_scheme weighted -rpt_label %s -metric Diff_of_Classes -sort real -order descending -include_only_symbols true -make_sets true -median false -num 100 -plot_top_x 20 -rnd_seed timestamp -save_rnd_lists false -set_max 500 -set_min 15 -zip_report false -out %s -gui false' % (gseaPath,gctPath,clsPath,analysis_dict[analysis_type][1],gmxPath,rptLabel,gseaOutputFolder)

gseaBashFile.write(gseaCmd_all)
gseaBashFile.write('\n')

if top != 'all':
#for top N
gctPath = '%s%s_top_%s.gct' % (outputFolder,analysisName,top)
clsPath = '%s%s_top_%s.cls' % (outputFolder,analysisName,top)
gseaOutputFolder = utils.formatFolder('%sgsea_top_%s_c2' % (outputFolder,top),True)
rptLabel = '%s_top_%s' % (analysisName,top)

if use_top:
#for top N
Expand All @@ -692,6 +692,7 @@ def callGSEA(outputFolder,analysisName,top,analysis_type ='enhancer_vs_promoter'
gseaOutputFolder = utils.formatFolder('%sgsea_top_%s_c2%s' % (outputFolder,top,suffix),True)
rptLabel = '%s_top_%s%s' % (analysisName,top,suffix)

gseaBashFile.write('rm -rf %s/%s.Gsea* \n' % (gseaOutputFolder, rptLabel))
gseaCmd_top = 'java -Xmx4000m -cp %s xtools.gsea.Gsea -res %s -cls %s%s -gmx %s -collapse false -mode Max_probe -norm meandiv -nperm 1000 -permute gene_set -rnd_type no_balance -scoring_scheme weighted -rpt_label %s -metric Diff_of_Classes -sort real -order descending -include_only_symbols true -make_sets true -median false -num 100 -plot_top_x 20 -rnd_seed timestamp -save_rnd_lists false -set_max 500 -set_min 15 -zip_report false -out %s -gui false' % (gseaPath,gctPath,clsPath,analysis_dict[analysis_type][1],gmxPath,rptLabel,gseaOutputFolder)

gseaBashFile.write(gseaCmd_top)
Expand Down Expand Up @@ -728,7 +729,7 @@ def detectGSEAOutput(analysisName,outputFolder,top,analysis_type ='enhancer_vs_p

candidateFolderList = [folder for folder in folderList if folder.count('%s_top_%s%s.Gsea' % (analysisName,top,suffix)) == 1]
if len(candidateFolderList) > 1:
print('ERROR: MULTIPLE GSEA OUTPUT FOLDERS DETECTED FOR %s WITH TOP %s GENES' % (analysisName,string.upper(str(top))))
print('ERROR: MULTIPLE GSEA OUTPUT FOLDERS DETECTED FOR %s WITH TOP %s GENES' % (analysisName,string.upper(top)))
sys.exit()
elif len(candidateFolderList) == 0:
time.sleep(10)
Expand Down Expand Up @@ -838,7 +839,14 @@ def main():
parser.add_argument("--tads", dest="tads", type=str,
help="Include a .bed of tad regions to restrict enhancer/gene association", required=False,default=None)


#add by Quanhu Sheng
parser.add_argument("--genomeDirectory", dest="genomeDirectory", type=str,
help="Enter the folder contains chromosome sequence in fasta format", required=True)
#gseaPath = '/usr/local/bin/gsea/gsea2-2.2.2.jar'
#gmxPath = '/grail/annotations/gsea/c2.all.v5.1.symbols.gmt' #C2 set
parser.add_argument("--gseaPath", dest="gseaPath", type=str, help="Enter GSEA jar file location", required=True)
parser.add_argument("--gmxPath", dest="gmxPath", type=str, help="Enter GSEA gmt file location, such as c2.all.v5.1.symbols.gmt", required=True)
parser.add_argument("--cpgPath", dest="cpgPath", type=str, help="Enter cpg coordinates in bed format", required=True)



Expand All @@ -858,7 +866,6 @@ def main():
#top analysis subset
top = args.top


#input genome
genome = args.genome.upper()
print('PERFORMING ANALYSIS ON %s GENOME BUILD' % (genome))
Expand Down Expand Up @@ -922,16 +929,17 @@ def main():
#check if tads are being invoked
if args.tads:
print('LOADING TAD LOCATIONS FROM %s' % (args.tads))
use_tads = True
tads_path = args.tads
else:
use_tads = False
tads_path = ''

print('LOADING ANNOTATION DATA FOR GENOME %s' % (genome))

genomeDirectory=args.genomeDirectory

#Quanhu Sheng, assign valid_genes to geneList, 20230712
#important here to define the window
startDict,tssCollection,genomeDirectory,chrom_list,mouse_convert_dict = loadAnnotFile(genome,tss_window,geneList,True)
startDict,tssCollection,genomeDirectory,chrom_list,mouse_convert_dict,geneList = loadAnnotFile(genome,genomeDirectory,tss_window,geneList,True)
#print(tssCollection.getOverlap(utils.Locus('chr5',171387630,171388066,'.')))
#sys.exit()

Expand Down Expand Up @@ -1054,10 +1062,11 @@ def main():
callR_GSEA(top_promoterTablePath,top_distalTablePath,outputFolder,analysisName+'_enhancer_vs_promoter',top)
callR_GSEA(top_signalTablePath,top_backgroundTablePath,outputFolder,analysisName+'_total_contribution',top)

print('MAKING NES PLOTS FOR TOP %s GENES' % (top))
callR_GSEA(top_promoterTablePath,top_distalTablePath,outputFolder,analysisName,top)

print('DETECTING GSEA OUTPUT FOR ALL GENES')
#for top
all_promoterTablePath,all_distalTablePath = detectGSEAOutput(analysisName,outputFolder,'all')
top_promoterTablePath,top_distalTablePath = detectGSEAOutput(analysisName,outputFolder,'all')

print('MAKING NES PLOTS FOR ALL GENES')
callR_GSEA(all_promoterTablePath,all_distalTablePath,outputFolder,analysisName,'all')
Expand Down
19 changes: 9 additions & 10 deletions enhancerPromoter_waterfall.R
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,7 @@ plotContribution <- function(geneTable,analysisName,outputFolder,top=0,nBins=100
if(top == 0){
top = nrow(geneTable)
topString = 'all'
}
else if(top > nrow(geneTable)){
}else if(top > nrow(geneTable)){
top = nrow(geneTable)
topString = 'all'
}else{
Expand Down Expand Up @@ -127,11 +126,11 @@ runWaterfall <- function(geneTable,analysisName,outputFolder,top=0,geneList = c(
if(top == 0){
top = nrow(geneTable)
topString = 'all'
}else if(top > nrow(geneTable)){
top = nrow(geneTable)
topString = 'all'
}else{
topString = as.character(top)
}else {
topString = as.character(top)
if(top > nrow(geneTable)){
top = nrow(geneTable)
}
}
#get the signal for tss, distal and total
promoterSignal =geneTable[,2]
Expand Down Expand Up @@ -342,9 +341,9 @@ geneTable = read.delim(geneTablePath,sep='\t')
plotContribution(geneTable,analysisName,outputFolder)
runWaterfall(geneTable,analysisName,outputFolder)


#top N
print('working on top genes')
print(top)
plotContribution(geneTable,analysisName,outputFolder,as.numeric(top))
runWaterfall(geneTable,analysisName,outputFolder,as.numeric(top))
plotContribution(geneTable,analysisName,outputFolder,top)
runWaterfall(geneTable,analysisName,outputFolder,top)