-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathProcessSRA_hpcc2.py
More file actions
executable file
·301 lines (265 loc) · 11.4 KB
/
ProcessSRA_hpcc2.py
File metadata and controls
executable file
·301 lines (265 loc) · 11.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
## IMPORT
import sys
import os
## MAIN
print ('''
inp1 = .sra file
inp2 = bowtie index root name
inp3 = paired (1) or single reads (0) or paired processed as single (2)
Optional Inputs:
-trim_threads = threads to use when running Trimmomatic (note: the actual number of threads used will be 1 + this value)
-trim_adapter = adapter sequenecs to use with Trimmomatic
-trim_seed = seed size to use with adapter trimming
-trim_pclip = palidromic clipping to use with adapter trimming
-trim_sclip = simple clipping to used with adapter trimming
-trim_leading = quality value for hard clipping of leading bases
-trim_trailing = quality value for hard clipping of trailing bases
-trim_winsize = window size for soft clipping
-trim_winqual = average quality for soft clipping
-trim_minlen = minimum length of trimmed reads
-trim_crop = fixed number of bases to trim from the end of all reads
-trim_hcrop = fixed number of bases to trim form the front of all reads
-tophat_threads = number of threads to use with tophat
-min_intron = minimum intron size to use with tophat and cufflinks, default = 50
-max_intron = maximum intron size to use with tophat and cufflinks, default = 5000
-max_multihit = maximum number of mappings per read in tophat, default = 20
-filter_min_len = minimum read length post-trimming, default = 20
-filter_min_phred = minimum average read phred post-trimming, default = 20
Protocol:
fastq-dump: covert SRA -> fastq
Trimmomatic: trim fastq
Filter reads by length and min ave phred score
Bowtie: map reads
SAMTools: Convert bam file to sam file
Filter sam file to primary and unique reads
Required Modules:
- SRAToolkit (module load SRAToolkit)
- Trimmomatic (module load Trimmomatic)
- Tophat2 (module load TopHat2)
- Boost (module load Boost)
- SAMTools (module load SAMTools)
module load SRAToolkit; module load Trimmomatic; module load TopHat2; module load Boost; module load SAMTools
''')
## READ INPUT SRA
infile = os.path.abspath(sys.argv[1])
genome = sys.argv[2]
PE = int(sys.argv[3])
## PRESET VALUES
# out_dir = ""
# Trimmomatic Parameters-- Always used
# updating adapter_seq to include all universal adaptors, need to have these in wd
if PE == 1:
adapter_seq = "all_PE_adapters.fa"
elif PE == 2:
adapter_seq = "all_PE_adapters.fa"
else:
adapter_seq = "all_SE_adapters.fa"
trim_threads = 4
seed_mismatches = 2
palindrome_clip = 30
simple_clip = 10
leading = 3
trailing = 3
window_size = 4
window_quality = 20
# Trimmomatic Parameters-- Added when value is non-zero
trim_minlen = 0
trim_crop = 0
trim_headcrop = 0
# Default filtering thresholds
min_filter_len = 20
min_filter_phred = 20
# Tophat&Cufflinks Parameters
tophat_threads = 8
min_intron_size = 50
max_intron_size = 5000
max_multiHits = 20
## READ PARAMETERS
for i in range(len(sys.argv)):
# if sys.argv[i] == "-genome":
# genome = sys.argv[i+1]
if sys.argv[i] == "-gff":
gff = sys.argv[i+1]
if sys.argv[i] == "-trim_threads":
trim_threads = sys.argv[i+1]
if sys.argv[i] == "-trim_adapter":
adapter_seq = sys.argv[i+1]
if sys.argv[i] == "-trim_seed":
seed_mismatches = sys.argv[i+1]
if sys.argv[i] == "-trim_pclip":
palindrome_clip = sys.argv[i+1]
if sys.argv[i] == "-trim_sclip":
simple_clip = sys.argv[i+1]
if sys.argv[i] == "-trim_leading":
leading = sys.argv[i+1]
if sys.argv[i] == "-trim_trailing":
trailing = sys.argv[i+1]
if sys.argv[i] == "-trim_winsize":
window_size = sys.argv[i+1]
if sys.argv[i] == "-trim_winqual":
window_quality = sys.argv[i+1]
if sys.argv[i] == "-trim_minlen":
trim_minlen = sys.argv[i+1]
if sys.argv[i] == "-trim_crop":
trim_crop = sys.argv[i+1]
if sys.argv[i] == "-trim_hcrop":
trim_headcrop = sys.argv[i+1]
if sys.argv[i] == "-tophat_threads":
tophat_threads = sys.argv[i+1]
if sys.argv[i] == "-min_intron":
min_intron_size = sys.argv[i+1]
if sys.argv[i] == "-max_intron":
max_intron_size = sys.argv[i+1]
if sys.argv[i] == "-max_multihit":
max_multiHits = sys.argv[i+1]
if sys.argv[i] == "-filter_min_len":
min_filter_len = sys.argv[i+1]
if sys.argv[i] == "-filter_min_phred":
min_filter_phred = sys.argv[i+1]
# if sys.argv[i] == "-output_dir":
# out_dir = os.path.abspath(sys.argv[i+1])
## RUN COMMANDS
# Decompress SRA Files
f_sra = infile
print ("Dumping SRA file")
if PE == 1:
dump_command = "fastq-dump --split-files "+f_sra
os.system(dump_command)
elif PE == 2:
os.system("fastq-dump --split-files %s"%(f_sra))
elif PE == 0:
dump_command = "fastq-dump "+f_sra
os.system(dump_command)
# print dump_command
print ("First fastQC")
if PE == 1:
f_fastq = infile.split(".")[0]+"_1.fastq"
r_fastq = infile.split(".")[0]+"_2.fastq"
print (infile, f_fastq, r_fastq)
fastqc_command1 = "fastqc -f fastq "+ f_fastq
fastqc_command2 = "fastqc -f fastq "+ r_fastq
os.system(fastqc_command1)
os.system(fastqc_command2)
elif PE == 2:
name = infile.split(".")[0]
f_fastq = infile.split(".")[0]+"_1.fastq"
r_fastq = infile.split(".")[0]+"_2.fastq"
print (infile, f_fastq)
os.system("mv %s %s.fastq" %(f_fastq, name))
fastqc_command = "fastqc -f fastq "+name+".fastq"
os.system(fastqc_command)
os.system("rm %s"%r_fastq)
else:
f_fastq = infile.split(".")[0]+".fastq"
print (infile, f_fastq)
fastqc_command = "fastqc -f fastq "+ f_fastq
os.system(fastqc_command)
print ("Trimming reads")
# Trim Reads
if PE==1:
f_fastq = infile.split(".")[0]+"_1.fastq"
r_fastq = infile.split(".")[0]+"_2.fastq"
print (infile, f_fastq, r_fastq)
f_fastq_trimmedP = infile.split(".")[0]+"_1trimmed.fastq"
r_fastq_trimmedP = infile.split(".")[0]+"_2trimmed.fastq"
f_fastq_trimmedU = infile.split(".")[0]+"_1Utrimmed.fastq"
r_fastq_trimmedU = infile.split(".")[0]+"_2Utrimmed.fastq"
trimmomatic_command = "java -jar $TRIM/trimmomatic PE -threads %s %s %s %s %s %s %s " % (trim_threads,f_fastq,r_fastq, f_fastq_trimmedP, f_fastq_trimmedU, r_fastq_trimmedP, r_fastq_trimmedU)
trimmomatic_command = trimmomatic_command + "ILLUMINACLIP:/mnt/research/ShiuLab/16_RNAseq/adapters/%s:%s:%s:%s " % (adapter_seq,seed_mismatches,palindrome_clip,simple_clip)
trimmomatic_command = trimmomatic_command + "LEADING:%s TRAILING:%s SLIDINGWINDOW:%s:%s" % (leading,trailing,window_size,window_quality)
if not trim_minlen == 0:
trimmomatic_command = trimmomatic_command + " MINLEN:%s" % (trim_minlen)
if not trim_crop == 0:
trimmomatic_command = trimmomatic_command + " CROP:%s" % (trim_crop)
if not trim_headcrop == 0:
trimmomatic_command = trimmomatic_command + " HEADCROP:%s" % (trim_headcrop)
os.system(trimmomatic_command)
# print (" Deleting original fastq file:")
# print (" %s"%(f_fastq))
#os.system("rm %s"%(f_fastq))
#os.system("rm %s"%(r_fastq))
# print ("Filtering trimmed reads")
# # Filter trimmed reads
# filter_command1 = "python /mnt/home/lloydjo1/scripts/A_Small_Read_Processing/filter_fastq.py -i %s -min %s -ave %s"%(f_fastq_trimmedP,min_filter_len,min_filter_phred)
# filter_command2 = "python /mnt/home/lloydjo1/scripts/A_Small_Read_Processing/filter_fastq.py -i %s -min %s -ave %s"%(r_fastq_trimmedP,min_filter_len,min_filter_phred)
# # print filter_command
# os.system(filter_command1)
# os.system(filter_command2)
# # filtered_file = f_fastq_trimmed.replace("fastq","")+str(min_filter_len)+"_min_len."+str(min_filter_len)+"_min_phred.fastq"
filtered_file1 = f_fastq_trimmedP
filtered_file2 = r_fastq_trimmedP
# print (" Deleting trimmed fastq file:")
# print (" %s"%(f_fastq_trimmedP))
#os.system("rm %s"%(f_fastq_trimmedP))
#os.system("rm %s"%(r_fastq_trimmedP))
#os.system("rm %s"%(f_fastq_trimmedU))
#os.system("rm %s"%(r_fastq_trimmedU))
print ("Second fastQC")
f_fastq2 = f_fastq_trimmedP
r_fastq2 = r_fastq_trimmedP
fastqc_command1 = "fastqc -f fastq "+ f_fastq2
fastqc_command2 = "fastqc -f fastq "+ r_fastq2
os.system(fastqc_command1)
os.system(fastqc_command2)
print ("Running TopHat")
# Run tophat
f_tophat_file = infile.split(".")[0]+".sra_tophat" #naming file
# if out_dir != "":
# f_tophat_file = out_dir+"/"+f_tophat_file
tophat_command = "tophat2 -p %s -i %s -I %s -g %s -o %s %s %s %s" % (tophat_threads,min_intron_size,max_intron_size,max_multiHits,f_tophat_file,genome,filtered_file1, filtered_file2) #command
print (tophat_command)
os.system(tophat_command)
#print (" Deleting filtered fastq file:")
print (" %s"%(filtered_file1))
#os.system("rm %s"%(filtered_file))
print ("Converting BAM to SAM")
os.system("python /mnt/home/lloydjo1/scripts/A_Small_Read_Processing/bam2sam.py %s/accepted_hits.bam"%(f_tophat_file))
print ("Filtering SAM and get unique reads")
os.system("python /mnt/home/john3784/Github/RNAseq_pipeline/primary_and_unique_mapped_reads2.py %s/accepted_hits.sam %s"%(f_tophat_file, PE))
else:
f_fastq = infile.split(".")[0]+".fastq"
print (infile, f_fastq)
f_fastq_trimmed = infile.split(".")[0]+"trimmed.fastq"
trimmomatic_command = "java -jar $TRIM/trimmomatic SE -threads %s %s %s " % (trim_threads,f_fastq,f_fastq_trimmed)
trimmomatic_command = trimmomatic_command + "ILLUMINACLIP:/mnt/research/ShiuLab/16_RNAseq/adapters/%s:%s:%s:%s " % (adapter_seq,seed_mismatches,palindrome_clip,simple_clip)
trimmomatic_command = trimmomatic_command + "LEADING:%s TRAILING:%s SLIDINGWINDOW:%s:%s" % (leading,trailing,window_size,window_quality)
if not trim_minlen == 0:
trimmomatic_command = trimmomatic_command + " MINLEN:%s" % (trim_minlen)
if not trim_crop == 0:
trimmomatic_command = trimmomatic_command + " CROP:%s" % (trim_crop)
if not trim_headcrop == 0:
trimmomatic_command = trimmomatic_command + " HEADCROP:%s" % (trim_headcrop)
# print trimmomatic_command
os.system(trimmomatic_command)
# print (" Deleting original fastq file:")
# print (" %s"%(f_fastq))
# #os.system("rm %s"%(f_fastq))
# print ("Filtering trimmed reads")
# # Filter trimmed reads
# filter_command = "python /mnt/home/lloydjo1/scripts/A_Small_Read_Processing/filter_fastq.py -i %s -min %s -ave %s"%(f_fastq_trimmed,min_filter_len,min_filter_phred)
# # print filter_command
# os.system(filter_command)
# filtered_file = f_fastq_trimmed.replace("fastq","")+str(min_filter_len)+"_min_len."+str(min_filter_len)+"_min_phred.fastq"
filtered_file = f_fastq_trimmed
# print (" Deleting trimmed fastq file:")
# print (" %s"%(f_fastq_trimmed))
# #os.system("rm %s"%(f_fastq_trimmed))
print ("Second fastQC")
f_fastq2 = f_fastq_trimmed
fastqc_command = "fastqc -f fastq "+ f_fastq2
os.system(fastqc_command)
print ("Running TopHat")
# Run tophat
f_tophat_file = infile.split(".")[0]+".sra_tophat" #naming file
# if out_dir != "":
# f_tophat_file = out_dir+"/"+f_tophat_file
tophat_command = "tophat2 -p %s -i %s -I %s -g %s -o %s %s %s" % (tophat_threads,min_intron_size,max_intron_size,max_multiHits,f_tophat_file,genome,filtered_file) #command
print (tophat_command)
os.system(tophat_command)
#print (" Deleting filtered fastq file:")
print (" %s"%(filtered_file))
#os.system("rm %s"%(filtered_file))
print ("Converting BAM to SAM")
os.system("python /mnt/home/lloydjo1/scripts/A_Small_Read_Processing/bam2sam.py %s/accepted_hits.bam"%(f_tophat_file))
print ("Filtering SAM to primary and unique reads")
os.system("python /mnt/home/john3784/Github/RNAseq_pipeline/primary_and_unique_mapped_reads2.py %s/accepted_hits.sam %s"%(f_tophat_file, PE))