forked from peipeiwang6/RNA-seq_data_processing
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy path01_RNA-seq_processing.py
More file actions
100 lines (87 loc) · 5.99 KB
/
01_RNA-seq_processing.py
File metadata and controls
100 lines (87 loc) · 5.99 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import sys,os,argparse
import pandas as pd
import numpy as np
def warn(*args, **kwargs):
pass
import warnings
warnings.warn = warn
def main():
parser = argparse.ArgumentParser(description='This code is for RNA-seq data processing')
# Required
req_group = parser.add_argument_group(title='REQUIRED INPUT')
req_group.add_argument('-SRA', help='SRA sample ID', required=True)
req_group.add_argument('-genome_seq', help='genome sequence fasta file', required=True)
req_group.add_argument('-gff', help='gff3 file', required=True)
req_group.add_argument('-layout', help='PE: paired end, SE: single end', required=True)
req_group.add_argument('-workdir', help='the path to your workdir', required=True)
req_group.add_argument('-base', help='base name for genome index', required=True)
# optional
inp_group = parser.add_argument_group(title='OPTIONAL INPUT')
inp_group.add_argument('-trim', help='trim the reads or not, y or n',default='n')
inp_group.add_argument('-threads', help='how many threads to be used',default=4)
inp_group.add_argument('-adapters', help='adapter sequences to be trimmed',default='None')
inp_group.add_argument('-seedMis', help='seed mismatches: specifies the maximum mismatch count which will still allow a full match to be performed',default=2)
inp_group.add_argument('-pClipThres', help='palindromeClipThreshold: specifies how accurate the match between the two adapter ligated reads must be for PE palindrome read alignment.',default=30)
inp_group.add_argument('-sClipThres', help='simpleClipThreshold: specifies how accurate the match between any adapter etc. sequence must be against a read.',default=10)
inp_group.add_argument('-LEADING', help='Specifies the minimum quality required to keep a base',default=3)
inp_group.add_argument('-TRAILING', help='Specifies the minimum quality required to keep a base.',default=3)
inp_group.add_argument('-windowSize', help='specifies the number of bases to average across',default=4)
inp_group.add_argument('-requiredQuality', help='specifies the average quality required',default=20)
inp_group.add_argument('-MINLEN', help='Specifies the minimum length of reads to be kept.',default=36)
inp_group.add_argument('-time', help='hours asked for job running',default=4)
inp_group.add_argument('-ntasks', help='ntasks asked for job running',default=1)
inp_group.add_argument('-cpus', help='cpus-per-task asked for job running',default=8)
inp_group.add_argument('-mem', help='mem asked for job running',default=20)
if len(sys.argv)==1:
parser.print_help()
sys.exit(0)
args = parser.parse_args()
SRA = args.SRA
slum_code = open('Hisat_%s.sh'%SRA, 'w')
slum_code.write('#!/bin/sh --login\n#SBATCH --time=%s:00:00\n#SBATCH --ntasks=%s\n#SBATCH --cpus-per-task=%s\n#SBATCH --mem=%sG\n#SBATCH --job-name Hisat_%s.sh\n#SBATCH -e Hisat_%s.sh.e\n#SBATCH -o Hisat_%s.sh.o\ncd %s\n'%(args.time,args.ntasks,args.cpus,args.mem,SRA,SRA,SRA,args.workdir))
slum_code.write('module load SRA-Toolkit FastQC Trimmomatic hisat2 SAMtools GNU/7.3.0-2.30 OpenMPI/3.1.1-CUDA HTSeq\n')
# # build the index for the genome
# slum_code.write('hisat2-build %s %s_gi\n'%(args.genome_seq,args.genome_seq))
if args.layout == 'PE':
slum_code.write('fastq-dump --split-files %s\n'%SRA)
if args.trim == 'y':
# check the quality before trimming
slum_code.write('fastqc -f fastq %s_1.fastq\n'%SRA)
slum_code.write('fastqc -f fastq %s_2.fastq\n'%SRA)
slum_code.write('java -jar /opt/software/Trimmomatic/0.36-Java-1.8.0_92/trimmomatic-0.36.jar PE -threads %s %s_1.fastq %s_2.fastq %s_1.trimP %s_1.trimU %s_2.trimP %s_2.trimU ILLUMINACLIP:%s:%s:%s:%s LEADING:%s TRAILING:%s SLIDINGWINDOW:%s:%s MINLEN:%s\n'%(args.threads, SRA, SRA, SRA, SRA, SRA, SRA, args.adapters, args.seedMis, args.pClipThres, args.sClipThres, args.LEADING,args.TRAILING, args.windowSize, args.requiredQuality, args.MINLEN))
# check the quality after trimming
slum_code.write('fastqc -f fastq %s_1.trimP\n'%SRA)
slum_code.write('fastqc -f fastq %s_2.trimP\n'%SRA)
# map the reads to the genome
slum_code.write('hisat2 -p 4 --dta -x %s -1 %s_1.trimP -2 %s_2.trimP -S %s.sam\n'%(args.base, SRA, SRA, SRA))
else:
# map the reads to the genome
slum_code.write('hisat2 -p 4 --dta -x %s -1 %s_1.trimP -2 %s_2.trimP -S %s.sam\n'%(args.base, SRA, SRA, SRA))
# sort the sam file
slum_code.write('samtools sort -n -O sam, %s.sam -o %s_sorted.sam\n'%(SRA, SRA))
# get uniquely mapped reads
slum_code.write('python 02_keep_reads_with_quality_60_and_unique_mapping.py %s_sorted.sam\n'%(SRA))
# get read counts
slum_code.write('htseq-count --format=sam -m union -s no -t gene -i ID %s_sorted_quality_60_unique.sam %s > HTSeqCount_%s.out\n'%(SRA, args.gff, SRA))
if args.layout == 'SE':
slum_code.write('fastq-dump %s\n'%SRA)
if args.trim == 'y':
# check the quality before trimming
slum_code.write('fastqc -f fastq %s.fastq\n'%SRA)
slum_code.write('java -jar /opt/software/Trimmomatic/0.36-Java-1.8.0_92/trimmomatic-0.36.jar SE -threads %s %s.fastq %s.trim ILLUMINACLIP:%s:%s:%s:%s LEADING:%s TRAILING:%s SLIDINGWINDOW:%s:%s MINLEN:%s\n'%(args.threads, SRA, SRA, args.adapters, args.seedMis, args.pClipThres, args.sClipThres, args.LEADING,args.TRAILING, args.windowSize, args.requiredQuality, args.MINLEN))
# check the quality after trimming
slum_code.write('fastqc -f fastq %s.trim\n'%SRA)
# map the reads to the genome
slum_code.write('hisat2 -p 4 --dta -x %s -U %s.trim -S %s.sam\n'%(args.base, SRA, SRA))
else:
# map the reads to the genome
slum_code.write('hisat2 -p 4 --dta -x %s -U %s.trim -S %s.sam\n'%(args.base, SRA, SRA))
# sort the sam file
slum_code.write('samtools sort -n -O sam, %s.sam -o %s_sorted.sam\n'%(SRA, SRA))
# get uniquely mapped reads
slum_code.write('python 02_keep_reads_with_quality_60_and_unique_mapping_SE.py %s_sorted.sam\n'%(SRA))
# get read counts
slum_code.write('htseq-count --format=sam -m union -s no -t gene -i ID %s_sorted_quality_60_unique.sam %s > HTSeqCount_%s.out\n'%(SRA, args.gff, SRA))
slum_code.close()
if __name__ == '__main__':
main()