diff --git a/.DS_Store b/.DS_Store
deleted file mode 100644
index f376dca..0000000
Binary files a/.DS_Store and /dev/null differ
diff --git a/README.md b/README.md
index f6b758c..9a13281 100644
--- a/README.md
+++ b/README.md
@@ -112,7 +112,7 @@ Notes:
* The trimming and quality control checking of the fastq files for each sample was performed separately to the pipeline.
* For most steps, the number of threads used can be specified with the `-t` flag. I usually utilised 8 or 16, depending on the state of the cluster.
* Before starting the pipeline, I created a `HUPANdatabases` directory that contains all the necessary databases, reference sequences and annotations needed for the pipeline. This directory contains:
-  * The human reference genome fasta file saved in `ref/ref.fa` (downloaded from https://gatk.broadinstitute.org/hc/en-us/articles/360035890951-Human-genome-reference-builds-GRCh38-or-hg38-b37-hg19)
+  * The human reference genome fasta file saved in `ref/ref.genome.fa` (downloaded from https://gatk.broadinstitute.org/hc/en-us/articles/360035890951-Human-genome-reference-builds-GRCh38-or-hg38-b37-hg19)
  * The human reference genome transcript file saved in `ref/ref.transcripts.fa` (downloaded from https://www.gencodegenes.org/human/releases.html, release 35)
  * The human reference genome annotation GTF file saved in `ref/ref.genes.gtf` (downloaded from https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_38/gencode.v38.chr_patch_hapl_scaff.annotation.gtf.gz, release 38)
  * NCBI's BLAST non-redundant nucleotide database saved as `/blastindex/data/blastdb.nal`, downloaded and set up as follows:
@@ -131,104 +131,112 @@ Notes:

**(1) *De novo* assembly of individual genomes**

-This step assembles the fastq files of each genome into a whole genome using SGA. The assembled sequences are saved in a directory called 01_assembled.
+This step assembles the fastq files of each genome into a whole-genome assembly using SGA. The assembled sequences are saved in a directory called `01_assembled`.

This step has been changed from the original HUPAN process; a Nextflow pipeline is now used, developed by Gerrit Botha and myself. The Nextflow pipeline repository can be found at: https://github.com/h3abionet/dec-2020-hackathon-stream1/tree/main/assembly

**(2) Aligning samples to the reference genome**

* This step aligns each assembled genome to the reference genome using the nucmer tool in MUMmer.
-* The aligned sequences are saved in a folder called 02_aligned.
-* /cbio/bin is the path to the MUMmer, nucmer and show-cords executable files, which all run through a MUMmer Singularity container.
+* The aligned sequences are saved in a folder called `02_aligned`.
+* `/cbio/bin` is the path to the MUMmer, nucmer and show-coords executable files, which all run through a MUMmer Singularity container.
```
-hupanSLURM alignContig -t 16 01_assembled 02_aligned /cbio/bin /cbio/projects/015/HUPANdatabases/ref/ref.fa
+hupanSLURM alignContig -t 16 01_assembled 02_aligned /cbio/bin /cbio/projects/015/HUPANdatabases/ref/ref.genome.fa
```
-* This will result in .delta and .coords files for each sample.
-* hupanSLURM alignContig code is found in `/cbio/soft/HUPAN/lib/HUPANrmHighSLURM.pm`
+* This will result in `.delta` and `.coords` files for each sample.
+* hupanSLURM alignContig code is found in `/cbio/projects/015/HUPANdatabases/HUPAN/lib/HUPANrmHighSLURM.pm`
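+* If you want to eyeball one sample's alignment by hand, the `.delta` file can be rendered as a readable table with show-coords. A minimal sketch (the sample name and the exact directory layout under `02_aligned` are illustrative assumptions):
+```
+# -r sorts by reference position; -c adds coverage columns; -l adds sequence lengths
+/cbio/bin/show-coords -rcl 02_aligned/data/sample1/sample1.delta | less
+```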

**(3) Extracting contigs with a low similarity to the reference genome**

* This step identifies and discards all contigs that align to the reference genome with > 95% identity over > 95% of the contig length.
-* The sequences with low similarity (and that are potentially non-reference sequences) are saved in a folder called 03_candidate.
-* The final argument (02_aligned) is the path to the MUMmer/nucmer results, created in the previous step.
+* The sequences with low similarity (and that are potentially non-reference sequences) are saved in a folder called `03_candidate`.
+* The final argument (`02_aligned`) is the path to the MUMmer/nucmer results, created in the previous step.
* This step is run interactively and produces output in real time, so first run `screen` and then use an interactive node: `srun --nodes=1 --ntasks=1 --mem=50g -t 24:00:00 --pty bash`
```
hupanSLURM extractSeq 01_assembled 03_candidate 02_aligned
```
-* This will result in a .candidate.unaligned.contig file for each sample.
-* hupanSLURM extractSeq code is found in `/cbio/soft/HUPAN/lib/HUPANrmHighSLURM.pm`
+* This will result in a `.candidate.unaligned.contig` file for each sample.
+* hupanSLURM extractSeq code is found in `/cbio/projects/015/HUPANdatabases/HUPAN/lib/HUPANrmHighSLURM.pm`

**(4) Assessing the candidate sequences using QUAST**

* This step maps the assembled contigs in the candidate folder against the reference and checks the statistics of these contigs using QUAST.
-* The QUAST results of the candidate contigs are saved in a folder called 04_quastresult.
-* /cbio/bin if where the QUAST exectuable is located, which runs a QUAST Singularity container.
+* The QUAST results of the candidate contigs are saved in a folder called `04_quastresult`.
+* `/cbio/projects/015/HUPANdatabases/images/` is where the QUAST executable is located, which runs a QUAST Singularity container.
```
-hupanSLURM assemSta -t 16 03_candidate/data 04_quastresult /cbio/bin/ /cbio/projects/015/HUPANdatabases/ref/ref.fa
+hupanSLURM assemSta -t 16 03_candidate/data 04_quastresult /cbio/projects/015/HUPANdatabases/images/ /cbio/projects/015/HUPANdatabases/ref/ref.genome.fa
+```
+hupanSLURM assemSta code is found in `/cbio/projects/015/HUPANdatabases/HUPAN/lib/HUPANassemStaSLURM.pm`
+
+* Once this fourth step is finished running, the output needs to be changed slightly (due to the change of aligner in the later QUAST version).
+* This can be performed using the `fixQuastOutput.py` script located in `/cbio/projects/015/HUPANdatabases/`.
+* Run this script using the command below and follow the prompts to input the working directory and the population directory names in that working directory.
+* This step is run interactively and produces output in real time, so first run `screen` and then use an interactive node: `srun --nodes=1 --ntasks=1 -t 6:00:00 --pty bash`
+```
+python /cbio/projects/015/HUPANdatabases/fixQuastOutput.py
```
-hupanSLURM assemSta code is found in `/cbio/soft/HUPAN/lib/HUPANassemStaSLURM.pm`

**(5) Collecting unaligned (both fully and partially) contigs**

-* This steps collects all unaligned (both fully and partially) contigs from the 03_candidate folder using information from the 04_quastresult folder.
-* The final unaligned sequences will be saved in folder called 05_unaligned.
-* The HUPAN notes on GitHub say to use -p to specify .contig suffix, but the actual .pm file says to use -s.
+* This step collects all unaligned (both fully and partially) contigs from the `03_candidate` folder using information from the `04_quastresult` folder.
+* The final unaligned sequences will be saved in a folder called `05_unaligned`.
+* The HUPAN notes on GitHub say to use `-p` to specify the `.contig` suffix, but the actual .pm file says to use `-s`.
```
hupanSLURM getUnalnCtg -s .contig 03_candidate/data 04_quastresult/data 05_unaligned
```
-This will result in .fully.contig, partially.contig and partially.coords files for each sample.
-hupanSLURM getUnalnCtg code is found in `/cbio/soft/HUPAN/lib/HUPANunalnCtgsSLURM.pm`
+This will result in `.fully.contig`, `.partially.contig` and `.partially.coords` files for each sample.
+hupanSLURM getUnalnCtg code is found in `/cbio/projects/015/HUPANdatabases/HUPAN/lib/HUPANunalnCtgsSLURM.pm`

**(6) Merging all the unaligned contigs**

-* This step merges all the unaligned contigs in 05_unaligned.
-* The resulting files will be saved in the folder 06_mergeunaligned.
+* This step merges all the unaligned contigs in `05_unaligned`.
+* The resulting files will be saved in the folder `06_mergeunaligned`.
* Partially and fully unaligned contigs are still kept separate, and will each result in one file.
```
hupanSLURM mergeUnalnCtg 05_unaligned/data 06_mergeunaligned
```
This will result in total.fully.fa, total.partilly.coords and total.partilly.fa files with all the merged data.
-hupanSLURM mergeUnalnCtg code is found in `/cbio/soft/HUPAN/lib/HUPANunalnCtgsSLURM.pm`
+hupanSLURM mergeUnalnCtg code is found in `/cbio/projects/015/HUPANdatabases/HUPAN/lib/HUPANunalnCtgsSLURM.pm`

**(7) Removing redundancy**

* This step removes redundancy (i.e. similar sequences) using CD-HIT.
* It does this by forming 'clusters' of similar contigs at a defined identity threshold and choosing the longest contig in that cluster as the representative contig.
* The command must be performed twice; once for fully unaligned sequences, and once for partially unaligned sequences.
-* They must be run in a separate directory, called 07_rmredundant. The restults will then be stored in directories within this directory.
-* The default % identity for clustering using CD-HIT in HUPAN is 90%, but you can change it by using the -c flag, e.g. `-c 0.85`
-* /cbio/bin is the location of the cd-hit-est exectuable, which runs a Singularity container.
+* They must be run in a separate directory, called `07_rmredundant`. The results will then be stored in directories within this directory.
+* The default % identity for clustering using CD-HIT in HUPAN is 90%, but you can change it by using the `-c` flag, e.g. `-c 0.85`
+* `/cbio/bin` is the location of the cd-hit-est executable, which runs a Singularity container.
```
mkdir 07_rmredundant && cd 07_rmredundant
hupanSLURM rmRedundant cdhitCluster -t 8 ../06_mergeunaligned/total.fully.fa rmredundant.fully /cbio/bin
hupanSLURM rmRedundant cdhitCluster -t 8 ../06_mergeunaligned/total.partilly.fa rmredundant.partially /cbio/bin/
```
-* This will result in cluster_info.txt and non-redundant.fa files each for both fully and partially unaligned sequences.
+* This will result in `cluster_info.txt` and `non-redundant.fa` files for both the fully and partially unaligned sequences.
+* hupanSLURM rmRedundant code is found in `/cbio/projects/015/HUPANdatabases/HUPAN/lib/HUPANrmRdtSLURM.pm`
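+* To get a feel for the clustering, you can count the representative contigs CD-HIT kept and peek at the cluster table (purely an inspection aid; the exact layout of `cluster_info.txt` is HUPAN-specific):
+```
+grep -c '^>' 07_rmredundant/rmredundant.fully/non-redundant.fa
+head 07_rmredundant/rmredundant.fully/cluster_info.txt
+```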

**(8) Identifying contamination in the unaligned sequences**

* This step identifies contamination by comparing all unaligned sequences to the BLAST nucleotide database.
* The database has been previously downloaded and set up according to HUPAN GitHub instructions (found above).
-* The aligned results will be saved in a directory called 08_blastresult. Within this directory, there will be a rmredundant.fully and a rmredundant.partially directory.
-* /cbio/bin is where the BLAST executable file is found.
+* The aligned results will be saved in a directory called `08_blastresult`. Within this directory, there will be a `rmredundant.fully` and a `rmredundant.partially` directory.
+* `/cbio/bin` is where the BLAST executable file is found.
```
hupanSLURM blastAlign blast -t 16 07_rmredundant 08_blastresult /cbio/projects/015/HUPANdatabases/blastindex/data/blastdb /cbio/bin
```
-* This will result in non-redundan.blast files.
-* hupanSLURM blastAlign code is found in `/cbio/soft/HUPAN/lib/HUPANblastAlignSLURM.pm`
+* This will result in `non-redundan.blast` files.
+* hupanSLURM blastAlign code is found in `/cbio/projects/015/HUPANdatabases/HUPAN/lib/HUPANblastAlignSLURM.pm`

**(9) Obtaining the taxonomic classifications of the sequences**

* This step looks at all the BLAST hits and classifies them into their correct taxonomies (if there are any).
* It uses two databases downloaded from NCBI: a taxonomy database, and a database that converts accession numbers to taxonomic IDs. Both have already been downloaded and set up (instructions found above).
* The command has to be run twice, once each for partially and fully unaligned sequences.
-* Each command will create its own directory; 09.1_taxclassfully for fully unaligned reads and 09.2_taxclasspartially for partially unaligned reads.
+* Each command will create its own directory; `09.1_taxclassfully` for fully unaligned reads and `09.2_taxclasspartially` for partially unaligned reads.
```
hupanSLURM getTaxClass 08_blastresult/data/rmredundant.fully/non-redundan.blast /cbio/projects/015/HUPANdatabases/taxonomyinfo 09.1_taxclassfully
hupanSLURM getTaxClass 08_blastresult/data/rmredundant.partially/non-redundan.blast /cbio/projects/015/HUPANdatabases/taxonomyinfo 09.2_taxclasspartially
```
-* hupanSLURM getTaxClass code is found in `/cbio/soft/HUPAN/lib/HUPANgetTaxClassSLURM.pm`
+* hupanSLURM getTaxClass code is found in `/cbio/projects/015/HUPANdatabases/HUPAN/lib/HUPANgetTaxClassSLURM.pm`

**(10) Removing contaminating sequences**

@@ -241,158 +249,191 @@ hupanSLURM getTaxClass 08_blastresult/data/rmredundant.partially/non-redundan.bl
hupanSLURM rmCtm -i 60 07_rmredundant/rmredundant.fully/non-redundant.fa 08_blastresult/data/rmredundant.fully/non-redundan.blast 09.1_taxclassfully/accession.name 10.1_rmctmfully
hupanSLURM rmCtm -i 60 07_rmredundant/rmredundant.partially/non-redundant.fa 08_blastresult/data/rmredundant.partially/non-redundan.blast 09.2_taxclasspartially/accession.name 10.2_rmctmpartially
```
-* hupanSLURM rmCtm code is found in `/cbio/soft/HUPAN/lib/HUPANrmContaminateSLURM.pm`
+* hupanSLURM rmCtm code is found in `/cbio/projects/015/HUPANdatabases/HUPAN/lib/HUPANrmContaminateSLURM.pm`
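+* As a quick sanity check, you can compare sequence counts before and after contamination removal (the `novel_sequence.fa` files below are the ones consumed by step 11):
+```
+for f in 07_rmredundant/rmredundant.fully/non-redundant.fa 10.1_rmctmfully/novel_sequence.fa; do
+  echo "$f: $(grep -c '^>' "$f") sequences"
+done
+```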

**(11) Combining fully and partially unaligned reads**

* This step removes redundancy again by merging the final fully and partially unaligned sequences, again using CD-HIT.
* A directory must first be made and then the fully and partially unaligned sequences combined and moved into this directory. This is done in the first command below.
-* This combined file is then run through CD-HIT to remove redundancy (at 90% identity) and the final non-redundant non-reference contigs are saved in 12_finalpangenome.
-* /cbio/bin is the location of the CD-HIT executable.
+* This combined file is then run through CD-HIT to remove redundancy (at 90% identity) and the final non-redundant non-reference contigs are saved in `12_finalpangenome`.
+* `/cbio/bin` is the location of the CD-HIT executable.
```
mkdir 11_nonreference && cat 10.1_rmctmfully/novel_sequence.fa 10.2_rmctmpartially/novel_sequence.fa > 11_nonreference/nonrefernce.before.fa
hupanSLURM rmRedundant cdhitCluster -t 8 11_nonreference/nonrefernce.before.fa 12_finalpangenome /cbio/bin/
```
-* This will result in a tmp directory and cluster_info.txt and non-redundant.fa files.
-* hupanSLURM rmRedundant code is found in `/cbio/soft/HUPAN/lib/HUPANrmRdtSLURM.pm`
+* This will result in a `tmp` directory and `cluster_info.txt` and `non-redundant.fa` files.
+* hupanSLURM rmRedundant code is found in `/cbio/projects/015/HUPANdatabases/HUPAN/lib/HUPANrmRdtSLURM.pm`

**(12) Merging the different populations**

**At this point, each population has its own non-redundant non-reference sequences. Now we can combine them and remove redundancy.**
**By doing this, we are ending up with the same sequences we would have had if we had run all the populations together, but now we have the population-specific information separated and can easily perform downstream or upstream analysis on population differences.**

-* To do this, use `cat` to combine all the sequences into one file, and then use CD-HIT to remove redundancy at 90%.
+* To do this, use `cat` to combine all the sequences into one file, and then use CD-HIT to remove redundancy at 95%.
* The population directories should all be in the same location and can therefore be accessed using `*`.
-* The final non-reference sequences are saved in allpopulations/
+* The final non-reference sequences are saved in `allpopulations/`
```
-cat */12_finalpangenome/non-redundant.fa > ../allpopulations/11_nonreference/all.nonrefernce.before.fa
cd ../allpopulations
-hupanSLURM rmRedundant cdhitCluster -t 8 11_nonreference/all.nonrefernce.before.fa 12_finalpangenome /cbio/bin/
+cat */12_finalpangenome/non-redundant.fa > ../allpopulations/11_nonreference/all.nonreference.before.fa
cd ../allpopulations
+hupanSLURM rmRedundant cdhitCluster -t 8 -c 0.95 11_nonreference/all.nonreference.before.fa 12_finalpangenome /cbio/bin/
```
* This will result in the final non-redundant non-reference sequence file in `allpopulations/12_finalpangenome/non-redundant.fa` for all populations combined.
-* All steps from this point are performed in the `allpopulations` directory.
+* All steps from this point are performed in the `allpopulations/` directory.

**(13) Splitting the sequences into smaller files**

* This step splits the non-redundant non-reference sequences into smaller files so that MAKER can handle them more easily and quickly.
* It takes the fasta file and saves it as smaller files in the second directory given.
-* It splits the fasta file into files that contain ~6000 contigs.
-* The output directory is 13_genepredinput.
+* It splits the fasta file into files that contain roughly 2 Mbp, set using the `-m 2000000` flag.
+* The output directory is `13_genepredinput`.
```
-hupanSLURM splitSeq 12_finalpangenome/non-redundant.fa 13_genepredinput
+hupanSLURM splitSeq -m 2000000 12_finalpangenome/non-redundant.fa 13_genepredinput
```
-* This will result in a number of part*.fa files, each consisting of around 6000 sequences
-* The number of files will depend on how many sequences were in the non-redundant.fa file.
-* hupanSLURM splitSeq code is found in `/cbio/soft/HUPAN/lib/HUPANsplitSeqSLURM.pm`
+* This will result in a number of `part*.fa` files.
+* The number of files will depend on how many sequences were in the `non-redundant.fa` file.
+* hupanSLURM splitSeq code is found in `/cbio/projects/015/HUPANdatabases/HUPAN/lib/HUPANsplitSeqSLURM.pm`
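+* A quick way to confirm the split looks sensible is to count the contigs per part (this assumes standard FASTA headers and that the parts land directly in `13_genepredinput`):
+```
+for f in 13_genepredinput/part*.fa; do
+  echo "$f: $(grep -c '^>' "$f") contigs"
+done
+```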

**(14) Predicting genes from the non-reference sequences**

* This step takes the split up non-reference sequence files and inputs them into MAKER.
* The config is the MAKER configuration file where we have specified all of the variables and datasets to use. A copy of this config file can be found in this repository.
-* /cbio/bin is the location of the MAKER executable, which runs a MAKER Singularity container.
+* `/cbio/bin` is the location of the MAKER executable, which runs a MAKER Singularity container.
```
-hupanSLURM genePre -t 16 13_genepredinput 14_genepred /cbio/projects/008/jess/HUPANrun/AfricanPanGenome/config /cbio/bin
+hupanSLURM genePre -t 16 13_genepredinput 14_genepred /cbio/projects/015/HUPANrun/AfricanPanGenome/RefGraphMAKERconfig /cbio/bin
```
-* This will result in various MAKER predictions for each part*.fa file.
-* hupanSLURM genePre code is found in `/cbio/soft/HUPAN/lib/HUPANgenePreSLURM.pm`
+* This will result in various MAKER predictions for each `part*.fa` file.
+* hupanSLURM genePre code is found in `/cbio/projects/015/HUPANdatabases/HUPAN/lib/HUPANgenePreSLURM.pm`

**(15) Merging the novel predictions**

* This step merges the novel predictions from the different parts.
-* It akes in the directory where all the different parts' results are (14_genepred) and places them into single files in 15_genepredmerge.
-* /cbio/bin is where the MAKER executable is.
+* It takes in the directory where all the different parts' results are (`14_genepred`) and places them into single files in `15_genepredmerge`.
+* `/cbio/bin` is where the MAKER executable is.
+* This step is run interactively and produces output in real time (takes about 10 minutes), so first run `screen` and then use a minimal interactive node: `srun --pty bash`
```
hupanSLURM mergeNovGene 14_genepred/result/ 15_genepredmerge /cbio/bin
```
-* This will result in all.maker.map, combine.all.maker.gff, combine.all.maker.proteins.fasta and combine.all.maker.transcripts.fasta files.
-* hupanSLURM mergeNovGene code is found in `/cbio/soft/HUPAN/lib/HUPANmergeNovGene.pm`
+* This will result in `all.maker.map`, `combine.all.maker.gff`, `combine.all.maker.proteins.fasta` and `combine.all.maker.transcripts.fasta` files.
+* hupanSLURM mergeNovGene code is found in `/cbio/projects/015/HUPANdatabases/HUPAN/lib/HUPANmergeNovGene.pm`

**(16) Filtering the gene predictions**

* This step filters and refines the novel predicted genes based on HUPAN specifications. These specifications can be found in the HUPAN paper supplementary methods.
-* The first command copies the non-redundant.fa sequence file from 12_finalpangenome to the 15_genepredmerge directory, because the script needs it to work. Call it novel.fa.
-* The ref/ directory contains exactly what the HUPAN script looks for with the exact names, i.e. ref.fa and ref.transcripts.fa (explained above).
+* The first command copies the `non-redundant.fa` sequence file from `12_finalpangenome` to the `15_genepredmerge` directory, because the script needs it to work. Call it `novel.fa`.
+* The `ref/` directory contains exactly what the HUPAN script looks for with the exact names, i.e. `ref.genome.fa` and `ref.transcripts.fa` (explained above).
* The last three inputs are (in order) the locations of the BLAST, CD-HIT and RepeatMasker executables respectively.
```
cp 12_finalpangenome/non-redundant.fa 15_genepredmerge/novel.fa
hupanSLURM filterNovGene -t 16 15_genepredmerge 16_genepredfilter /cbio/projects/015/HUPANdatabases/ref /cbio/bin/ /cbio/bin/ /cbio/bin/
```
-* hupanSLURM filterNovGene code is found in `/cbio/soft/HUPAN/lib/HUPANfilterNovGeneSLURM.pm`
+* hupanSLURM filterNovGene code is found in `/cbio/projects/015/HUPANdatabases/HUPAN/lib/HUPANfilterNovGeneSLURM.pm`

**(17) Assembling the pan-genome**

-* This step has does two things. First:
- * It takes in the human reference annotation file in ref/ref.genes.fa and retains only the longest transcript of each annotation while discarding others, like mRNA, lncRNA, etc.
- * The -f flag just tells the script that the file is in gtf format, not gff.
- * It saves the annotations in a file called ref.genes-ptpg.gtf, found in the /cbio/projects/015/HUPANdatabases/ref directory.
- * The annotation file was downloaded from https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_38/gencode.v38.chr_patch_hapl_scaff.annotation.gtf.gz (release 38).
- * The annotation file downloaded must be later than release 21, otherwise the HUPAN script won't work as it will be in a different format (format changed after release 21).
- * However, you also have to remove any annotations from patch/alt/decoy/unplaced contigs! So remove from the file all lines after "chrM" (mitochondrial DNA) annotations. I have not shown this step below, as the exact line numbers may be different.
- * So essentially, run the pTpG script on the annotation file, and then remove the extra annotations and save the final working file as ref.genes-ptpg-primaryseqs.gtf.
-* Then secondly, we combine this annotation file and the novel sequence annotation (from step 16) into one annotation file
- * You must first convert the novel annotation file into GTF format too, or otherwise start with GFF format for the reference annotation.
-* Both steps done in real time (interactively) so use `screen` and then start an interactive node with extra memory: `srun --mem=15g --pty bash`
-```
-hupanSLURM pTpG -f /cbio/projects/015/HUPANdatabases/ref/ref.genes.gtf /cbio/projects/015/HUPANdatabases/ref/ref.genes-ptpg.gtf
-# insert step to remove any additional annotations and save the primary sequence annotations as ref.genes-ptpg-primaryseqs.gtf
-# insert step to convert the Final.gff file to GTF format.
-mkdir 17_pan && cat /cbio/projects/015/HUPANdatabases/ref/ref.genes-ptpg-primaryseqs.gtf 16_genepredfilter/data/Final/Final.gtf > 17_pan/pan.gtf
-```
-* hupanSLURM pTpG code is found in `/cbio/soft/HUPAN/lib/HUPANpTpGSLURM.pm`
+* This step does multiple things. First:
+ * It takes in the human reference annotation file in `ref/ref.genes.gtf` and retains only the longest transcript of each annotation while discarding others, like mRNA, lncRNA, etc. It also removes any annotations from patch/alt/decoy/unplaced/mitochondrial DNA contigs, as the HUPAN pipeline seems to be unable to recognise them (a sketch of such a filter is shown below).
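+ * For illustration only (the actual scripting used may have differed), such a filter could keep comment lines plus the primary chromosomes with a one-liner like:
+```
+awk '$0 ~ /^#/ || $1 ~ /^chr([1-9]|1[0-9]|2[0-2]|X|Y)$/' ref.genes-ptpg.gtf > ref.genes-ptpg-primaryseqs.gtf
+```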
+ * This step, however, HAS ALREADY BEEN COMPLETED using the original HUPAN `pTpG` script and some additional scripting to remove the extra annotations. The primary annotations have been saved in a file called `ref.genes-ptpg-primaryseqs.gtf`, found in the `/cbio/projects/015/HUPANdatabases/ref` directory.
+ * NO MORE ACTION IS REQUIRED FOR THIS ANNOTATION FILE. You can use it as is. Should a new annotation file need to be made, contact Jess Bourn for guidance.
+ * Some additional information:
+ * The original annotation file was downloaded from https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_38/gencode.v38.chr_patch_hapl_scaff.annotation.gtf.gz (release 38).
+ * The annotation file downloaded MUST be later than release 21, otherwise the HUPAN script won't work as it will be in a different format (the format changed after release 21).
+* Then secondly, we convert the novel annotation file (GFF format) into GTF format in order to combine it successfully with the primary sequence annotation GTF file.
+ * This is done using an adapted form of the HUPAN `pTpG` script and the result is saved in the same `16_genepredfilter/data/Final/` directory.
+* Once these two steps have been completed, the two files can be merged to make the pan-genome annotation file.
+* These steps are done in real time (interactively) so use `screen` and then start an interactive node: `srun --pty bash`
+```
+hupanSLURM pTpG 16_genepredfilter/data/Final/Final.gff 16_genepredfilter/data/Final/Final-ptpg.gtf
+mkdir 17_pan && cat /cbio/projects/015/HUPANdatabases/ref/ref.genes-ptpg-primaryseqs.gtf 16_genepredfilter/data/Final/Final-ptpg.gtf > 17_pan/pan.gtf
+```
+* The original hupanSLURM pTpG code is found in `/cbio/projects/015/HUPANdatabases/HUPAN/lib/HUPANpTpGSLURMoriginal.pm`
+* The MAKER-adapted hupanSLURM pTpG code is found in `/cbio/projects/015/HUPANdatabases/HUPAN/lib/HUPANpTpGSLURM.pm`
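+* As a quick check after the merge, you can count gene records in the combined annotation (assuming a standard GTF with `gene` in the third column):
+```
+awk -F'\t' '$3 == "gene"' 17_pan/pan.gtf | wc -l
+```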

**(18) Aligning the samples to the pan-genome**

* This step does three things:
- * First, it combines the novel sequences with the human reference sequences into one file called pan.fa. This is the final pan-genome file.
- * It then uses the bowtie2-build function to build a Bowtie index from the pan-genome sequences.
- * This is done interactively and requires a lot of memory, so a `screen` session and an interactive node is required for the second command: `srun --mem=50g --time=24:00:00 --pty bash`
-* Finally, it then aligns all the fastq files (of each genome) to the pan-genome file using Bowtie 2. The fastq files of all samples are found in 0_fastq.
-* The -f flag specifies Bowtie2 (not BWA), and the -s and -k flags specify the naming convention of the fastq files given (e.g. LZP0B_R1.fastq.gz)
-* 18_map2pan is the output directory.
-* /cbio/projects/015/HUPANdatabases/images/ is where I have stored the Bowtie 2 Singularity container.
-```
-cat /cbio/projects/015/HUPANdatabases/ref/ref.fa 12_finalpangenome/non-redundant.fa > 17_pan/pan.fa
+ * First, it combines the novel sequences with the human reference sequences into one file called `pan.fa`. This is the final pan-genome file.
+ * It then uses the `bowtie2-build` function to build a Bowtie index from the pan-genome sequences.
+ * This is done interactively and requires a lot of memory, so a `screen` session and an interactive node are required for the second command: `srun --mem=50g --time=24:00:00 --pty bash`
+* Finally, it then aligns all the fastq files (of each genome) to the pan-genome file using Bowtie 2. The fastq files of all samples are found in `00_fastq`.
+* The `-f` flag specifies Bowtie2 (not BWA), and the `-s` and `-k` flags specify the naming convention of the fastq files given (e.g. `LZP0B_R1.fastq.gz`).
+* `18_map2pan` is the output directory.
+* `/cbio/projects/015/HUPANdatabases/images/` is where I have stored the Bowtie 2 Singularity container.
+```
+cat /cbio/projects/015/HUPANdatabases/ref/ref.genome.fa 12_finalpangenome/non-redundant.fa > 17_pan/pan.fa
cd 17_pan && /cbio/projects/015/HUPANdatabases/images/bowtie2-build pan.fa pan && cd ..
-hupanSLURM alignRead -f bowtie2 -t 8 -s .fastq.gz -k _R 0_fastq 18_map2pan /cbio/projects/015/HUPANdatabases/images/ 17_pan/pan
+hupanSLURM alignRead -f bowtie2 -t 8 -s .fastq.gz -k _R 00_fastq 18_map2pan /cbio/projects/015/HUPANdatabases/images/ 17_pan/pan
```
-* The final command will result in a .sam file for each fastq file (genome/individual) given.
-* hupanSLURM alignRead code is found in `/cbio/soft/HUPAN/lib/HUPANmapSLURM.pm`
+* The final command will result in a `.sam` file for each fastq file (genome/individual) given.
+* hupanSLURM alignRead code is found in `/cbio/projects/015/HUPANdatabases/HUPAN/lib/HUPANmapSLURM.pm`
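+* Per sample, the alignRead step is roughly equivalent to running Bowtie 2 directly, along these lines (a hedged sketch only; the wrapper handles the naming and output layout itself, and `LZP0B` is just the example sample name from above):
+```
+/cbio/projects/015/HUPANdatabases/images/bowtie2 -p 8 -x 17_pan/pan \
+  -1 00_fastq/LZP0B_R1.fastq.gz -2 00_fastq/LZP0B_R2.fastq.gz \
+  -S 18_map2pan/data/LZP0B/LZP0B.sam
+```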

**(19) Converting to BAM format**

-* This step uses Samtools to convert the .sam to .bam and then sort and index the .bam files.
-* It takes in the .sam files in the 18_map2pan directory and outputs the sorted and indexed .bam files to the 19_panBam directory.
-* /cbio/bin is where the Samtools executable is found.
+* This step uses Samtools to convert the `.sam` files to `.bam` format and then sort and index the `.bam` files.
+* It takes in the `.sam` files in the `18_map2pan` directory and outputs the sorted and indexed `.bam` files to the `19_panBam` directory.
+* `/cbio/bin` is where the Samtools executable is found.
```
hupanSLURM sam2bam -t 8 18_map2pan/data 19_panBam /cbio/bin
```
-* This will result in a .bam and .bai file for each fastq file (genome/individual) given.
-* hupanSLURM sam2bam code is found in `/cbio/soft/HUPAN/lib/HUPANsamToBamSLURM.pm`
+* This will result in a `.bam` and `.bai` file for each fastq file (genome/individual) given.
+* hupanSLURM sam2bam code is found in `/cbio/projects/015/HUPANdatabases/HUPAN/lib/HUPANsamToBamSLURM.pm`

**(20) Calculating gene coverage**

* This step calculates the coverage of each gene or CDS in all of the genomes used.
-* It does this by looking at the alignment information in the indexed and sorted .bam files from the previous step and identifying whether these alignments fall within the annotations in the pan.gtf file.
-* The results will be output to the 20_geneCov directory.
+* It does this by looking at the alignment information in the indexed and sorted `.bam` files from the previous step and identifying whether these alignments fall within the annotations in the `pan.gtf` file.
+* The results will be output to the `20_geneCovwithY` directory.
+* Of importance: the HUPAN authors removed the Y chromosome genes/CDS regions from further analysis as these would be classified as "distributed" in female samples, even if they are not distributed in male samples.
+ * We similarly implemented this using the second command below.
+ * First, a copy is made of the original directory so the Y chromosome presence/absence data is available if required at a later stage. Then, in the new `20_geneCov` directory, `sed` is used to remove all annotation lines containing the string `chrY` and the edited files are saved.
+ * These files lacking Y chromosome data are used in all subsequent steps.
```
-hupanSLURM geneCov -t 8 19_panBam/data 20_geneCov/ 17_pan/pan.gtf
+hupanSLURM geneCov -t 8 19_panBam/data 20_geneCovwithY/ 17_pan/pan.gtf
+cp -r 20_geneCovwithY 20_geneCov && cd 20_geneCov/data && for i in *; do sed -i '/chrY/d' $i/$i.sta; done && cd ../../
```
-* This will result in a .sta file for each genome.
-* hupanSLURM geneCov code is found in `/cbio/soft/HUPAN/lib/HUPANgeneCovSLURM.pm`
+* This will result in two `.sta` files for each sample, one containing Y annotations and one not.
+* hupanSLURM geneCov code is found in `/cbio/projects/015/HUPANdatabases/HUPAN/lib/HUPANgeneCovSLURM.pm`

**(21) Determining gene presence-absence variance profile**

* This step checks whether a gene is present or absent (based on a defined coverage threshold) for each sample.
* The gene coverage information from the `20_geneCov` directory is first combined into two files, one recording gene coverage and one recording CDS coverage.
 * The first input is the prefix you want to use for the merged file, and the second input is the data directory where the `.sta` files are.
* The final HUPAN command takes in the two files outputted from the previous command and creates the gene presence-absence variance profile.
* It does this by checking the gene or CDS coverage at a certain threshold. HUPAN sets theirs at 95% gene coverage.
* For this command, the two integer values are (in order) the minimum gene coverage and the minimum CDS coverage.
```
-cd 20_geneCov && hupanSLURM mergeGeneCov geneCovmerged 20_geneCov/data
-mkdir 21_geneExist && hupanSLURM geneExist 20_geneCov/summary_gene.cov 20_geneCov/summary_cds.cov 0 0.95 > 21_geneExist/gene.exist
+cd 20_geneCov && hupanSLURM mergeGeneCov summary_ data && cd ..
+mkdir 21_geneExist && hupanSLURM geneExist 20_geneCov/summary_gene.cov 20_geneCov/summary_cds.cov 0 0.95 > 21_geneExist/cds95.gene.exist
```
+* The first command will result in `summary_gene.cov` and `summary_cds.cov` files.
+* hupanSLURM mergeGeneCov code is found in `/cbio/projects/015/HUPANdatabases/HUPAN/lib/HUPANgeneCovSLURM.pm`
+* The second command will result in the `cds95.gene.exist` file.
+* hupanSLURM geneExist code is found in `/cbio/projects/015/HUPANdatabases/HUPAN/lib/HUPANgeneExistSLURM.pm`
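+* For a quick overview before any simulations, you can tally core vs. non-core genes straight from the presence/absence table (this sketch assumes a tab-delimited 0/1 matrix with a header row and one gene per row; check the actual file layout first):
+```
+awk 'NR>1{n=0; for(i=2;i<=NF;i++) n+=$i; if(n==NF-1) core++; else acc++}
+     END{print core" core genes, "acc" non-core genes"}' 21_geneExist/cds95.gene.exist
+```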
+
+And now you're finally done!
+
+**Performing simulations using the HUPAN suite**
+
+* The HUPAN pipeline offers two simulation functions for your data:
+ * The `sim` function simulates the size of the core and pan-genomes from the gene presence-absence data generated in step 21.
+ * The `simSeq` function simulates and plots the total amount of novel sequence as more individuals are added.
+
+* For the `sim` function:
+ * A `screen` session and a minimal interactive node are required: `srun --pty bash`
+ * You must choose whichever `.gene.exist` file is most appropriate for analysis of the pan-genome. Here, gene coverage of 95% is chosen.
+```
+mkdir 22_simulations
+hupanSLURM sim 21_geneExist/gene95.gene.exist 22_simulations/PAV
+```
+* This will result in a `sim_out.txt` file and a `sim_out.txt_PAV_plot.pdf` file with the simulated data and plots in them.
+
+* For the `simSeq` function:
+ * The first command performs the simulation on the sample-specific unaligned data in `05_unaligned`.
+ * `/cbio/bin` is the location of the CD-HIT executable which is used for the simulation.
+ * This simulation will take a few days to run, depending on the number of samples.
+ * The second command just plots the result and is interactive, so a `screen` session and a minimal interactive node are required: `srun --pty bash`
+```
+hupanSLURM simSeq simNovelSeq -t 8 05_unaligned/data/ 22_simulations/novelSeqs /cbio/bin
+hupanSLURM simSeq plotNovelSeq 22_simulations/novelSeqs/ outputPlot
+```
-* The first command will result in geneCovmergedgene.cov and geneCovmergedcds.cov files.
-* hupanSLURM mergeGeneCov code is found in `/cbio/soft/HUPAN/lib/HUPANgeneCovSLURM.pm`
-* The second command will result in the gene.exist file.
-* hupanSLURM geneExist code is found in `/cbio/soft/HUPAN/lib/HUPANgeneExistSLURM.pm`
+* This will result in a `simNovelSeq.pdf` file with the simulated plot.
diff --git a/bin/adjCdhit.pl b/bin/adjCdhit.pl
new file mode 100755
index 0000000..d64e124
--- /dev/null
+++ b/bin/adjCdhit.pl
@@ -0,0 +1,47 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+
+my $usage="$0 <in.fa> <in.cluster> <name_conversion_table> <out.fa> <out.cluster>
+";
+
+die $usage if @ARGV!=5;
+
+my ($fa,$facls,$con,$outfa,$outcls)=@ARGV;
+my %h;
+open(IN,$con);
+while(<IN>){
+    chomp;
+    my ($a,$b)=split /\t/,$_;
+    $h{$b}=$a;
+}
+close IN;
+
+open(OUT,">$outfa");
+open(IN,$fa);
+while(<IN>){
+    if(/^>([0-9]+)/){
+        print OUT ">",$h{$1},"\n";
+    }
+    else{
+        print OUT $_;
+    }
+}
+close IN;
+close OUT;
+
+open(OUT,">$outcls");
+open(IN,$facls);
+while(<IN>){
+    if(/^>/){
+        print OUT $_;
+    }
+    else{
+        my ($a,$b)=split /\.\.\./,$_;
+        my ($c,$d)=split /\>/,$a;
+        print OUT $c,$h{$d},$b;
+    }
+}
+close IN;
+close OUT;
diff --git a/bin/bam2cov b/bin/bam2cov
new file mode 100755
index 0000000..cd7b8ca
Binary files /dev/null and b/bin/bam2cov differ
diff --git a/bin/blastCluster.pl b/bin/blastCluster.pl
new file mode 100755
index 0000000..bf0e5bf
--- /dev/null
+++ b/bin/blastCluster.pl
@@ -0,0 +1,215 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+
+my $usage="$0 <fa_file> <identity_cutoff> <blast_out> <out_prefix>
+
+$0 is used to remove redundant sequences from self-blastn output.
+blastn should be run with \" -evalue 1e-5 -outfmt \"6 qseqid sseqid
+qlen slen length qstart qend sstart send pident evalue\" -max_target_seqs 1000\"
+
+";
+
+die $usage if @ARGV!=4;
+my ($fa,$IDEN,$blast_out,$prefix)=@ARGV;
+
+my %seq=readfa($fa);
+my %seqL=readLength($fa);
+
+my @cluster;
+my %fflag;
+my %pflag;
+
+my $index=0;
+my %now;
+my $pre_q="";
+
+open(IN,$blast_out) || die "Cannot open $blast_out!\n";
+while(<IN>){
+    chomp;
+    my ($qname,$tname,$qlen,$tlen,$len,$qstart,$qend,$tstart,$tend,$iden,$evalue)=split /\t/,$_;
+    next if $qname eq $tname;
+    next if defined $fflag{$qname};
+    $iden/=100;
+    if($qname eq $pre_q){
+        if($qlen>=$tlen){
+            add_to_now($tname,$iden,$len,$tstart,$tend,$tlen);
+        }
+        else{
+            add_to_now($tname,$iden,$len,$qstart,$qend,$qlen);
+        }
+    }
+    else{
+        my $new_clus=init_item($qname);
+        $fflag{$qname}=1;
+        $pflag{$qname}=$index;
+        foreach my $k (keys(%now)){
+            if($now{$k}->{iden}>$IDEN){
+                if(defined $fflag{$k}){
+                    next if $fflag{$k}>=$now{$k}->{iden};
+                    $cluster[$pflag{$k}]->{$k}=0;
+                    $new_clus->{$k}=1;
+                    $fflag{$k}=$now{$k}->{iden};
+                    $pflag{$k}=$index;
+                }
+                else{
+                    $new_clus->{$k}=1;
+                    $fflag{$k}=$now{$k}->{iden};
+                    $pflag{$k}=$index;
+                }
+            }
+        }
+        push @cluster, $new_clus;
+        $index++;
+        %now=();
+        if($qlen>=$tlen){
+            add_to_now($tname,$iden,$len,$tstart,$tend,$tlen);
+        }
+        else{
+            add_to_now($tname,$iden,$len,$qstart,$qend,$qlen);
+        }
+    }
+    $pre_q=$qname;
+}
+close IN;
+
+my $outcl=$prefix.".cluster";
+my $outfa=$prefix.".fa";
+my %rep;
+
+open(OUT,">$outcl");
+my $i=0;
+for($i=0;$i<@cluster;$i++){
+    my $l=0;
+    my $rep="";
+    # the longest member of each cluster becomes its representative
+    foreach my $k (keys(%{$cluster[$i]})){
+        if($cluster[$i]->{$k}==1){
+            if($seqL{$k}>$l){
+                $rep=$k;
+                $l=$seqL{$k};
+            }
+        }
+    }
+    $rep{$rep}=1;
+    print OUT "Group ",$i+1,": ";
+    foreach my $k (keys(%{$cluster[$i]})){
+        print OUT " ",$k if $cluster[$i]->{$k}==1;
+    }
+    print OUT "\n";
+}
+foreach my $k (keys(%seq)){
+    if(!defined($fflag{$k})){
+        $rep{$k}=1;
+        print OUT "Group $i: $k\n";
+        $i++;
+    }
+}
+close OUT;
+
+open(OUT,">$outfa");
+open(IN,$fa);
+my $f=0;
+while(<IN>){
+    chomp;
+    if(/^>(.+)$/){
+        if(defined $rep{$1}){
+            $f=1;
+            print OUT $_,"\n";
+        }
+        else{
+            # reset the flag so non-representative sequences are not carried over
+            $f=0;
+        }
+    }
+    else{
+        print OUT $_,"\n" if $f==1;
+    }
+}
+close IN;
+close OUT;
+
+################################################
+sub readfa{
+    my %h;
+    open(IN,$_[0]);
+    while(<IN>){
+        chomp;
+        $h{$1}=1 if(/^>(.+)$/);
+    }
+    close IN;
+    return %h;
+}
+
+sub readLength{
+    my %h;
+    open(IN,$_[0]);
+    my $name="";
+    while(<IN>){
+        chomp;
+        if(/^>(.+)$/){
+            $name=$1;
+            $h{$name}=0;
+        }
+        else{
+            $h{$name}+=length($_);
+        }
+    }
+    close IN;
+    return %h;
+}
+
+sub init_item{
+    my %h;
+    $h{$_[0]}=1;
+    return \%h;
+}
+
+sub add_to_now{
+    my ($tname,$iden,$len,$tstart,$tend,$tlen)=@_;
+    if(defined $now{$tname}){
+        if(!check_overlap($now{$tname},$tstart,$tend)){
+            push @{$now{$tname}->{start}},$tstart;
+            push @{$now{$tname}->{end}},$tend;
+            $now{$tname}->{iden}=$now{$tname}->{iden}+$iden*$len/$tlen;
+        }
+    }
+    else{
+        $now{$tname}=init_h($iden*$len/$tlen,$tstart,$tend);
+    }
+}
+
+sub init_h{
+    my %h;
+    $h{iden}=$_[0];
+    $h{start}=init_array($_[1]);
+    $h{end}=init_array($_[2]);
+    return \%h;
+}
+
+sub init_array{
+    my @a;
+    $a[0]=$_[0];
+    return \@a;
+}
+
+sub check_overlap{
+    my ($ad,$s,$e)=@_;
+    my $f=0;
+    for(my $i=0;$i<@{$ad->{start}};$i++){
+        if(check_overlap_helper($s,$e,$ad->{start}->[$i],$ad->{end}->[$i])){
+            $f=1;
+            last;
+        }
+    }
+    # report whether any recorded interval overlaps [$s,$e]
+    return $f;
+}
+
+sub check_overlap_helper{
+    my ($a,$b,$c,$d)=@_;
+    if($a<$c){
+        return 0 if $b<$c;
+        return 1 if $b>=$c;
+    }
+    elsif($a>$c){
+        return 0 if $d<$a;
+        return 1 if $d>=$a;
+    }
+    else{
+        return 1;
+    }
+}
diff --git a/bin/breakGCscaf.pl b/bin/breakGCscaf.pl
new file mode 100755
index 0000000..08283cb
--- /dev/null
+++ b/bin/breakGCscaf.pl
@@ -0,0 +1,69 @@
+#!/usr/bin/perl
+#Created by Hu Zhiqiang, 2014-10-5
+use strict;
+use warnings;
+use Getopt::Std;
+use vars qw($opt_m);
+getopts("m:");
+my $usage="Usage: $0 [options] <input_scaffold_fa> <output_contig_fa>
+This script is to break gap-closed scaffolds into contigs at consecutive N.
+
+Option:
+    -m    the number of consecutive N at which to break
+          default: 10
+";
+die $usage if @ARGV!=2;
+
+my $con_N=10;
+$con_N=$opt_m if defined($opt_m);
+
+break_scaffolds(@ARGV,$con_N);
+
+sub break_scaffolds{
+    my ($in,$out,$sepN)=@_;
+    my $sep="";
+    for(my $i=1;$i<$sepN;$i++){
+        $sep.="N";
+    }
+    open(SCAF,$in)||die("Error09: cannot read file:$in\n");
+    open(CONTIG,">$out")||die("Error10: cannot write file:$out\n");
+    my $init=1;
+    my ($header,$seq);
+    while(my $line=<SCAF>){
+        chomp $line;
+        if($line=~/^>/){
+            if(!$init){
+                split_and_print_seq($seq,$sep,$header);
+                ($header)=split /\s+/,$line;
+                $seq="";
+            }
+            else{
+                ($header)=split /\s+/,$line;
+                $seq="";
+                $init=0;
+            }
+        }
+        else{
+            $seq.=$line;
+        }
+    }
+    split_and_print_seq($seq,$sep,$header);
+    close SCAF;
+    close CONTIG;
+}
+
+sub split_and_print_seq{
+    my ($seq,$sep,$header)=@_;
+    my $i=0;
+    my @tmp=split /\Q$sep\EN+/,$seq;
+    foreach my $s (@tmp){
+        next if length($s)==0;
+        $i++;
+        print CONTIG $header,"_",$i,"\n";
+        # print the contig sequence in 60-character lines
+        for(my $j=0;$j<length($s);$j+=60){
+            print CONTIG substr($s,$j,60),"\n";
+        }
+    }
+}
diff --git a/bin/ccov b/bin/ccov
new file mode 100755
index 0000000..fcd0362
Binary files /dev/null and b/bin/ccov differ
diff --git a/bin/filterNovelgene.pl b/bin/filterNovelgene.pl
new file mode 100755
index 0000000..c03e110
--- /dev/null
+++ b/bin/filterNovelgene.pl
@@ -0,0 +1,349 @@
+#!/usr/bin/perl
+#Created by Duan Zhongqu at 2018-11-5
+use strict;
+use warnings;
+use Getopt::Std;
+
+my $usage="Usage: $0 seq_file gff_file tra_file pro_file genome_ref tran_ref cdhit_exe blast_exe mkblastdb_exe repeat_exe gene_identity ref_identity thread_num out_dir\n";
+
+die $usage if @ARGV!=14;
+
+my ($seq_file,$gff_file,$tra_file,$pro_file,$genome_ref,$tran_ref,$cdhit_exe,$blast_exe,$mkblastdb_exe,$repeat_exe,$gene_identity,$ref_identity,$thread_num,$out_dir)=@ARGV;
+
+my $log_file=$out_dir."FilterNovelGene.log";
+open(LOG,">$log_file")||die("Cannot write the file.\n");
+print LOG "\nCommand: $0 $seq_file $gff_file $tra_file $pro_file $genome_ref $tran_ref $cdhit_exe $blast_exe $mkblastdb_exe $repeat_exe $gene_identity $ref_identity $thread_num $out_dir\n\n";
+print LOG "\tInput file:\n\t$seq_file\n\t$gff_file\n\t$tra_file\n\t$pro_file\n\n";
+print LOG "\tReference information:\n\t$genome_ref\n\t$tran_ref\n\n";
+print LOG "\tOutput directory:\n\t$out_dir\n\n";
+print LOG "\tBinary executable file:\n\t$cdhit_exe\n\t$blast_exe\n\t$mkblastdb_exe\n\t$repeat_exe\n\n";
+print LOG "\tOptions:\n\tGene identity: $gene_identity\n\tReference identity: $ref_identity\n\tThread number: $thread_num\n\n";
+
+#read the gff file and get the number of novel predicted genes
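+# The gff file is expected to contain MAKER rows of the form (tab-separated; values are
+# illustrative):  contig1  maker  gene  101  1500  .  +  .  ID=gene001;Name=...
+# Column 9 must start with "ID=<gene_id>;" for the substr(...,3) extraction below to work.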
+my $raw_out_dir=$out_dir."raw/";
+mkdir($raw_out_dir);
+my $filter_100bp_dir=$out_dir."100bps/";
+mkdir($filter_100bp_dir);
+open(GFF,$gff_file)||die("Error9: cannot read the gff file.\n");
+my $new_gff_file=$raw_out_dir."maker.raw.gff";
+open(OUT1,">$new_gff_file")||die("Error10: cannot write the file: $new_gff_file.\n");
+my $raw_gene_map=$raw_out_dir."maker.gene.map";
+open(OUT,">$raw_gene_map")||die("Error10: cannot write the file: $raw_gene_map.\n");
+print OUT "sequence name\tstart\tend\tgene name\n";
+my $gene_map=$filter_100bp_dir."gene_100bps.map";
+open(OUT2,">$gene_map")||die("Error10: cannot write the file: $gene_map.\n");
+print OUT2 "sequence name\tstart\tend\tgene name\n";
+my $count_raw_gene=0;
+my $count_gene=0;
+print LOG "--------------------\nRead $gff_file file...\n\tall the annotation information from maker will be selected and written into $new_gff_file.\n\tthe positions of each predicted gene are recorded in $raw_gene_map.\n";
+while(my $line=<GFF>){
+    chomp $line;
+    if($line=~/maker/){
+        print OUT1 $line."\n";
+        if($line=~/maker\tgene/){
+            $count_raw_gene+=1;
+            my @string=split "\t",$line;
+            my $gene_length=$string[4]-$string[3]+1;
+            my @tmp=split ";",$string[8];
+            my $gene_id=substr($tmp[0],3);
+            print OUT $string[0]."\t".$string[3]."\t".$string[4]."\t".$gene_id."\n";
+            if($gene_length>=100){
+                $count_gene+=1;
+                print OUT2 $string[0]."\t".$string[3]."\t".$string[4]."\t".$gene_id."\n";
+            }
+        }
+    }
+}
+print LOG "\tthere are $count_raw_gene novel genes predicted by the MAKER tool.\n";
+print LOG "\tof these, $count_gene genes are longer than 100 bp.\n\tthe positions of each predicted gene are recorded in $gene_map.\n";
+close GFF;
+close OUT1;
+close OUT2;
+close OUT;
+
+#read the novel, transcript and protein sequences into hashes
+my %novel_seq=readSeq($seq_file);
+my %tran_seq=readSeq($tra_file);
+my %prot_seq=readSeq($pro_file);
+#obtain the gene sequences from the novel sequences based on the gff file
+open(IN,$gene_map)||die("Error9: cannot read the file: $gene_map.\n");
+my $gene_file=$filter_100bp_dir."gene.100bps.fasta";
+open(OUT,">$gene_file")||die("Error10: cannot write the file: $gene_file.\n");
+readline IN;
+my %gene_seq;
+my %gene_length;
+print LOG "\twrite these genes into $gene_file.\n";
+while(my $line=<IN>){
+    chomp $line;
+    my @string=split "\t",$line;
+    my $name=$string[0];
+    my $start=$string[1];
+    my $end=$string[2];
+    my $length=$end-$start+1;
+    my $gene_id=$string[3];
+    my $all_seq=$novel_seq{$name};
+    my $seq=substr($all_seq,$start-1,$length);
+    $gene_seq{$gene_id}=$seq;
+    $gene_length{$gene_id}=length($seq);
+    print OUT ">".$gene_id."\t".$name."\t".$start."\t".$end."\n".$seq."\n";
+}
+close IN;
+close OUT;
+my $iden=$gene_identity*100;
+print LOG "Done.\n--------------------\nRemoving redundant genes at a sequence identity of ".$iden."%...\n";
+#remove the redundant gene sequences
+my $rm_redundancy_dir=$out_dir."Non-redundancy/";
+mkdir($rm_redundancy_dir);
+my $cdhit_out=$rm_redundancy_dir."non.redundant.gene.fasta";
+my $cdhit_log=$rm_redundancy_dir."cdhit.log";
+my $com="$cdhit_exe -i $gene_file -o $cdhit_out -T $thread_num -c $gene_identity >$cdhit_log";
+system($com);
+#count the number of remaining novel genes
+my @non_redun_gene=extract_gene_name($cdhit_out);
+my $non_redun_gene_count=@non_redun_gene;
+print LOG "\tthere are $non_redun_gene_count novel genes after removing homologous gene sequences in $cdhit_out.\n";
+
+#align the novel sequences to the reference genome sequences
+print LOG "Done.\n-------------------\nAlign the gene sequences to the reference genome...\n";
+my $genome_ref_dir=$out_dir."ref_genome/";
+mkdir($genome_ref_dir);
+my $genome_db=$genome_ref_dir."blast_db_genome";
+my $genome_blast=$genome_ref_dir."non.redundant.gene.blast2genome";
+runblast($blast_exe,$mkblastdb_exe,$genome_ref,$genome_db,$cdhit_out,$genome_blast,$thread_num);
+
+#discard genes that align to the reference genome at the global identity cutoff of 50%
+my %gene_similar_genome=filterblast($genome_blast,\%gene_length);
+my @gene_non_similar_genome=grep {!$gene_similar_genome{$_}} @non_redun_gene;
+my $gene_non_similar_genome_count=@gene_non_similar_genome;
+print LOG "\tthere are $gene_non_similar_genome_count novel genes after removing genes similar to the reference genome.\n";
+my $gene_non_similar_genome_file=$genome_ref_dir."Non-similar2genome.gene.fasta";
+my @genes=extractSub($cdhit_out,$gene_non_similar_genome_file,"gene",\@gene_non_similar_genome);
+my $tran_non_similar_genome_file=$genome_ref_dir."Non-similar2genome.transcripts.fasta";
+my @tran_non_similar_genome=extractSub($tra_file,$tran_non_similar_genome_file,"transcripts",\@gene_non_similar_genome);
+print LOG "Done.\n--------------------\nAlign the gene sequences to the reference transcripts...\n";
+#align the transcript sequences to the reference transcript sequences
+my $tran_ref_dir=$out_dir."ref_tran/";
+mkdir($tran_ref_dir);
+my $tran_db=$tran_ref_dir."blast_db_transcripts";
+my $tran_blast=$tran_ref_dir."non.similar2genome.transcripts.blast2transcript";
+runblast($blast_exe,$mkblastdb_exe,$tran_ref,$tran_db,$tran_non_similar_genome_file,$tran_blast,$thread_num);
+
+#discard genes that align to the reference transcripts at the global identity cutoff of 50%
+my %tran_length;
+while((my $key,my $value)=each %tran_seq){
+    $tran_length{$key}=length($value);
+}
+my %tran_similar_genome=filterblast($tran_blast,\%tran_length);
+my @tran_non_similar_transcript=grep {!$tran_similar_genome{$_}} @tran_non_similar_genome;
+my %hash1;
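+# Transcript IDs are assumed to be the parent gene ID plus a three-character suffix
+# (e.g. MAKER's "-RA"), so stripping the last three characters recovers the gene ID.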
+foreach my $sub (@tran_non_similar_transcript){
+    my $tmp=substr($sub,0,length($sub)-3);
+    if(grep /^$tmp$/,@gene_non_similar_genome){
+        $hash1{$tmp}=1;
+    }
+}
+my @gene_non_similar_transcript=keys %hash1;
+my $gene_non_similar_transcript_count=@gene_non_similar_transcript;
+print LOG "\tthere are $gene_non_similar_transcript_count novel genes after removing genes similar to the reference transcripts.\n";
+my $gene_non_similar_transcript_file=$tran_ref_dir."Non-similar2transcript.gene.fasta";
+@genes=extractSub($gene_non_similar_genome_file,$gene_non_similar_transcript_file,"gene",\@gene_non_similar_transcript);
+my $tran_non_similar_transcript_file=$tran_ref_dir."Non-similar2transcript.transcripts.fasta";
+my @trans=extractSub($tran_non_similar_genome_file,$tran_non_similar_transcript_file,"transcript",\@gene_non_similar_transcript);
+print LOG "Done.\n--------------------\nDetermine the full-length genes...\n";
+#decide whether a novel gene is complete according to its start codon and stop codon
+my %filtered_tran_seq=readSeq($tran_non_similar_transcript_file);
+my %complete_gene_transcript_seq;
+while((my $key,my $value)=each %filtered_tran_seq){
+    my $start=substr($value,0,3);
+    my $end=substr($value,length($value)-3);
+    if(($start eq "ATG")&&($end eq "TAG"||$end eq "TAA"||$end eq "TGA")){
+        $complete_gene_transcript_seq{$key}=$value;
+    }
+}
+my %hash2;
+while((my $key,my $value)=each %complete_gene_transcript_seq){
+    my $tmp=substr($key,0,length($key)-3);
+    $hash2{$tmp}=1;
+}
+my @complete_gene_name=keys %hash2;
+my $complete_gene_dir=$out_dir."Full-length/";
+mkdir($complete_gene_dir);
+my $complete_gene_file=$complete_gene_dir."Full-length.gene.fasta";
+@genes=extractSub($gene_non_similar_genome_file,$complete_gene_file,"gene",\@complete_gene_name);
+my $complete_tran_file=$complete_gene_dir."Full-length.transcript.fasta";
+@trans=extractSub($tran_non_similar_transcript_file,$complete_tran_file,"transcript",\@complete_gene_name);
+my $complete_gene_count=@complete_gene_name;
+print LOG "\tthere are $complete_gene_count full-length novel genes.\n";
+print LOG "Done.\n--------------------\nRepeatmask the sequences of the remaining genes...\n";
+#assess the repeat content of the full-length novel genes
+my $repeat_out_dir=$out_dir."Repeat/";
+mkdir($repeat_out_dir);
+$com="$repeat_exe -parallel $thread_num -species human -html -gff -dir $repeat_out_dir $complete_gene_file";
+system($com);
+my $masked_gene_file=$repeat_out_dir."Full-length.gene.fasta.masked";
+my %masked_gene_seq=readSeq($masked_gene_file);
+my %final_gene;
+while((my $key,my $value)=each %masked_gene_seq){
+    my $length=length($value);
+    my $count = $value =~ tr/N/N/;
+    if($count/$length<=0.5){
+        $final_gene{$key}=1;
+    }
+}
+print LOG "Done.\n--------------------\nProduce the final data set of novel genes...\n";
+my $final_gene_count=keys %final_gene;
+print LOG "\tthere are $final_gene_count novel genes left after repeat masking.\n";
+my @final_gene_name=keys %final_gene;
+my $final_out_dir=$out_dir."Final/";
+mkdir($final_out_dir);
+my $final_gene_file=$final_out_dir."Final.gene.fasta";
+@genes=extractSub($complete_gene_file,$final_gene_file,"gene",\@final_gene_name);
+my $final_tran_file=$final_out_dir."Final.transcript.fasta";
+@trans=extractSub($complete_tran_file,$final_tran_file,"transcript",\@final_gene_name);
+my $final_prot_file=$final_out_dir."Final.protein.fasta";
+my @proteins=extractSub($pro_file,$final_prot_file,"protein",\@final_gene_name);
+my $final_gff_file=$final_out_dir."Final.gff";
+open(IN,$new_gff_file)||die("Error09: cannot read the file: $new_gff_file");
+open(OUT,">$final_gff_file")||die("Error10: cannot write the file: $final_gff_file");
+while(my $line=<IN>){
+    chomp $line;
+    my @string=split "\t", $line;
+    my @tmp=split ";", $string[8];
+    my $name=substr($tmp[0],3,length($final_gene_name[0]));
+    if(grep /^$name$/, @final_gene_name){
+        print OUT $line."\n";
+    }
+}
+print LOG "Done.\n--------------------\n";
+print LOG "The final data set includes four files:\n\tgene sequences: $final_gene_file\n\ttranscript sequences: $final_tran_file\n\tprotein sequences: $final_prot_file\n\tannotation information: $final_gff_file.\n";
+close IN;
+close OUT;
+close LOG;
+
+sub readSeq{
+    use strict;
+    use warnings;
+    my ($seq_file)=@_;
+    my %sequence;
+    open(IN,$seq_file)||die("Error09: Could not read file: $seq_file.\n");
+    my $count=0;
+    my $seq;
+    my $name;
+    while(my $line=<IN>){
+        chomp $line;
+        if($line=~/^>/){
+            my @string=split /\s+/, $line;
+            if($count==0){
+                $count+=1;
+                $name=substr($string[0],1);
+            }
+            else{
+                $sequence{$name}=$seq;
+                $name=substr($string[0],1);
+                $seq="";
+            }
+        }
+        else{
+            $seq.=$line;
+            if(eof){
+                $sequence{$name}=$seq;
+            }
+        }
+    }
+    close IN;
+    return %sequence;
+}
+
+sub runblast{
+    use strict;
+    use warnings;
+    my ($blast_exe,$mkblastdb_exe,$ref,$db,$input,$output,$thread_num)=@_;
+    my $com="$mkblastdb_exe -in $ref -dbtype nucl -out $db";
+    $com.="\n$blast_exe -query $input -db $db -out $output -outfmt 7 -max_target_seqs 1 -num_threads $thread_num\n";
+    #print $com."\n";
+    system("$com");
+}
+1;
+
+sub extract_gene_name{
+    use strict;
+    use warnings;
+    my ($input_file)=@_;
+    open(IN,$input_file)||die("Error09: cannot read the file: $input_file.\n");
+    my @names;
+    while(my $line=<IN>){
+        chomp $line;
+        if($line=~/^>/){
+            my @string=split "\t",$line;
+            push @names, substr($string[0],1);
+        }
+    }
+    return @names;
+}
+
+sub filterblast{
+    use strict;
+    use warnings;
+    my ($blast_file,$gene_length)=@_;
+    my %gene_length=%$gene_length;
+    open(IN,$blast_file)||die("Error09: cannot read the file: $blast_file");
+    my %names;
+    while(my $line=<IN>){
+        chomp $line;
+        next if $line=~/^#/;
+        my @string=split "\t", $line;
+        my $name=$string[0];
+        my $identity=$string[2];
+        my $align_rate=$string[3]/$gene_length{$name};
+        if($align_rate*$identity>=50){
+            #if(($align_rate>=0.5)&&($identity>=80)){
+            $names{$name}=1;
+        }
+    }
+    return %names;
+}
+1;
+sub extractSub{
+    use strict;
+    use warnings;
+    my ($input,$output,$type,$subsets)=@_;
+    my @subsets=@$subsets;
+    open(IN,$input)||die("Error09: cannot read the file: $input.\n");
+    open(OUT,">$output")||die("Error10: cannot write the file: $output.\n");
+    my $flag=0;
+    my %names;
+    while(my $line=<IN>){
+        chomp $line;
+        if($line=~/^>/){
+            my @string=split /\s+/,$line;
+            my $name;
+            my $tmp=substr($string[0],1);
+            $names{$tmp}=1;
+            if($type=~/gene/){
+                $name=substr($string[0],1);
+            }
+            elsif($type=~/transcript/||$type=~/protein/){
+                $name=substr($string[0],1,length($string[0])-4);
+            }
+            else{
+                die("Please define the sequence type: 'gene', 'transcript' or 'protein'.\n");
+            }
+            if(grep /^$name$/, @subsets){
+                print OUT $line."\n";
+                $flag=1;
+            }
+            else{
+                $flag=0;
+            }
+        }
+        else{
+            if($flag==1){
+                print OUT $line,"\n";
+            }
+        }
+    }
+    close IN;
+    close OUT;
+    return %names;
+}
diff --git a/bin/getTaxClass.pl b/bin/getTaxClass.pl
new file mode 100755
index 0000000..10c108c
--- /dev/null
+++ b/bin/getTaxClass.pl
@@ -0,0 +1,172 @@
+#!/usr/bin/perl
+#Created by Duan Zhongqu at 2018-10-31.
+#Revised by Duan Zhongqu at 2019-03-06.
+
+##############################################################################################
+# This script first extracts all ACCESSION IDs from the blast result,                       #
+# then obtains the Taxonomy IDs via the file nucl_gb.accession2taxid,                       #
+# and finally obtains the lineage information according to rankedlineage.dmp                #
+##############################################################################################
+
+use strict;
+use warnings;
+use Getopt::Std;
+use vars qw($opt_h);
+getopts("h");
+
+my $usage="\nUsage: $0 [options] <blast_file> <taxonomy_info_dir> <out_dir>
+
+$0 is used to obtain the taxonomic classification of each sequence from a blast result.
+
+Necessary input description:
+
+  blast_file         The blast result of sequences aligned to the nt database.
+
+  taxonomy_info_dir  The directory should contain two files:
+                     1. nucl_gb.accession2taxid, which is available at
+                        ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/;
+                     2. rankedlineage.dmp, which is available at
+                        https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/.
+
+  out_dir            The output directory.
+
+Options:
+
+  -h                 Print this usage.
+ +"; + +die $usage if @ARGV!=3; +die $usage if $opt_h; +my ($blast_file,$info_dir,$out_dir)=@ARGV; + +die("Error01: Cannot find $blast_file\n") unless(-e $blast_file); +die("Error01: Cannot find $info_dir\n") unless(-e $info_dir); +unless($info_dir=~/\/$/){ + $info_dir.="/"; +} +unless($out_dir=~/\/$/){ + $info_dir.="/"; +} +my $accession2taxid=$info_dir."nucl_gb.accession2taxid"; +my $rankedlineage=$info_dir."rankedlineage.dmp"; + +unless(-e $accession2taxid){ + die("Error01: Cannot find the file: nucl_gb.accession2taxid in the $info_dir,\nplease download from the website: ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/.") +} + +unless(-e $rankedlineage){ + die("Error01: Cannot find the file: rankedlineage.dmp in the $info_dir,\nplease download from the website: https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/.") +} + +#get the accession id of each alignment +open(IN,$blast_file)||die("Error9: cannot read the file: $blast_file!\n"); +my %accession; +while(my $line=){ +chomp $line; +next if $line=~/^#/; +my @string=split "\t",$line; +my @accession_version=split(/\./,$string[1]); + $accession{$accession_version[0]}=$string[1]; +} +close IN; +my $len1=keys %accession; +my $log_file=$out_dir."job.log"; +open(LOG,">$log_file")||die("Error10: cannot write the file $log_file.\n"); +print LOG "There are $len1 accession in the blast file.\n"; + +#get the related taxonomic id of accession id from accession2taxid file. +open(FILE1,$accession2taxid)||die("Error9: cannot read the file $accession2taxid!\n"); +my %acc_tax; +while(my $line=){ + chomp $line; + my @string=split "\t",$line; + if(exists $accession{$string[0]}){ + $acc_tax{$string[0]}=$string[2] + } +} +close FILE1; +my $len2=keys %acc_tax; +print LOG "\tOf these, $len2 accessiones could be found in the $accession2taxid.\n"; + +my $html_dir=$out_dir."html/"; +mkdir($html_dir); + +#obtain the taxonomix id of some accession id due to updating accession2taxid file. 
+if($len1!=$len2){ + my @acc_keys=keys %accession; + my @non=grep {!$acc_tax{$_}} @acc_keys; + my $len3=@non; + print LOG "\tIn total $len3 accessions have been removed in the file: $accession2taxid , and we will search them from NCBI.\n"; + foreach my $line (@non){ + my $html_file=$html_dir.$line.".html"; + system("wget https://www.ncbi.nlm.nih.gov/nuccore/$line -O $html_file"); + open(HTML,$html_file)||die("error with opening $html_file\n"); + while(){ + if($_ =~ /ORGANISM=(\d+)&/){ + my $tax_id=$1; + $acc_tax{$line}=$tax_id; + } + } + close HTML; + } +} + +#obtain taxonomic classification of each taxonomic id +my %taxid; +while((my $key, my $value)=each %acc_tax){ + $taxid{$value}=1; +} +my $len4=keys %taxid; +my $taxid_file=$out_dir."taxid.txt"; +print LOG "\nThere are $len4 taxid could be found.\n"; +open(TAX,">$taxid_file")||die("Error10: cannot write the file: $taxid_file.\n"); +while((my $key, my $value)=each %taxid){ + print TAX $key."\n"; +} +close TAX; + +my %source; +open(IN,$rankedlineage)||die("Cannot read the file: $rankedlineage.\n"); +while(my $line=){ + chomp $line; + my @string=split /\t\|\t/,$line; + if(exists($taxid{$string[0]})){ + $string[9]=~ s/\t|$//; + $source{$string[0]}.="superkingdom:".$string[9]."; "; + $source{$string[0]}.="kingdom:".$string[8]."; "; + $source{$string[0]}.="phylum:".$string[7]."; "; + $source{$string[0]}.="class:".$string[6]."; "; + $source{$string[0]}.="order:".$string[5]."; "; + $source{$string[0]}.="family:".$string[4]."; "; + $source{$string[0]}.="genus:".$string[3]."; "; + $source{$string[0]}.="species:".$string[2]."."; + } +} +close IN; + +my $len5=keys %source; +my $taxid_name_file=$out_dir."taxid.name"; +print LOG "There are $len5 taxid and name could be found.\n"; +open(NAME,">$taxid_name_file")||die("Error10: cannot write the file: $taxid_name_file.\n"); +while((my $key, my $value)=each %source){ + print NAME $key."\t".$value."\n"; +} + +my $out_file=$out_dir."accession.name"; +open(OUT,">$out_file")||die("Error10: cannot write the table: $out_file.\n"); +print OUT "accesion\taccession.version\ttaxid\tsource\n"; +while((my $key, my $value)=each %accession){ + #print $key."\n"; + my $taxonomy=$acc_tax{$key}; + my $records; + if(exists $source{$taxonomy}){ + $records=$source{$taxonomy}; + } + else{ + $records="Unclassified"; + } + print OUT $key."\t".$value."\t".$taxonomy."\t".$records."\n"; +} +close OUT; +close LOG; diff --git a/bin/getUnalnCtg.pl b/bin/getUnalnCtg.pl new file mode 100755 index 0000000..35cb7d8 --- /dev/null +++ b/bin/getUnalnCtg.pl @@ -0,0 +1,138 @@ +#!/usr/bin/perl +#Created by Hu Zhiqiang, 2014-9-5 +##Modefied by Duan Zhongqu, 2018-11-27 +##Extract both the fully unaligned contigs and partially unaligned contigs. +##Extract the coordinate of partially unaligned contigs locate in reference genome. + +use strict; +use warnings; +use Getopt::Std; +use vars qw($opt_h); +getopts("h"); + +my $usage="\nUsage: $0 [options] + +$0 is used to collect both fully unaligned contigs and partially unaligned contigs, and +collect the coords of the partially unaligned contigs. + +Necessary input description: + + contig_file Assembled contig fasta file. + + unaligned_info_file File including the info of unaligned contigs. + + coords_file File including the coords of contigs. + + output_prefix The prefix of outfiles. + + prefix A prefix added to the name of each contig. This is used + to mark which sample the contigs are from. + +Options: + -h Print this usage page. 
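+# Example invocation (hypothetical file names, for illustration only):
+#   perl getUnalnCtg.pl S01.contig S01.unaligned.info S01.coords 05_unaligned/S01 S01
+# This writes 05_unaligned/S01.fully.contig, S01.partially.contig and
+# S01.partially.coords, renaming each contig to "S01:<contig name>" so the
+# sample of origin survives later merging across samples.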
+ +"; + +die $usage if @ARGV!=5; +die $usage if defined($opt_h); +my ($contig_file,$info_file,$coords_file,$output_prefix,$prefix)=@ARGV; + +#Check input file existence +die("Error01: Cannot find $contig_file\n") unless(-e $contig_file); +die("Error02: Cannot find $info_file\n") unless(-e $info_file); +die("Error02: Cannot find $coords_file\n") unless(-e $coords_file); + +#Define the output file +my $full_unalign_file=$output_prefix.".fully.contig"; +my $part_unalign_file=$output_prefix.".partially.contig"; +my $part_coords_file=$output_prefix.".partially.coords"; + +#Check existence of output file +print STDERR "Warning: $full_unalign_file file exists. Overwrite.\n" if -e $full_unalign_file; +print STDERR "Warning: $part_unalign_file file exists. Overwrite.\n" if -e $part_unalign_file; +print STDERR "Warning: $part_coords_file file exists. Overwrite.\n" if -e $part_coords_file; + +#read in unaligned info file and collect the fully unaligned contig names and the partially unaligned contig name. +my %full_unalign_names; +my %part_unalign_names; +open(IN,$info_file)||die("Error05: cannor read $info_file.\n"); +while(my $line=){ + chomp $line; + my @string=split "\t",$line; + if($string[3]=~/^full$/){ + $full_unalign_names{$string[0]}=0; + } + elsif($string[3]=~/^partial$/){ + $part_unalign_names{$string[0]}=0; + } +} + +#collect unaligned contigs from contig file +open(FULL,">$full_unalign_file") || die("Error04: Cannot write $full_unalign_file\n"); +open(PART,">$part_unalign_file") || die("Error04: Cannot write $part_unalign_file\n"); +if($contig_file=~/\.gz$/){ + open(CONTIG,"zcat $contig_file |") || die("Error05: Cannot read $contig_file\n"); +} +else{ + open(CONTIG,$contig_file) || die("Error05: Cannot read $contig_file\n"); +} +my $flag=0; +while(my $line=){ + if($line=~/^>(.+)$/){ + my $m=$1; + $m=~s/\s+/_/g; + if(defined($full_unalign_names{$m})){ + $flag=1; + $full_unalign_names{$m}=1; + print FULL ">$prefix:$m\n"; + } + elsif(defined($part_unalign_names{$m})){ + $flag=2; + $part_unalign_names{$m}=1; + print PART ">$prefix:$m\n"; + } + else{ + $flag=0; + } + } + else{ + print FULL $line if $flag==1; + print PART $line if $flag==2; + } +} +close CONTIG; +close FULL; +close PART; + +#Examine if all fully unaligned contigs are extracted +foreach my $key1 (keys(%full_unalign_names)){ + print STDERR "WARNING: Cannot find $key1 in $full_unalign_file\n" if $full_unalign_names{$key1}==0; +} + +#Examine if all partially unaligned contigs are extracted +foreach my $key2 (keys(%part_unalign_names)){ + print STDERR "WARNING: Cannot find $key2 in $part_unalign_file\n" if $part_unalign_names{$key2}==0; +} + +#Collect the coords of partially unaligned contigs from coords file. 
+open(IN,$coords_file)||die("Error05: Cannot read $coords_file"); +open(OUT,">$part_coords_file")||die("Error04: Cannot write $part_coords_file"); +print OUT "Query\tRef\tQuery.start\tQuery.end\tRef.start\tRef.end\tQuery.len\tRef.len\tIdentity"; +readline IN; +readline IN; +while(my $line=){ + chomp $line; + my @string=split /\s+/,$line; + if(exists($part_unalign_names{$string[12]})){ + $part_unalign_names{$string[12]}=0; + print OUT $prefix.":".$string[12]."\t".$string[11]."\t".$string[3]."\t".$string[4]."\t".$string[0]."\t".$string[1]."\t".$string[7]."\t".$string[6]."\t".$string[9]."\n"; + } +} +close IN; +close OUT; + +#Examine if all the partially unaligned contigs coords are extracted +foreach my $key3 (keys(%part_unalign_names)){ + print STDERR "WARNING: Cannot find $key3 in $part_unalign_file\n" if $part_unalign_names{$key3}==1; +} + diff --git a/bin/hupan b/bin/hupan new file mode 100755 index 0000000..cc1b4a0 --- /dev/null +++ b/bin/hupan @@ -0,0 +1,230 @@ +#!/usr/bin/perl +use strict; +use warnings; +use HUPANqualSta; +use HUPANtrim; +use HUPANmap; +use HUPANsamToBam; +use HUPANbamSta; +use HUPANassem; +use HUPANassemSta; +use HUPANunalnCtg; +use HUPANrmRdt; +use HUPANfastaSta; +use HUPANgeneCov; +use HUPANpTpG; +use HUPANgeneExist; +use HUPANbam2bed; +use HUPANsim; +use HUPANrmHigh; +use HUPANgetTaxClass; +use HUPANrmContaminate; +use HUPANblastAlign; +use HUPANsimSeq; +use HUPANsplitSeq; +use HUPANgenePre; +use HUPANmergeNovGene; +use HUPANfilterNovGene; + +checkPRE() if @ARGV==0; +PrintUsage() if @ARGV<1; + +my %commands=( + "qualSta" => 0, + "trim" => 0, + "alignRead" => 0, + "sam2bam" => 0, + "bamSta" => 0, + "assemble" => 0, + "assemSta" => 0, + "getUnalnCtg" => 0, + "rmRedundant" => 0, + "pTpG" => 0, + "geneCov" => 0, + "geneExist" => 0, + "subSample" => 0, + "gFamExist" => 0, + "bam2bed" => 0, + "fastaSta" => 0, + "sim" => 0, + "alignContig" => 0, + "extractSeq" => 0, + "getTaxClass" => 0, + "rmCtm" => 0, + "blastAlign" => 0, + "simSeq" => 0, + "splitSeq" => 0, + "genePre" => 0, + "mergeNovGene" => 0, + "filterNovGene" => 0 +); + +my $com=shift @ARGV; +if(defined $commands{$com}){ + $commands{$com}=1; +} +else{ + print STDERR "Invalid command: $com\n"; + PrintUsage(); +} + +if($commands{"qualSta"}){ + if(qualSta::checkQual(@ARGV)){ + qualSta::mergeFastqc(@ARGV); + } +} +elsif($commands{"trim"}){ + trim::trimFastq(@ARGV); +} +elsif($commands{"alignRead"}){ + align::map(@ARGV); +} +elsif($commands{"sam2bam"}){ + adjAlign::sam2bam(@ARGV); +} +elsif($commands{"bamSta"}){ + bamStat::bamsta(@ARGV); +} +elsif($commands{"assemble"}){ + assembly::assemble(@ARGV); +} +elsif($commands{"assemSta"}){ + assemStat::runQuast(@ARGV); +} +elsif($commands{"getUnalnCtg"}){ + unalnCtg::getUnaln(@ARGV); +} +elsif($commands{"rmRedundant"}){ + rmRDT::rmRDT(@ARGV); +} +elsif($commands{"fastaSta"}){ + fastaSta::sta(@ARGV); +} +elsif($commands{"geneCov"}){ + geneCov::cov(@ARGV); +} +elsif($commands{"pTpG"}){ + pTpG::pTpG(@ARGV); +} +elsif($commands{"geneExist"}){ + gExist::checkGeneExist(@ARGV); +} +elsif($commands{"subSample"}){ + gExist::subsetSample(@ARGV); +} +elsif($commands{"gFamExist"}){ + gExist::gE2gfE(@ARGV); +} +elsif($commands{"bam2bed"}){ + bam2cov::bam2bed(@ARGV); +} +elsif($commands{"sim"}){ + sim::simulation(@ARGV); + sim::pavPlot(@ARGV); +} +elsif($commands{"alignContig"}){ + rmHigh::alignContig(@ARGV); +} +elsif($commands{"extractSeq"}){ + rmHigh::extractSeq(@ARGV); +} +elsif($commands{"getTaxClass"}){ + getTaxClass::getTaxClass(@ARGV); +} +elsif($commands{"rmCtm"}){ + 
rmCTM::rmContaminate(@ARGV); +} +elsif($commands{"blastAlign"}){ + blastAlign::blastAlign(@ARGV); +} +elsif($commands{"simSeq"}){ + simSeq::simSeq(@ARGV); +} +elsif($commands{"splitSeq"}){ + splitSeq::splitSeq(@ARGV); +} +elsif($commands{"genePre"}){ + genePre::genePre(@ARGV); +} +elsif($commands{"mergeNovGene"}){ + mergeNovGene::mergeNovGene(@ARGV); +} +elsif($commands{"filterNovGene"}){ + filterNovGen::filterNovGen(@ARGV) +} + +sub PrintUsage{ + print STDERR "\nUsage: hupan ...\n\n"; + print STDERR "Available commands:\n"; + print STDERR "\tqualSta \tView the overall sequencing quality of a large number of files\n"; + print STDERR "\ttrim \tTrim or filter low-quality reads parallelly\n"; + print STDERR "\talignRead \tMap reads to a reference parallelly\n"; + print STDERR "\tsam2bam \tCovert alignments (.sam) to sorted .bam files\n"; + print STDERR "\tbamSta \tStatistics of parallel mapping\n"; + print STDERR "\tassemble \tAssemble reads parallelly\n"; + print STDERR "\talignContig \tAlign assembly results to a referenece parallelly\n"; + print STDERR "\textractSeq \tExtract contigs parallelly\n"; + print STDERR "\tassemSta \tStatistics of parallel assembly\n"; + print STDERR "\tgetUnalnCtg \tExtract the unaligned contigs from nucmer alignment (processed by quast)\n"; + print STDERR "\trmRedundant \tRemove redundant contigs of a fasta file\n"; + print STDERR "\tpTpG \tGet the longest transcripts to represent genes\n"; + print STDERR "\tgeneCov \tCalculate gene body coverage and CDS coverage\n"; + print STDERR "\tgeneExist \tDetermine gene presence-absence based on gene body coverage and CDS coverage\n"; + print STDERR "\tsubSample \tSelect subset of samples from gene PAV profile\n"; + print STDERR "\tgFamExist \tDetermine gene family presence-absence based on gene presence-absence\n"; + print STDERR "\tbam2bed \tCalculate genome region presence-absence from .bam\n"; + print STDERR "\tfastaSta \tCalculate statistics of fasta file\n"; + print STDERR "\tsim \tSimulation and plot of the pan-genome and the core genome\n"; + print STDERR "\tgetTaxClass \tObtain the taxonomic classification of sequences\n"; + print STDERR "\trmCtm \tDetect and discard the potentail contamination\n"; + print STDERR "\tblastAlign \tAlign sequences to target sequence by blast\n"; + print STDERR "\tsimSeq \tSimulate and plot the total size of novel sequences\n"; + print STDERR "\tsplitSeq \tSplit sequence file into multiple small size files\n"; + print STDERR "\tgenePre \tAb initio gene predict combining with RNA and protein evidence\n"; + print STDERR "\tmergeNovGene \tMerge maker result from multiple maker result files\n"; + print STDERR "\tfilterNovGene\tFilter the novel precited genes.\n"; + exit(-1); +} + +sub checkPRE{ + my $exec="ccov"; + my @path=split /:/,$ENV{PATH}; + my $fpflag=0; + foreach my $p (@path){ + $p.="/".$exec; + if(-e $p && -x $p){ + $fpflag=1; + last; + } + } + die("Can not find ccov in your PATH!\n +") unless($fpflag); + + $exec="Rscript"; + @path=split /:/,$ENV{PATH}; + $fpflag=0; + foreach my $p (@path){ + $p.="/".$exec; + if(-e $p && -x $p){ + $fpflag=1; + last; + } + } + die("Can not find Rscript in your PATH!\n +") unless($fpflag); + + + $exec="bam2cov"; + @path=split /:/,$ENV{PATH}; + $fpflag=0; + foreach my $p (@path){ + $p.="/".$exec; + if(-e $p && -x $p){ + $fpflag=1; + last; + } + } + die("Can not find bam2cov in your PATH!\n +") unless($fpflag); +} + diff --git a/bin/hupanLSF b/bin/hupanLSF new file mode 100755 index 0000000..5e2d87a --- /dev/null +++ b/bin/hupanLSF @@ -0,0 +1,249 
@@ +#!/usr/bin/perl +use strict; +use warnings; +use HUPANqualStaLSF; +use HUPANtrimLSF; +use HUPANmapLSF; +use HUPANsamToBamLSF; +use HUPANbamStaLSF; +use HUPANassemLSF; +use HUPANassemStaLSF; +use HUPANunalnCtgLSF; +use HUPANrmRdtLSF; +use HUPANfastaStaLSF; +use HUPANgeneCovLSF; +use HUPANpTpGLSF; +use HUPANgeneExistLSF; +use HUPANbam2bedLSF; +use HUPANsimLSF; +use HUPANrmHighLSF; +use HUPANgetTaxClassLSF; +use HUPANrmContaminateLSF; +use HUPANblastAlignLSF; +use HUPANsimSeqLSF; +use HUPANsplitSeqLSF; +use HUPANgenePreLSF; +use HUPANmergeNovGeneLSF; +use HUPANfilterNovGeneLSF; + +checkPRE() if @ARGV==0; +PrintUsage() if @ARGV<1; +my %commands=( + "qualSta" => 0, + "mergeQualSta" => 0, + "trim" => 0, + "alignRead" => 0, + "sam2bam" => 0, + "bamSta" => 0, + "assemble" => 0, + "assemSta" => 0, + "mergeAssemSta" => 0, + "getUnalnCtg" => 0, + "mergeUnalnCtg" => 0, + "rmRedundant" => 0, + "pTpG" => 0, + "geneCov" => 0, + "mergeGeneCov" => 0, + "geneExist" => 0, + "subSample" => 0, + "gFamExist" => 0, + "bam2bed" => 0, + "fastaSta" => 0, + "sim" => 0, + "alignContig" => 0, + "extractSeq" => 0, + "getTaxClass" => 0, + "rmCtm" => 0, + "blastAlign" => 0, + "simSeq" => 0, + "splitSeq" => 0, + "genePre" => 0, + "mergeNovGene" => 0, + "filterNovGene" => 0 +); + +my $com=shift @ARGV; +if(defined $commands{$com}){ + $commands{$com}=1; +} +else{ + print STDERR "Invalid command: $com\n"; + PrintUsage(); +} +if($commands{"qualSta"}){ + if(qualStaLSF::checkQual(@ARGV)){ + print STDERR "Run \"hupanLSF mergeQualSta\" command after all jobs are finished!\n"; + } +} +elsif($commands{"mergeQualSta"}){ + qualStaLSF::mergeFastqc(@ARGV); +} +elsif($commands{"trim"}){ + trim::trimFastq(@ARGV); +} +elsif($commands{"alignRead"}){ + align::map(@ARGV); +} +elsif($commands{"sam2bam"}){ + adjAlign::sam2bam(@ARGV); +} +elsif($commands{"bamSta"}){ + bamStat::bamsta(@ARGV); +} +elsif($commands{"assemble"}){ + assembly::assemble(@ARGV); +} +elsif($commands{"assemSta"}){ + assemStat::runQuast(@ARGV); +} +elsif($commands{"mergeAssemSta"}){ + assemStat::mergeAssemSta(@ARGV); +} +elsif($commands{"getUnalnCtg"}){ + unalnCtg::getUnaln(@ARGV); +} +elsif($commands{"mergeUnalnCtg"}){ + unalnCtg::mergeUnaln(@ARGV); +} +elsif($commands{"rmRedundant"}){ + rmRDT::rmRDT(@ARGV); +} +elsif($commands{"fastaSta"}){ + fastaSta::sta(@ARGV); +} +elsif($commands{"geneCov"}){ + geneCov::cov(@ARGV); +} +elsif($commands{"mergeGeneCov"}){ + geneCov::merge(@ARGV); +} +elsif($commands{"pTpG"}){ + pTpG::pTpG(@ARGV); +} +elsif($commands{"geneExist"}){ + gExist::checkGeneExist(@ARGV); +} +elsif($commands{"subSample"}){ + gExist::subsetSample(@ARGV); +} +elsif($commands{"gFamExist"}){ + gExist::gE2gfE(@ARGV); +} +elsif($commands{"bam2bed"}){ + bam2cov::bam2bed(@ARGV); +} +elsif($commands{"sim"}){ + sim::simulation(@ARGV); + sim::pavPlot(@ARGV); +} +elsif($commands{"alignContig"}){ + rmHigh::alignContig(@ARGV); +} +elsif($commands{"extractSeq"}){ + rmHigh::extractSeq(@ARGV); +} +elsif($commands{"getTaxClass"}){ + getTaxClass::getTaxClass(@ARGV); +} +elsif($commands{"rmCtm"}){ + rmCTM::rmContaminate(@ARGV); +} +elsif($commands{"blastAlign"}){ + blastAlign::blastAlign(@ARGV); +} +elsif($commands{"simSeq"}){ + simSeq::simSeq(@ARGV); +} +elsif($commands{"splitSeq"}){ + splitSeq::splitSeq(@ARGV); +} +elsif($commands{"genePre"}){ + genePre::genePre(@ARGV); +} +elsif($commands{"mergeNovGene"}){ + mergeNovGene::mergeNovGene(@ARGV); +} +elsif($commands{"filterNovGene"}){ + filterNovGen::filterNovGen(@ARGV) +} + +sub PrintUsage{ + print STDERR "\nUsage: hupanLSF 
...\n\n"; + print STDERR "Available commands:\n"; + print STDERR "\tqualSta \tRun fastqc on a large number of files\n"; + print STDERR "\tmergeQualSta \tView the overall sequencing quality by combining fastqc outputs\n"; + print STDERR "\ttrim \tTrim or filter low-quality reads parallelly\n"; + print STDERR "\talignRead \tMap reads to a reference parallelly\n"; + print STDERR "\tsam2bam \tConvert alignments (.sam) to sorted .bam files\n"; + print STDERR "\tbamSta \tStatistics of parallel mapping\n"; + print STDERR "\tassemble \tAssemble reads parallelly\n"; + print STDERR "\talignContig \tAlign assembly results to a referenece parallelly\n"; + print STDERR "\textractSeq \tExtract contigs parallelly\n"; + print STDERR "\tassemSta \tStatistics of parallel assembly\n"; + print STDERR "\tmergeAssemSta\tMerge statistics of all indivduals to a single file\n"; + print STDERR "\tgetUnalnCtg \tExtract the unaligned contigs from nucmer alignment (processed by quast)\n"; + print STDERR "\tmergeUnalnCtg\tMerge the unaligned contigs into a single file\n"; + print STDERR "\trmRedundant \tRemove redundant contigs of a fasta file\n"; + print STDERR "\tpTpG \tGet the longest transcripts to represent genes\n"; + print STDERR "\tgeneCov \tCalculate gene body coverage and CDS coverage\n"; + print STDERR "\tmergeGeneCov \tMerge gene body coverage and cds coverage of each sample to two summary files\n"; + print STDERR "\tgeneExist \tDetermine gene presence-absence based on gene body coverage and CDS coverage\n"; + print STDERR "\tsubSample \tSelect subset of samples from gene PAV profile\n"; + print STDERR "\tgFamExist \tDetermine gene family presence-absence based on gene presence-absence\n"; + print STDERR "\tbam2bed \tCalculate genome region presence-absence from .bam\n"; + print STDERR "\tfastaSta \tCalculate statistics of fasta file\n"; + print STDERR "\tsim \tsimulation and plot of the pan-genome and the core genome\n"; + print STDERR "\tgetTaxClass \tObtain the taxonomic classification of sequence\n"; + print STDERR "\trmCtm \tDetect and discard the potentail contamination\n"; + print STDERR "\tblastAlign \tAlign sequences to target sequence by blast\n"; + print STDERR "\tsimSeq \tSimulate and plot the total size of novel sequences\n"; + print STDERR "\tsplitSeq \tSplit sequence file into multiple small size files\n"; + print STDERR "\tgenePre \tAb initio gene predict combining with RNA and protein evidence\n"; + print STDERR "\tmergeNovGene \tMerge maker result from multiple maker result files\n"; + print STDERR "\tfilterNovGene\tFilter the novel precited genes.\n"; + exit(-1); +} + + +sub checkPRE{ + my $exec="ccov"; + my @path=split /:/,$ENV{PATH}; + my $fpflag=0; + foreach my $p (@path){ + $p.="/".$exec; + if(-e $p && -x $p){ + $fpflag=1; + last; + } + } + die("Can not find ccov in your PATH!\n +") unless($fpflag); + + $exec="Rscript"; + @path=split /:/,$ENV{PATH}; + $fpflag=0; + foreach my $p (@path){ + $p.="/".$exec; + if(-e $p && -x $p){ + $fpflag=1; + last; + } + } + die("Can not find Rscript in your PATH!\n +") unless($fpflag); + + + $exec="bam2cov"; + @path=split /:/,$ENV{PATH}; + $fpflag=0; + foreach my $p (@path){ + $p.="/".$exec; + if(-e $p && -x $p){ + $fpflag=1; + last; + } + } + die("Can not find bam2cov in your PATH!\n +") unless($fpflag); +} + diff --git a/bin/hupanSLURM b/bin/hupanSLURM new file mode 100755 index 0000000..2eb5215 --- /dev/null +++ b/bin/hupanSLURM @@ -0,0 +1,249 @@ +#!/usr/bin/perl +use strict; +use warnings; +use HUPANqualStaSLURM; +use HUPANtrimSLURM; +use 
HUPANmapSLURM; +use HUPANsamToBamSLURM; +use HUPANbamStaSLURM; +use HUPANassemSLURM; +use HUPANassemStaSLURM; +use HUPANunalnCtgSLURM; +use HUPANrmRdtSLURM; +use HUPANfastaStaSLURM; +use HUPANgeneCovSLURM; +use HUPANpTpGSLURM; +use HUPANgeneExistSLURM; +use HUPANbam2bedSLURM; +use HUPANsimSLURM; +use HUPANrmHighSLURM; +use HUPANgetTaxClassSLURM; +use HUPANrmContaminateSLURM; +use HUPANblastAlignSLURM; +use HUPANsimSeqSLURM; +use HUPANsplitSeqSLURM; +use HUPANgenePreSLURM; +use HUPANmergeNovGeneSLURM; +use HUPANfilterNovGeneSLURM; + +checkPRE() if @ARGV==0; +PrintUsage() if @ARGV<1; +my %commands=( + "qualSta" => 0, + "mergeQualSta" => 0, + "trim" => 0, + "alignRead" => 0, + "sam2bam" => 0, + "bamSta" => 0, + "assemble" => 0, + "assemSta" => 0, + "mergeAssemSta" => 0, + "getUnalnCtg" => 0, + "mergeUnalnCtg" => 0, + "rmRedundant" => 0, + "pTpG" => 0, + "geneCov" => 0, + "mergeGeneCov" => 0, + "geneExist" => 0, + "subSample" => 0, + "gFamExist" => 0, + "bam2bed" => 0, + "fastaSta" => 0, + "sim" => 0, + "alignContig" => 0, + "extractSeq" => 0, + "getTaxClass" => 0, + "rmCtm" => 0, + "blastAlign" => 0, + "simSeq" => 0, + "splitSeq" => 0, + "genePre" => 0, + "mergeNovGene" => 0, + "filterNovGene" => 0 +); + +my $com=shift @ARGV; +if(defined $commands{$com}){ + $commands{$com}=1; +} +else{ + print STDERR "Invalid command: $com\n"; + PrintUsage(); +} +if($commands{"qualSta"}){ + if(qualStaSLURM::checkQual(@ARGV)){ + print STDERR "Run \"hupanSLURM mergeQualSta\" command after all jobs are finished!\n"; + } +} +elsif($commands{"mergeQualSta"}){ + qualStaSLURM::mergeFastqc(@ARGV); +} +elsif($commands{"trim"}){ + trim::trimFastq(@ARGV); +} +elsif($commands{"alignRead"}){ + align::map(@ARGV); +} +elsif($commands{"sam2bam"}){ + adjAlign::sam2bam(@ARGV); +} +elsif($commands{"bamSta"}){ + bamStat::bamsta(@ARGV); +} +elsif($commands{"assemble"}){ + assembly::assemble(@ARGV); +} +elsif($commands{"assemSta"}){ + assemStat::runQuast(@ARGV); +} +elsif($commands{"mergeAssemSta"}){ + assemStat::mergeAssemSta(@ARGV); +} +elsif($commands{"getUnalnCtg"}){ + unalnCtg::getUnaln(@ARGV); +} +elsif($commands{"mergeUnalnCtg"}){ + unalnCtg::mergeUnaln(@ARGV); +} +elsif($commands{"rmRedundant"}){ + rmRDT::rmRDT(@ARGV); +} +elsif($commands{"fastaSta"}){ + fastaSta::sta(@ARGV); +} +elsif($commands{"geneCov"}){ + geneCov::cov(@ARGV); +} +elsif($commands{"mergeGeneCov"}){ + geneCov::merge(@ARGV); +} +elsif($commands{"pTpG"}){ + pTpG::pTpG(@ARGV); +} +elsif($commands{"geneExist"}){ + gExist::checkGeneExist(@ARGV); +} +elsif($commands{"subSample"}){ + gExist::subsetSample(@ARGV); +} +elsif($commands{"gFamExist"}){ + gExist::gE2gfE(@ARGV); +} +elsif($commands{"bam2bed"}){ + bam2cov::bam2bed(@ARGV); +} +elsif($commands{"sim"}){ + sim::simulation(@ARGV); + sim::pavPlot(@ARGV); +} +elsif($commands{"alignContig"}){ + rmHigh::alignContig(@ARGV); +} +elsif($commands{"extractSeq"}){ + rmHigh::extractSeq(@ARGV); +} +elsif($commands{"getTaxClass"}){ + getTaxClass::getTaxClass(@ARGV); +} +elsif($commands{"rmCtm"}){ + rmCTM::rmContaminate(@ARGV); +} +elsif($commands{"blastAlign"}){ + blastAlign::blastAlign(@ARGV); +} +elsif($commands{"simSeq"}){ + simSeq::simSeq(@ARGV); +} +elsif($commands{"splitSeq"}){ + splitSeq::splitSeq(@ARGV); +} +elsif($commands{"genePre"}){ + genePre::genePre(@ARGV); +} +elsif($commands{"mergeNovGene"}){ + mergeNovGene::mergeNovGene(@ARGV); +} +elsif($commands{"filterNovGene"}){ + filterNovGen::filterNovGen(@ARGV) +} + +sub PrintUsage{ + print STDERR "\nUsage: hupanSLURM ...\n\n"; + print STDERR "Available 
commands:\n"; + print STDERR "\tqualSta \tRun fastqc on a large number of files\n"; + print STDERR "\tmergeQualSta \tView the overall sequencing quality by combining fastqc outputs\n"; + print STDERR "\ttrim \tTrim or filter low-quality reads parallelly\n"; + print STDERR "\talignRead \tMap reads to a reference parallelly\n"; + print STDERR "\tsam2bam \tConvert alignments (.sam) to sorted .bam files\n"; + print STDERR "\tbamSta \tStatistics of parallel mapping\n"; + print STDERR "\tassemble \tAssemble reads parallelly\n"; + print STDERR "\talignContig \tAlign assembly results to a referenece parallelly\n"; + print STDERR "\textractSeq \tExtract contigs parallelly\n"; + print STDERR "\tassemSta \tStatistics of parallel assembly\n"; + print STDERR "\tmergeAssemSta\tMerge statistics of all indivduals to a single file\n"; + print STDERR "\tgetUnalnCtg \tExtract the unaligned contigs from nucmer alignment (processed by quast)\n"; + print STDERR "\tmergeUnalnCtg\tMerge the unaligned contigs into a single file\n"; + print STDERR "\trmRedundant \tRemove redundant contigs of a fasta file\n"; + print STDERR "\tpTpG \tGet the longest transcripts to represent genes\n"; + print STDERR "\tgeneCov \tCalculate gene body coverage and CDS coverage\n"; + print STDERR "\tmergeGeneCov \tMerge gene body coverage and cds coverage of each sample to two summary files\n"; + print STDERR "\tgeneExist \tDetermine gene presence-absence based on gene body coverage and CDS coverage\n"; + print STDERR "\tsubSample \tSelect subset of samples from gene PAV profile\n"; + print STDERR "\tgFamExist \tDetermine gene family presence-absence based on gene presence-absence\n"; + print STDERR "\tbam2bed \tCalculate genome region presence-absence from .bam\n"; + print STDERR "\tfastaSta \tCalculate statistics of fasta file\n"; + print STDERR "\tsim \tsimulation and plot of the pan-genome and the core genome\n"; + print STDERR "\tgetTaxClass \tObtain the taxonomic classification of sequences\n"; + print STDERR "\trmCtm \tDetect and discard the potentail contamination\n"; + print STDERR "\tblastAlign \tAlign sequences to target sequence by blast\n"; + print STDERR "\tsimSeq \tSimulate and plot the total size of novel sequences\n"; + print STDERR "\tsplitSeq \tSplit sequence file into multiple small size files\n"; + print STDERR "\tgenePre \tAb initio gene predict combining with RNA and protein evidence\n"; + print STDERR "\tmergeNovGene \tMerge maker result from multiple maker result files\n"; + print STDERR "\tfilterNovGene\tFilter the novel precited genes.\n"; + exit(-1); +} + + +sub checkPRE{ + my $exec="ccov"; + my @path=split /:/,$ENV{PATH}; + my $fpflag=0; + foreach my $p (@path){ + $p.="/".$exec; + if(-e $p && -x $p){ + $fpflag=1; + last; + } + } + die("Can not find ccov in your PATH!\n +") unless($fpflag); + + $exec="Rscript"; + @path=split /:/,$ENV{PATH}; + $fpflag=0; + foreach my $p (@path){ + $p.="/".$exec; + if(-e $p && -x $p){ + $fpflag=1; + last; + } + } + die("Can not find Rscript in your PATH!\n +") unless($fpflag); + + + $exec="bam2cov"; + @path=split /:/,$ENV{PATH}; + $fpflag=0; + foreach my $p (@path){ + $p.="/".$exec; + if(-e $p && -x $p){ + $fpflag=1; + last; + } + } + die("Can not find bam2cov in your PATH!\n +") unless($fpflag); +} + diff --git a/bin/linearK.pl b/bin/linearK.pl new file mode 100755 index 0000000..3317eff --- /dev/null +++ b/bin/linearK.pl @@ -0,0 +1,384 @@ +#!/usr/bin/perl +#Created by Hu Zhiqiang, 2014-10-5 +use strict; +use warnings; +use Getopt::Std; +use vars qw($opt_h $opt_t $opt_g $opt_s 
$opt_r $opt_w $opt_u $opt_c $opt_n $opt_k $opt_m);
+getopts("ht:g:s:r:w:u:c:n:k:m:");
+
+my $usage="Usage: $0 [options] <data_directory> <output_directory> <soap_denovo2_directory>
+
+$0 assembles paired FASTQ reads with SOAPdenovo2, choosing the K-mer dynamically.
+
+ATTENTION: Executables 1)SOAPdenovo-63mer, 2)SOAPdenovo-127mer and 3)GapCloser should EXIST in soap_denovo2_directory!
+
+Necessary input description:
+
+  data_directory            This directory should contain one or more pairs of FASTQ files
+                            with the suffix .fastq.gz or .fq.gz. The suffix can be changed
+                            with the -s option.
+
+  output_directory          The output directory.
+
+  soap_denovo2_directory    Directory where the SOAPdenovo2 programs are located. SOAPdenovo-63mer,
+                            SOAPdenovo-127mer and GapCloser should exist in this directory.
+
+Options:
+  -h   Print this usage page.
+
+  -t   Threads used.
+       Default: 1
+
+  -g   Genome size. Used to infer sequencing depth.
+       Default: 3000000000 (3Gb)
+
+  -s   Suffix of files within data_directory.
+       Default: .fastq.gz
+
+  -r   Parameters of the linear function: Kmer=2*int(0.5*(a*Depth+b))+1.
+       The parameter should be input as \"a,b\".
+       Default: 0.76,20
+
+  -w   Step-length of Kmer change.
+       Default: 2
+
+  -u   Upper limit on the number of Kmer changes. This parameter is set to reduce
+       redundant computation.
+       Default: 10
+
+  -c   Parameters of the soapdenovo2 config file: 8 values joined by commas
+       1) maximal read length
+       2) average insert size
+       3) if sequence needs to be reversed
+       4) in which part(s) the reads are used
+       5) use only first N bps of each read
+       6) in which order the reads are used while scaffolding
+       7) cutoff of pair number for a reliable connection (at least 3 for
+          short insert size)
+       8) minimum aligned length to contigs for a reliable read location
+          (at least 32 for short insert size)
+       Default: 80,460,0,3,80,1,3,32
+
+  -n   The minimum length of contigs. Contigs shorter than this length will
+       NOT be used when calculating N50.
+       Default: 100
+
+  -k   Available Kmer range. Give comma-separated lower bound and upper bound.
+       Default: 15,127
+
+  -m   The number of consecutive Ns at which gap-closed scaffolds are broken
+       back into contigs.
+       Default: 10.
+
+=======================================================
+Any comments or bugs could be sent to Hu Zhiqiang:
+    doodlehzq\@sjtu.edu.cn.
+=======================================================
+";
+
+die $usage if @ARGV!=3;
+die $usage if defined($opt_h);
+my ($data_dir,$out_dir,$tool_dir)=@ARGV;
+
+#check existence of output directory
+#die("Error: output directory \"$out_dir\" already exists.\n"
+#    ."To avoid overwriting of existing files, we kindly request that the\n"
+#    ."output directory not exist.\n") if -e $out_dir;
+
+#detect executables
+$tool_dir.="/" unless $tool_dir=~/\/$/;
+my $exec63=$tool_dir."SOAPdenovo-63mer";
+die("Cannot find $exec63 in directory $tool_dir\n") unless -e $exec63;
+my $exec127=$tool_dir."SOAPdenovo-127mer";
+die("Cannot find $exec127 in directory $tool_dir\n") unless -e $exec127;
+my $execgap=$tool_dir."GapCloser";
+die("Cannot find $execgap in directory $tool_dir\n") unless -e $execgap;
+
+#define thread number
+my $thread_num=1;
+$thread_num=$opt_t if defined($opt_t);
+
+#define file suffix
+my $suffix=".fastq.gz";
+$suffix=$opt_s if defined($opt_s);
+
+#define genome size
+my $gsize=3000000000;
+$gsize=$opt_g if defined($opt_g);
+
+#define linear function of Kmer
+my ($lin_a,$lin_b)=(0.76,20);
+($lin_a,$lin_b)=split /,/,$opt_r if defined($opt_r);
+
+#define step of Kmer change
+my $step=2;
+$step=$opt_w if defined($opt_w);
+
+#define max iteration times
+my $max_iter=10;
+$max_iter=$opt_u if defined($opt_u);
+
+#define SOAPdenovo contig parameters
+my ($max_rd_len,$avg_ins,$reverse_seq,$asm_flags,$rd_len_cutoff,$rank,$pair_num_cutoff,$map_len)
+    =(80,460,0,3,80,1,3,32);
+($max_rd_len,$avg_ins,$reverse_seq,$asm_flags,$rd_len_cutoff,$rank,$pair_num_cutoff,$map_len)
+    =split /,/,$opt_c if defined($opt_c);
+
+#define the min length of contigs
+my $min_contig_len=100;
+$min_contig_len=$opt_n if defined($opt_n);
+
+#define Kmer range
+my ($minKmer, $maxKmer)=(15,127);
+($minKmer, $maxKmer)=split /,/,$opt_k if defined($opt_k);
+
+#define number of consecutive Ns
+my $con_N=10;
+$con_N=$opt_m if defined($opt_m);
+
+#adjust directory and create output directory
+$data_dir.="/" unless($data_dir=~/\/$/);
+$out_dir.="/" unless($out_dir=~/\/$/);
+mkdir($out_dir);
+
+#read in files under data_directory
+opendir(DATA,$data_dir) || die("Error01: can not read input data directory: $data_dir\n");
+my @files=readdir(DATA);
+closedir DATA;
+
+#parse fastq pairs
+my %fq_base;
+foreach my $f (@files){
+    next if $f=~/^\.+$/;
+    print STDERR "Warning: $f without suffix: $suffix\n" unless $f=~/$suffix$/;
+    next unless $f=~/$suffix$/;
+    my $fb=substr($f,0,length($f)-length($suffix)-1);   #strip the read number and suffix
+    $fq_base{$fb}=1 unless defined($fq_base{$fb});
+}
+
+#generate SOAPdenovo config
+my $config_file=$out_dir."soap.config";
+open(CONFIG,">$config_file") || die("Error02: can not write soap config file:$config_file\n");
+print CONFIG "\#maximal read length\nmax_rd_len=$max_rd_len\n[LIB]\n\#average insert size\navg_ins=$avg_ins\n\#if sequence needs to be reversed\nreverse_seq=$reverse_seq\n\#in which part(s) the reads are used\nasm_flags=$asm_flags\n\#use only first N bps of each read\nrd_len_cutoff=$rd_len_cutoff\n\#in which order the reads are used while scaffolding\nrank=$rank\n\# cutoff of pair number for a reliable connection (at least 3 for short insert size)\npair_num_cutoff=$pair_num_cutoff\n\#minimum aligned length to contigs for a reliable read location (at least 32 for short insert size)\nmap_len=$map_len\n\#a pair of fastq files, read 1 file should always be followed by read 2 file\n";
+foreach my $b (keys(%fq_base)){
+    my $forward=$data_dir.$b."1".$suffix;
+    my $reverse=$data_dir.$b."2".$suffix;
+    print STDERR "Warning: missing file: $forward\n" unless -e $forward;
+    print STDERR "Warning: missing file: $reverse\n" unless -e $reverse;
+    print CONFIG "q1=$forward\nq2=$reverse\n" if(-e $forward && -e $reverse);
+}
+close CONFIG;
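+# Worked example of the -r K-mer formula (illustrative numbers): with the
+# defaults a=0.76 and b=20 at 30x sequencing depth,
+#   Kmer = 2*int(0.5*(0.76*30+20))+1 = 2*int(21.4)+1 = 43,
+# so the first round assembles at K=41, 43 and 45 (default step -w 2) and the
+# window then slides until the N50 stops improving.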
+#calculate sequencing depth
+my $base_number=0;
+foreach my $b (keys(%fq_base)){
+    my $forward=$data_dir.$b."1".$suffix;
+    my $reverse=$data_dir.$b."2".$suffix;
+    $base_number+=count_base_from_fq($forward);
+    $base_number+=count_base_from_fq($reverse);
+}
+my $depth=$base_number/$gsize;
+print STDERR "Total base number: $base_number\n";
+print STDERR "Sequencing depth = $depth (Reference size=$gsize)\n";
+
+#calculate init Kmer
+my $Kmid=2*int(0.5*($lin_a*$depth+$lin_b))+1;
+my $Klow=$Kmid-$step;
+my $Khigh=$Kmid+$step;
+print STDERR "Init Kmer = $Kmid\n";
+
+#init assembly
+my $Nmid=run_assembly($Kmid,$out_dir."K".$Kmid);
+my $Nlow=run_assembly($Klow,$out_dir."K".$Klow);
+my $Nhigh=run_assembly($Khigh,$out_dir."K".$Khigh);
+
+#iterated assembly
+my $iter_times=0;
+while($Nmid<$Nlow || $Nmid<$Nhigh){
+    $iter_times++;
+    if($Nmid<$Nhigh){ #Kmer goes higher
+        if($iter_times>$max_iter){ #check iteration times
+            print STDERR "Stop best Kmer selection: out of max iteration times($max_iter)\n";
+            ($Kmid,$Khigh)=($Khigh,$Kmid);
+            ($Nmid,$Nhigh)=($Nhigh,$Nmid);
+            last;
+        }
+        if($Khigh+$step>$maxKmer){ #check Kmer region
+            print STDERR "Stop best Kmer selection: outside the UP boundary of Kmer($maxKmer)\n";
+            ($Kmid,$Khigh)=($Khigh,$Kmid);
+            ($Nmid,$Nhigh)=($Nhigh,$Nmid);
+            last;
+        }
+        $Klow=$Kmid;   #slide the K window upwards
+        $Nlow=$Nmid;
+        $Kmid=$Khigh;
+        $Nmid=$Nhigh;
+        $Khigh+=$step;
+        $Nhigh=run_assembly($Khigh,$out_dir."K".$Khigh);
+    }
+    elsif($Nmid<$Nlow){ #Kmer goes lower
+        if($iter_times>$max_iter){ #check iteration times
+            print STDERR "Stop best Kmer selection: out of max iteration times($max_iter)\n";
+            ($Kmid,$Klow)=($Klow,$Kmid);
+            ($Nmid,$Nlow)=($Nlow,$Nmid);
+            last;
+        }
+        if($Klow-$step<$minKmer){ #check Kmer region
+            print STDERR "Stop best Kmer selection: outside the DOWN boundary of Kmer($minKmer)\n";
+            ($Kmid,$Klow)=($Klow,$Kmid);
+            ($Nmid,$Nlow)=($Nlow,$Nmid);
+            last;
+        }
+        $Khigh=$Kmid;   #slide the K window downwards
+        $Nhigh=$Nmid;
+        $Kmid=$Klow;
+        $Nmid=$Nlow;
+        $Klow-=$step;
+        $Nlow=run_assembly($Klow,$out_dir."K".$Klow);
+    }
+}
+
+#adjust outputs
+system("mv $out_dir"."K$Kmid/* $out_dir");
+print STDERR "Finished!\nIteration_time=$iter_times\nKmer(selected)=$Kmid\nN50(K=$Kmid)=$Nmid\n";
+exit;
+
+######################## sub routines ############################
+
+sub count_base_from_fq{ #usage: count_base_from_fq(file_name)
+    my ($fq)=@_;
+    my $bn=0;
+    if($fq=~/\.gz$/){
+        open(FQ,"zcat $fq |") ||die("Error07: cannot open gzipped file: $fq\n");
+    }
+    else{
+        open(FQ,"$fq") ||die("Error08: cannot open file: $fq\n");
+    }
+    my $i=0;
+    while(<FQ>){
+        $i++;
+        chomp;
+        $bn+=length($_) if($i%4==2);   #the sequence is the 2nd line of each 4-line fastq record
+    }
+    close FQ;
+    return $bn;
+}
+
+sub run_assembly{ #usage: n50=run_assembly(k,outdir)
+    my ($kmer,$out)=@_;
+    print STDERR "Assembly with kmer=$kmer\n";
+    mkdir($out) unless(-e $out);
+
+#run soap_denovo
+    my $com;
+    if($kmer<=63){
+        $com=$exec63;
+    }
+    else{
+        $com=$exec127;
+    }
+    $com.=" all -s $config_file -o $out/K$kmer -K $kmer -R -F -p $thread_num >$out/soap.log 2>&1";
+    system($com);
+
+#run gapcloser
+    $com=$execgap;
+    $com.=" -b $config_file -a $out/K$kmer.scafSeq -o $out/K\Q$kmer\E.gcScafSeq -t $thread_num >$out/gapcloser.log 2>&1";
+    system($com);
+
+#break gap-closed scaffolds to contigs
+    break_scaffolds("$out/K\Q$kmer\E.gcScafSeq","$out/K\Q$kmer\E.gcContig",$con_N);
+
+#calculate N50
+    my $n50=calculate_N50("$out/K\Q$kmer\E.gcContig");
+    print STDERR "N50(K=$kmer)=$n50\n";
+    return $n50;
+}
+
+sub break_scaffolds{
+    my ($in,$out,$sepN)=@_;
+    my $sep="";
+    for(my $i=1;$i<$sepN;$i++){
+        $sep.="N";
+    }
+    open(SCAF,$in)||die("Error09: cannot read file:$in\n");
+    open(CONTIG,">$out")||die("Error10: cannot write file:$out\n");
+    my $init=1;
+    my ($header,$seq);
+    while(my $line=<SCAF>){
+        chomp $line;
+        if($line=~/^>/){
+            if(!$init){
+                split_and_print_seq($seq,$sep,$header);
+                ($header)=split /\s+/,$line;
+                $seq="";
+            }
+            else{
+                ($header)=split /\s+/,$line;
+                $seq="";
+                $init=0;
+            }
+        }
+        else{
+            $seq.=$line;
+        }
+    }
+    split_and_print_seq($seq,$sep,$header);
+    close SCAF;
+    close CONTIG;
+}
+
+sub split_and_print_seq{
+    my ($seq,$sep,$header)=@_;
+    my $i=0;
+    my @tmp=split /\Q$sep\EN+/,$seq;   #split at runs of at least $sepN consecutive Ns
+    foreach my $s (@tmp){
+        next if length($s)==0;
+        $i++;
+        print CONTIG $header,"_",$i,"\n";
+        for(my $j=0;$j<length($s);$j+=60){   #emit the sequence in fixed-width lines (60 bp assumed)
+            print CONTIG substr($s,$j,60),"\n";
+        }
+    }
+}
+
+sub calculate_N50{ #usage: n50=calculate_N50(contig_file)
+    open(CONTIG,$_[0])||die("Error11: cannot open file:$_[0]\n");
+    my @len;
+    my $seq="";
+    my $cl=0;
+    my $init=1;
+    while(<CONTIG>){
+        if(/^>/){
+            if(!$init){
+                push @len,$cl if($cl>=$min_contig_len);
+                $seq="";
+                $cl=0;
+            }
+            else{
+                $init=0;
+            }
+        }
+        else{
+            chomp;
+            $cl+=length($_);
+        }
+    }
+    push @len,$cl if($cl>=$min_contig_len);
+    close CONTIG;
+    @len=sort{$a<=>$b}(@len);
+    my $total=0;
+    foreach my $v (@len){
+        $total+=$v;
+    }
+    $total/=2;
+    my $cal=0;
+    for(my $i=0;$i<@len;$i++){
+        $cal+=$len[$i];
+        return $len[$i] if($cal>=$total);
+    }
+}
diff --git a/bin/pav_plot.R b/bin/pav_plot.R
new file mode 100755
index 0000000..e54948c
--- /dev/null
+++ b/bin/pav_plot.R
@@ -0,0 +1,94 @@
+#!/usr/bin/Rscript
+argv <- commandArgs(TRUE)
+
+#PAV simulation result plotting
+#By Yue Zhao and Zhiqiang Hu, 3/30/2016
+
+#packages
+library(ggplot2)
+
+#Function definition
+
+#summarySE
+## Gives count, mean, standard deviation, standard error of the mean, and confidence interval (default 95%).
+##   data: a data frame.
+##   measurevar: the name of a column that contains the variable to be summarized
+##   groupvars: a vector containing names of columns that contain grouping variables
+##   na.rm: a boolean that indicates whether to ignore NA's
+##   conf.interval: the percent range of the confidence interval (default is 95%)
+summarySE <- function(data=NULL, measurevar, groupvars=NULL, na.rm=FALSE,
+                      conf.interval=.95, .drop=TRUE) {
+  library(plyr)
+
+  # New version of length which can handle NA's: if na.rm==T, don't count them
+  length2 <- function (x, na.rm=FALSE) {
+    if (na.rm) sum(!is.na(x))
+    else length(x)
+  }
+
+  # This does the summary.
For each group's data frame, return a vector with + # N, mean, and sd + datac <- ddply(data, groupvars, .drop=.drop, + .fun = function(xx, col) { + c(N = length2(xx[[col]], na.rm=na.rm), + mean = mean (xx[[col]], na.rm=na.rm), + sd = sd (xx[[col]], na.rm=na.rm) + ) + }, + measurevar + ) + + # Rename the "mean" column + datac <- rename(datac, c("mean" = measurevar)) + + datac$se <- datac$sd / sqrt(datac$N) # Calculate standard error of the mean + + # Confidence interval multiplier for standard error + # Calculate t-statistic for confidence interval: + # e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1 + ciMult <- qt(conf.interval/2 + .5, datac$N-1) + datac$ci <- datac$se * ciMult + + return(datac) +} + + + +#readSim: read simulation data +readSim <- function(file = NULL){ + DATA=read.table(file,sep="\t",header=T) + d1=summarySE(data = DATA,measurevar ="Pan",groupvars = "Time") + d1$Ave = d1$Pan + colnames(d1)[3] = "class" + d1$class = "pan" + + d2=summarySE(data = DATA,measurevar ="Core",groupvars = "Time") + d2$Ave = d2$Core + colnames(d2)[3] = "class" + d2$class = "core" + + d = rbind(d1,d2) + + return(d) +} + + + +#start processing data and plotting + +all=readSim(file=argv[1]) +plot1=paste(argv[1],"_PAV_plot.pdf",sep="") + +p<-ggplot(all, aes(x=Time, y=Ave, color = class)) +p=p+ +geom_errorbar(aes(ymin=Ave-se, ymax=Ave+se), width=.05, size=1, color = "black")+ +geom_line(size=1)+geom_point()+ +scale_y_continuous(name="Average number")+ +scale_x_continuous(name="line number") + +ggsave(plot1,p) + diff --git a/bin/plotNovelSeq.R b/bin/plotNovelSeq.R new file mode 100755 index 0000000..293d5a8 --- /dev/null +++ b/bin/plotNovelSeq.R @@ -0,0 +1,23 @@ +#!/usr/bin/Rscript +argv <- commandArgs(TRUE) + + + +#Novel sequence simulation result plotting +#Duan Zhongqu, 2018-10-21 + + +#packages +library(ggplot2) + +data<-read.table(argv[1],sep="\t",header = T) +data_dir<-argv[2] +library(ggplot2) +plot1<-paste(data_dir,"simNovelSeq.pdf",sep="") + +p<-ggplot(data = data,aes(x=Sample.number,y=Sequence.length/1000000)) +p=p+ + geom_point(col="red",size=1.5)+xlab("Number of individuals")+ylab("Total length (Mb)")+theme_bw()+ + theme(axis.text=element_text(size=20,face="bold",colour = "black"),axis.title=element_text(size=20,face="bold",colour = "black"))+ + xlim(0,max(data$Sample.number))+ylim(0,max(data$Sequence.length)/1000000) +ggsave(plot1,p) diff --git a/bin/plotQual.R b/bin/plotQual.R new file mode 100755 index 0000000..f1a986b --- /dev/null +++ b/bin/plotQual.R @@ -0,0 +1,14 @@ +#!/usr/bin/Rscript +argv <- commandArgs(TRUE) +library("ggplot2") +library("reshape2") +d=read.table(argv[1],header=T) +d1=melt(d) +plot1=paste(argv[1],"_heatmap.pdf",sep="") +d1$variable=factor(d1$variable,levels=rev(levels(d1$variable))) +k=ggplot(data=d1)+geom_tile(aes(x=Sample,y=variable,fill=value))+ + scale_x_discrete(name="")+scale_y_discrete(name="")+ + scale_fill_gradient2(name="",low="red",mid="gold",high="darkgreen", + breaks=c(-1,0,1),labels=c("FAIL","WARNING","PASS"))+theme(axis.text.x=element_text(angle=90)) +ggsave(plot1,k) + diff --git a/bin/rmCtm.pl b/bin/rmCtm.pl new file mode 100755 index 0000000..a2fad42 --- /dev/null +++ b/bin/rmCtm.pl @@ -0,0 +1,221 @@ +#!/usr/bin/perl +#Created by Duan Zhongqu, 2018-10-22 +use strict; +use warnings; +use Getopt::Std; +use vars qw($opt_h); +getopts("h"); +my $usage="\nUsage: $0 [options] + +$0 is used to detect and discard the potentail contaminated sequence. + +Necessary input description: + + sequence_file The sequence file. 
+ + blast_file The blast result of sequence file. + + accession_file The accession file. + + output_directory Both final output files and intermediate results + will be found in this directory. To avoid + overwriting of existing files. We kindly request + that the output_directory should not exist. It is + to say, this directory will be created by the + script itself. + + min_length The minimum alignment. + + min_identity The minimun identity. + +Options: + + -h Print this usage page. + +"; + +die $usage if @ARGV!=6; +die $usage if defined($opt_h); +my ($seq_file,$blast_file,$accession_file,$out_dir,$min_length,$min_identity)=@ARGV; + +#adjust directory names and create output directory +$out_dir.="/" unless($out_dir=~/\/$/); +mkdir($out_dir); +my $log_file=$out_dir."RemoveContaminate.log"; + +#Classify the accessions of target sequences according to the orangism information +open(IN,$accession_file)||die("Error9: cannot read the file: $accession_file!\n"); +my %accession_Human; +my %accession_OtherPrimate; +my %accession_Microbiology; +my %accession_Other; +my $count_Human=0; +my $count_OtherPrimate=0; +my $count_Microbiology=0; +my $count_Other=0; +while(my $line=){ + chomp $line; + my @string=split "\t", $line; + if($string[3]=~/genus:Homo/){ + $accession_Human{$string[1]}=$string[2]; + $count_Human+=1; + } + elsif($string[3]=~/order:Primates/){ + $accession_OtherPrimate{$string[1]}=$string[2]; + $count_OtherPrimate+=1; + } + elsif($string[3]=~/superkingdom:Viruses/||$string[3]=~/superkingdom:Bacteria/||$string[3]=~/superkingdom:Archaea/){ + $accession_Microbiology{$string[1]}=$string[2]; + $count_Microbiology+=1; + } + else{ + $accession_Other{$string[1]}=$string[2]; + $count_Other+=1; + } +} +close IN; +my $count_total=$count_Human+$count_OtherPrimate+$count_Microbiology+$count_Other; +open(LOG,">$log_file")||die("Error10: cannot write the file: $log_file.\n"); +print LOG "Accession information:\n"; +print LOG "In total, $count_total accessiones in the '$accession_file' file, of these\n\t$count_Human are belong to human;\n\t$count_OtherPrimate are belong to other primates;\n\t$count_Microbiology are belong to microbiology (including bacteria, virses and archaes);\n\tand $count_Other are belong to other orangism (including non-primate animals and plants).\n\n"; +close LOG; +#Classify the contigs according to the blast result +my %contig_Human; +my %contig_OtherPrimate; +my %contig_Microbiology; +my %contig_Other; +open(IN,$blast_file)||die("Error9: cannot read the file: $blast_file!\n"); +while(my $line=){ + chomp $line; + next if $line=~/^#/; + my @string=split "\t",$line; + if((int($string[2])>=$min_identity)&&($string[3]>=$min_length)){ + if(exists($accession_Human{$string[1]})){ + $contig_Human{$string[0]}=$string[1]; + } + elsif(exists($accession_OtherPrimate{$string[1]})){ + $contig_OtherPrimate{$string[0]}=$string[1]; + } + elsif(exists($accession_Microbiology{$string[1]})){ + $contig_Microbiology{$string[0]}=$string[1]; + } + elsif(exists($accession_Other{$string[1]})){ + $contig_Other{$string[0]}=$string[1]; + } + else{ + print STDERR ("Warning: $string[1] have not in $accession_file\n"); + } + } +} +close IN; + +#Output the classified contigs +open(IN,$seq_file)||die("Error9: cannot read the file: $seq_file!\n"); +my $seq_orangism_file=$out_dir."sequence.orangism.txt"; +my $Human_seq_file=$out_dir."human.fa"; +my $OtherPrimate_seq_file=$out_dir."other_primate.fa"; +my $Microbiology_seq_file=$out_dir."microbiology.fa"; +my $Other_seq_file=$out_dir."other.fa"; +my 
$NotInNt_seq_file=$out_dir."not_in_nt.fa"; +my $Novel_seq_file=$out_dir."novel_sequence.fa"; +open(OUT,">$seq_orangism_file")||die("Error10: cannot write the file: $seq_orangism_file.\n"); +open(OUT1,">$Human_seq_file")||die("Error10: cannot write the file: $Human_seq_file.\n"); +open(OUT2,">$OtherPrimate_seq_file")||die("Error10: cannot write the file: $OtherPrimate_seq_file.\n"); +open(OUT3,">$Microbiology_seq_file")||die("Error10: cannot write the file: $Microbiology_seq_file.\n"); +open(OUT4,">$Other_seq_file")||die("Error10: cannot write the file: $Other_seq_file.\n"); +open(OUT5,">$NotInNt_seq_file")||die("Error10: cannot write the file: $NotInNt_seq_file.\n"); +open(NOVEL,">$Novel_seq_file")||die("Error10: cannot write the file: $Novel_seq_file.\n"); +my $flag=0; +my $count_contig_Human=0; +my $count_contig_OtherPrimate=0; +my $count_contig_Microbiology=0; +my $count_contig_Other=0; +my $count_contig_NotInNt=0; +my $length_contig_Human=0; +my $length_contig_OtherPrimate=0; +my $length_contig_Microbiology=0; +my $length_contig_Other=0; +my $length_contig_NotInNt=0; +while(my $line=){ + chomp $line; + if($line=~/^>/){ + my @string=split /\s+/,$line; + my $name=substr($string[0],1,length($string[0])-1); + if(exists($contig_Human{$name})){ + print OUT $name."\t".$contig_Human{$name}."\t"."human\n"; + print OUT1 $line."\n"; + print NOVEL $line."\n"; + $count_contig_Human+=1; + $flag=1; + } + elsif(exists($contig_OtherPrimate{$name})){ + print OUT $name."\t".$contig_OtherPrimate{$name}."\t"."other primate\n"; + print OUT2 $line."\n"; + print NOVEL $line."\n"; + $count_contig_OtherPrimate+=1; + $flag=2; + } + elsif(exists($contig_Microbiology{$name})){ + print OUT $name."\t".$contig_Microbiology{$name}."\t"."microbiology\n"; + print OUT3 $line."\n"; + $count_contig_Microbiology+=1; + $flag=3; + } + elsif(exists($contig_Other{$name})){ + print OUT $name."\t".$contig_Other{$name}."\t"."other\n"; + print OUT4 $line."\n"; + $count_contig_Other+=1; + $flag=4; + } + else{ + print OUT $name."\t--\t"."not in nt database\n"; + print OUT5 $line."\n"; + print NOVEL $line."\n"; + $count_contig_NotInNt+=1; + $flag=5; + } + } + else{ + if($flag==1){ + print OUT1 $line."\n"; + print NOVEL $line."\n"; + $length_contig_Human+=length($line); + } + elsif($flag==2){ + print OUT2 $line."\n"; + print NOVEL $line."\n"; + $length_contig_OtherPrimate+=length($line); + } + elsif($flag==3){ + print OUT3 $line."\n"; + $length_contig_Microbiology+=length($line); + } + elsif($flag==4){ + print OUT4 $line."\n"; + $length_contig_Other+=length($line); + } + elsif($flag==5){ + print OUT5 $line."\n"; + print NOVEL $line."\n"; + $length_contig_NotInNt+=length($line); + } + } +} +close OUT1; +close OUT2; +close OUT3; +close OUT4; +close OUT5; +close NOVEL; +my $count_contig_total=$count_contig_Human+$count_contig_OtherPrimate+$count_contig_Microbiology+$count_contig_Other+$count_contig_NotInNt; +my $length_contig_total=$length_contig_Human+$length_contig_OtherPrimate+$length_contig_Microbiology+$length_contig_Other+$length_contig_NotInNt; +open(LOG,">>$log_file")||die("Error10: cannot write the file: $log_file.\n"); +print LOG "Result summary:\n"; +print LOG "Type\tNumber\tLength\n"; +print LOG "Human\t$count_contig_Human\t$length_contig_Human\n"; +print LOG "Other Primate\t$count_contig_OtherPrimate\t$length_contig_OtherPrimate\n"; +print LOG "Microbiology\t$count_contig_Microbiology\t$length_contig_Microbiology\n"; +print LOG "Other\t$count_contig_Other\t$length_contig_Other\n"; +print LOG "Not in nt 
database\t$count_contig_NotInNt\t$length_contig_NotInNt\n"; +print LOG "Total\t$count_contig_total\t$length_contig_total\n"; +close LOG; diff --git a/lib/HUPANgeneCovSLURM.pm b/lib/HUPANgeneCovSLURM.pm index 0521712..0f2d973 100644 --- a/lib/HUPANgeneCovSLURM.pm +++ b/lib/HUPANgeneCovSLURM.pm @@ -120,7 +120,7 @@ foreach my $s (@sample){ open(JOB,">$job_file")||die("Error05: Unable to create job file: $job_file\n"); print JOB "\#!/bin/bash\n"; print JOB "\#SBATCH --job-name=$s","_genesta\n"; #job name - print JOB "\#SBATCH -p $opt_q\n" if defined $opt_q; #queue name in the submission system + # print JOB "\#SBATCH -p $opt_q\n" if defined $opt_q; #queue name in the submission system print JOB "\#SBATCH --output=$out_file\n"; #stdout print JOB "\#SBATCH --error=$err_file\n"; #stderr print JOB "\#SBATCH -n $thread_num\n"; #thread number diff --git a/lib/HUPANmapSLURM.pm b/lib/HUPANmapSLURM.pm index a27d610..b5a36c9 100644 --- a/lib/HUPANmapSLURM.pm +++ b/lib/HUPANmapSLURM.pm @@ -207,7 +207,7 @@ foreach my $s (@sample){ # print JOB "\#SBATCH -p fat\n"; #stderr print JOB "\#SBATCH -n $thread_num\n"; #thread number print JOB "\#SBATCH --ntasks-per-node=$thread_num\n"; - print JOB "\#SBATCH --time=24:00:00\n"; + print JOB "\#SBATCH --time=36:00:00\n"; print JOB "$com\n"; #commands close JOB; system("sbatch $job_file"); #submit job diff --git a/lib/HUPANpTpGSLURM.pm b/lib/HUPANpTpGSLURM.pm index 1ca1eb3..13de76c 100644 --- a/lib/HUPANpTpGSLURM.pm +++ b/lib/HUPANpTpGSLURM.pm @@ -1,4 +1,7 @@ #!/usr/bin/perl + +# This is the script for MAKER produced annotations! + package pTpG; sub pTpG{ @@ -50,12 +53,16 @@ while(my $line=){ if($format eq "gff"){ my @string=split /[;|=]/,$record; if($type eq "gene"){ - if($string[5] eq "protein_coding"){ - $pre_gid=$string[3]; + $pre_gid=$string[1]; + $pre_tid=$string[1]; + print OUT "$chr\t$source\t$type\t$start\t$end\t$score\t$sym\t$phase\tgene_id \"$pre_gid\"; transcript_id \"$pre_tid\";\n"; + }elsif($type eq "mRNA"){ + if($string[3] eq $pre_gid){ + $pre_gid=$string[1]; } }else{ - if($string[5] eq $pre_gid){ - $pre_tid=$string[7]; + if($string[3] eq $pre_gid){ + $pre_tid=$string[3]; print OUT "$chr\t$source\t$type\t$start\t$end\t$score\t$sym\t$phase\tgene_id \"$pre_gid\"; transcript_id \"$pre_tid\";\n"; } } diff --git a/lib/HUPANrmHighSLURM.pm b/lib/HUPANrmHighSLURM.pm index 612332f..4075e00 100644 --- a/lib/HUPANrmHighSLURM.pm +++ b/lib/HUPANrmHighSLURM.pm @@ -152,6 +152,7 @@ foreach my $s (@sample){ print JOB "\#SBATCH -n $thread_num\n"; #thread number print JOB "\#SBATCH --ntasks-per-node=$thread_num\n\n"; #use the same node print JOB "\#SBATCH --time=96:00:00\n"; # set time + print JOB "\#SBATCH --mem=110GB\n"; print JOB "$com\n"; #commands close JOB; system("sbatch $job_file"); #submit job diff --git a/lib/HUPANsimSeqSLURM.pm b/lib/HUPANsimSeqSLURM.pm index 74d83d2..f1a8d32 100644 --- a/lib/HUPANsimSeqSLURM.pm +++ b/lib/HUPANsimSeqSLURM.pm @@ -131,7 +131,7 @@ Options: my $j=int(rand(scalar(@samples))); my $sample=$samples[$j]; splice @samples,$j,1; - my $individual_file=$data_dir.$sample."/".$sample.".fa"; + my $individual_file=$data_dir.$sample."/".$sample.".fully.contig"; my $num=$i+1; my $contig_file=$simulation_dir.$num."_raw.fa"; my $com="cp $individual_file $contig_file"; @@ -142,11 +142,11 @@ Options: my $com="\ncp $individual_file $in_file"; my $outfile=$simulation_dir."1_non.redundant.fa"; $com.="\n$exe -i $in_file -o $outfile -T $thread_num -c $iden"; - $com.="\nfor((\$i=2;\$i<=".@sample.";\$i++));\ndo\nk=\$((\$i-1));"; - 
$individual_file=$simulation_dir."\$i_raw.fa"; - my $novel_file=$simulation_dir."\$k_non.redundant.fa"; + $com.="\nfor((i=2;i<=".@sample.";i++));\ndo\nk=\$((\$i-1));"; + $individual_file=$simulation_dir."\${i}_raw.fa"; + my $novel_file=$simulation_dir."\${k}_non.redundant.fa"; $com.="\ncat $individual_file $novel_file >$in_file"; - $outfile=$simulation_dir."\$i_non.redundant.fa"; + $outfile=$simulation_dir."\${i}_non.redundant.fa"; $com.="\n$exe -i $in_file -o $outfile -T $thread_num -c $iden"; $com.="\ndone;"; #print $com."\n"; @@ -164,6 +164,8 @@ Options: print JOB "\#SBATCH --error=$err_file\n"; #stderr print JOB "\#SBATCH -n $thread_num\n"; #thread number print JOB "\#SBATCH --ntasks-per-node=$thread_num\n"; + print JOB "\#SBATCH --time=330:00:00\n"; + print JOB "\#SBATCH --mem=10GB\n"; print JOB "$com\n"; #commands close JOB; system("sbatch $job_file"); #submit job diff --git a/lib/HUPANunalnCtgSLURM.pm b/lib/HUPANunalnCtgSLURM.pm index 303912e..1a384e4 100644 --- a/lib/HUPANunalnCtgSLURM.pm +++ b/lib/HUPANunalnCtgSLURM.pm @@ -140,7 +140,8 @@ foreach my $s (@sample){ $unalign_info_file=$quad.$unalign_info_file; #obtain the filtered coords file within quast dir - $quad.="nucmer_output/"; + $quad.="minimap_output/"; + #$quad.="nucmer_output/"; my $coords_file=""; opendir(ASS,$quad) || die("Error04: cannot open data directory: $quad\n"); my @nucmer_files=readdir(ASS); diff --git a/lib/originalHUPANpTpGSLURM.pm b/lib/originalHUPANpTpGSLURM.pm new file mode 100644 index 0000000..1ca1eb3 --- /dev/null +++ b/lib/originalHUPANpTpGSLURM.pm @@ -0,0 +1,232 @@ +#!/usr/bin/perl +package pTpG; + +sub pTpG{ +use strict; +use warnings; +use Getopt::Std; +use vars qw($opt_h $opt_e $opt_f); +getopts("hef"); +my $usage="\nUsage: hupan pTpG [options] inputfile outputfile + +hupan pTpG is used to obtain the longest transcript of each protein-coding genes from annotation file. +Notice: a file named \"protein_coding.gtf\" will be automatically prodcued to store all the protein-coding genes. + +Necessary input description: + + inputfile Annotation file. Default: \"gff\" format. + If the annotation file is \"gtf\", please use the option \"-f\". + + outputfile Outputfile. + +Options: + + -h Print this usage page. + + -e Check \"exon\" length instead of check \"CDS\" length. + + -f The annotation file is \"gtf\" format. 
+"; + +#print @ARGV; +die $usage if @ARGV!=2; +die $usage if defined($opt_h); +my ($gff_file,$out_file)=@ARGV; + +my $format="gff"; +$format="gtf" if defined($opt_f); + +open(IN,$gff_file)||die("Error: cannot read the file: $gff_file.\n"); +my $temp_file="protein_coding.gtf"; +open(OUT,">$temp_file")||die("Error: cannot write the file: $temp_file.\n"); +my $pre_gid=""; +my $pre_tid=""; +my %genes; + +while(my $line=){ + chomp $line; + next if $line=~/^#/; + my ($chr,$source,$type,$start,$end,$score,$sym,$phase,$record)=split /\t/,$line; + if($format eq "gff"){ + my @string=split /[;|=]/,$record; + if($type eq "gene"){ + if($string[5] eq "protein_coding"){ + $pre_gid=$string[3]; + } + }else{ + if($string[5] eq $pre_gid){ + $pre_tid=$string[7]; + print OUT "$chr\t$source\t$type\t$start\t$end\t$score\t$sym\t$phase\tgene_id \"$pre_gid\"; transcript_id \"$pre_tid\";\n"; + } + } + }else{ + if($type eq "gene"){ + $record=~/(gene_id \"[^\"]+\"); (gene_type \"[^\"]+\")/; + if($2 eq "gene_type \"protein_coding\""){ + $pre_gid=$1; + } + }else{ + $record=~/(gene_id \"[^\"]+\"); (transcript_id \"[^\"]+\")/; + if($1 eq $pre_gid ){ + $pre_tid=$2; + print OUT "$chr\t$source\t$type\t$start\t$end\t$score\t$sym\t$phase\t$pre_gid; $pre_tid;\n"; + } + } + } +} +close IN; +close OUT; + +my ($Vec, $num) = @{Read_GTF($temp_file)}; + +#open(OUT,">$out"); +open(OUT,">$out_file")||die("Error: cannot write the file: $out_file.\n"); +foreach my $d (@{$Vec}){ + my $target=$d->[0]; + my $len=getLen($d->[0]); + for(my $i=1;$i<@{$d};$i++){ + my $nlen=getLen($d->[$i]); + if($nlen>$len){ + $len=$nlen; + $target=$d->[$i]; + } + } + if($len!=0){ + OutputTran($target); + } +} + +exit; + + +########## sub routines ########### +sub getLen{ + my ($d)=@_; + my $len=0; + foreach my $k (@{$d}){ + if(defined $opt_e){ + if($k->{type} eq "exon"){ + $len+=$k->{end}-$k->{start}+1; + } + } + else{ + if($k->{type} eq "CDS" ||$k->{type} eq "stop_codon"){ + $len+=$k->{end}-$k->{start}+1; + } + } + } + return $len; +} + +sub OutputTran{ #\@tvec,$name ,output file + my ($ad) = @_; + my $d1; + for($d1=0;$d1<@{$ad};$d1++){ + print OUT "${${$ad}[$d1]}{chr}\t"; + print OUT "${${$ad}[$d1]}{source}\t"; + print OUT "${${$ad}[$d1]}{type}\t"; + print OUT "${${$ad}[$d1]}{start}\t"; + print OUT "${${$ad}[$d1]}{end}\t"; + print OUT "${${$ad}[$d1]}{score}\t"; + print OUT "${${$ad}[$d1]}{sym}\t"; + print OUT "${${$ad}[$d1]}{phase}\t"; + print OUT "${${$ad}[$d1]}{gid}; "; + print OUT "${${$ad}[$d1]}{tid};\n"; + } +} + +sub Read_GTF{ + my ($file) = @_; + my @gvec; + my @re; + my $begin = 1; + my $pre_tid; + my $pre_gid; + my $i = 0; + my $j = 0; + my $k = 0; + my @temp; + open(IN,$file)||die "open $file error:$!\n"; + while(){ + chomp; + if(/^[0-9a-zA-Z]+/){ + if($begin == 1){ + @temp = split("\t",$_); + $temp[8]=~/(gene_id \"[^\"]+\"); (transcript_id \"[^\"]+\")/; + $pre_tid = $2; + $pre_gid = $1; + + $gvec[$i] = init_gene(); + ${$gvec[$i]}[$j] = init_tran(); + + ${${$gvec[$i]}[$j]}[$k] = + new_sf($temp[0],$temp[1],$temp[2],$temp[3],$temp[4],$temp[5],$temp[6],$temp[7],$1,$2); + $begin = 0; + } + else{ + @temp = split("\t",$_); + $temp[8] =~ /(gene_id \"[^\"]+\"); (transcript_id \"[^\"]+\")/; + + if($1 eq $pre_gid){ + if($2 eq $pre_tid){ + $k++; + ${${$gvec[$i]}[$j]}[$k] = + new_sf($temp[0],$temp[1],$temp[2],$temp[3],$temp[4],$temp[5],$temp[6],$temp[7],$1,$2); + } + else{ + $j++; + ${$gvec[$i]}[$j] = init_tran(); + $k = 0; + ${${$gvec[$i]}[$j]}[$k] = + new_sf($temp[0],$temp[1],$temp[2],$temp[3],$temp[4],$temp[5],$temp[6],$temp[7],$1,$2); + $pre_tid = $2; + } 
+ } + else{ + $i++; + $gvec[$i] = init_gene(); + $j = 0; + ${$gvec[$i]}[$j] = init_tran(); + $k = 0; + ${${$gvec[$i]}[$j]}[$k] = + new_sf($temp[0],$temp[1],$temp[2],$temp[3],$temp[4],$temp[5],$temp[6],$temp[7],$1,$2); + $pre_gid =$1; + $pre_tid = $2; + } + } + } + } + close IN; + $re[0] = \@gvec; + $re[1] = scalar(@gvec); + return \@re; +} + +sub new_sf{ + my ($chr,$src,$type,$start,$end,$score,$sym,$phase,$gid,$tid)=@_; + my %hash = ( + "chr" => $chr, + "source" => $src, + "type" => $type, + "score" => $score, + "gid" => $gid, + "tid" => $tid, + "start" => $start, + "end" => $end, + "sym" => $sym, + "phase" => $phase, + ); + return \%hash; +} + +sub init_gene{ + my @a; + return \@a; +} +sub init_tran{ + my @a; + return \@a; +} +1; +} +1; diff --git a/src/ccov/CCOV.cpp b/src/ccov/CCOV.cpp index d615849..e8e7441 100644 --- a/src/ccov/CCOV.cpp +++ b/src/ccov/CCOV.cpp @@ -138,9 +138,13 @@ int main(int argc, char* argv[]){ //Check coverage cerr <<"Calculate coverage and evenness" <<" ......"< > &mapper, GTF >f){ + int mycount=0; vector::iterator giter=gtf.GeneStructureSequence.begin(); while(giter!=gtf.GeneStructureSequence.end()){ + mycount++; + cerr << "While loop number: " << mycount <