From 57a0e5bad30f282c1ec909f66819ac957f5d005e Mon Sep 17 00:00:00 2001 From: Simon Ye Date: Thu, 16 Apr 2015 14:00:28 -0400 Subject: [PATCH 01/18] Fix perms on txt files. --- License.txt | 0 Readme.txt | 0 2 files changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 License.txt mode change 100755 => 100644 Readme.txt diff --git a/License.txt b/License.txt old mode 100755 new mode 100644 diff --git a/Readme.txt b/Readme.txt old mode 100755 new mode 100644 From b831cacc7ca9d9c8d65c3daf0392098db44fdc64 Mon Sep 17 00:00:00 2001 From: Simon Ye Date: Thu, 16 Apr 2015 14:01:32 -0400 Subject: [PATCH 02/18] Add -d option in SURPI.sh for changing full database path. --- SURPI.sh | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/SURPI.sh b/SURPI.sh index 293e7f5..3c4264a 100755 --- a/SURPI.sh +++ b/SURPI.sh @@ -13,7 +13,7 @@ # SURPI_version="1.0.22" -optspec=":f:hvz:" +optspec=":f:d:hvz:" bold=$(tput bold) normal=$(tput sgr0) green='\e[0;32m' @@ -26,6 +26,7 @@ scriptname=${0##*/} while getopts "$optspec" option; do case "${option}" in f) config_file=${OPTARG};; # get parameters from config file if specified + d) ref_path=${OPTARG};; # Path to reference db directory. h) HELP=1;; v) VERIFICATION=1;; z) create_config_file=${OPTARG} @@ -246,32 +247,32 @@ eBLASTn="1e-15" # SURPI will subtract all SNAP databases found in this directory from the input sequence # Useful if you want to subtract multiple genomes (without combining SNAP databases) # or, if you need to split a db if it is larger than available RAM. 
-SNAP_subtraction_folder="/reference/hg19" +SNAP_subtraction_db="$ref_path/hg19" # directory for SNAP-indexed databases of NCBI NT (for mapping phase in comprehensive mode) # directory must ONLY contain snap indexed databases -SNAP_COMPREHENSIVE_db_dir="/reference/COMP_SNAP" +SNAP_COMPREHENSIVE_db_dir="$ref_path/COMP_SNAP" # directory for SNAP-indexed databases for mapping phase in FAST mode # directory must ONLY contain snap indexed databases -SNAP_FAST_db_dir="/reference/FAST_SNAP" +SNAP_FAST_db_dir="$ref_path/FAST_SNAP" #Taxonomy Reference data directory #This folder should contain the 3 SQLite files created by the script "create_taxonomy_db.sh" #gi_taxid_nucl.db - nucleotide db of gi/taxonid #gi_taxid_prot.db - protein db of gi/taxonid #names_nodes_scientific.db - db of taxonid/taxonomy -taxonomy_db_directory="/reference/taxonomy" +taxonomy_db_directory="$ref_path/taxonomy" #RAPSearch viral database name: indexed protein dataset (all of Viruses) -#make sure that directory also includes the .info file -RAPSearch_VIRUS_db="/reference/RAPSearch/rapsearch_viral_db" +#make sure that directory also includes the .info file +RAPSearch_VIRUS_db="$ref_path/RAPSearch/rapsearch_viral_db" #RAPSearch nr database name: indexed protein dataset (all of NR) -#make sure that directory also includes the .info file -RAPSearch_NR_db="/reference/RAPSearch/rapsearch_nr_db" +#make sure that directory also includes the .info file +RAPSearch_NR_db="$ref_path/RAPSearch/rapsearch_nr_db" -ribo_snap_bac_euk_directory="/reference/RiboClean_SNAP" +ribo_snap_bac_euk_directory="$ref_path/RiboClean_SNAP" ########################## # Server related values @@ -296,7 +297,7 @@ cache_reset="0" ########################## # AWS_master_slave will start up a slave instance on AWS for each division of the nt database -# It will be more costly, but should run significantly faster than the solo method, which +# It will be more costly, but should run significantly faster than the solo method, which # runs 
each NT division through SNAP serially on a single machine. # If using the "AWS_master_slave" option, be sure that all parameters in the AWS section below are # set properly. @@ -306,7 +307,7 @@ cache_reset="0" #Which method to use for SNAP to nt [AWS_master_slave/solo] # AWS_master_slave will start up a slave instance on AWS for each division of the nt database -# It will be more costly, but should run significantly faster than the solo method, which +# It will be more costly, but should run significantly faster than the solo method, which # runs each NT division through SNAP serially on a single machine. # If using the "AWS_master_slave" option, be sure that all parameters in the AWS section below are # set properly. @@ -318,7 +319,7 @@ snap_nt_procedure="solo" ami_id="ami-5ef61936" #Number of slave instances will not exceed this value. Useful for testing, in order to restrict instance count. -#Otherwise, number of instances should be equal to number of SNAP-NT database divisions. This value is +#Otherwise, number of instances should be equal to number of SNAP-NT database divisions. This value is #automatically calculated by SURPI. 
max_slave_instances=29 @@ -959,7 +960,7 @@ if [ $run_mode = "Comprehensive" ] then echo -e "$(date)\t$scriptname\t######### Running ABYSS and Minimus #########" START_deNovo=$(date +%s) - echo -e "$(date)\t$scriptname\tAdding matched viruses to NT unmatched" + echo -e "$(date)\t$scriptname\tAdding matched viruses to NT unmatched" sed "n;n;n;d" "$basef.NT.snap.matched.fl.Viruses.fastq" | sed "n;n;d" | sed "s/^@/>/g" | sed 's/>/>Vir/g' > "$basef.NT.snap.matched.fl.Viruses.fasta" gt sequniq -seqit -force -o "$basef.NT.snap.matched.fl.Viruses.uniq.fasta" "$basef.NT.snap.matched.fl.Viruses.fasta" cat "$basef.NT.snap.unmatched.uniq.fl.fasta" "$basef.NT.snap.matched.fl.Viruses.uniq.fasta" > "$basef.NT.snap.unmatched_addVir_uniq.fasta" @@ -1062,7 +1063,7 @@ then awk '{print$1}' $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.annotated > $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.annotated.header awk '{print$1}' $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.annotated > $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.annotated.header - # find headers in viral rapsearch that are no longer found in rapsearch to nr + # find headers in viral rapsearch that are no longer found in rapsearch to nr sort $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.annotated.header \ $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.annotated.header | \ uniq -d | \ @@ -1125,7 +1126,7 @@ then cp $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.addseq.all.annotated $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.annotated echo -e "$(date)\t$scriptname\tretrieved taxonomy" grep "Viruses" $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.annotated > $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.Viruses.annotated - egrep "^contig" 
$basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.annotated > $basef.Contigs.$rapsearch_database.RAPSearch.e${ecutoff_NR}.annotated + egrep "^contig" $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.annotated > $basef.Contigs.$rapsearch_database.RAPSearch.e${ecutoff_NR}.annotated echo -e "$(date)\t$scriptname\textracted RAPSearch taxonomy" echo -e "$(date)\t$scriptname\tStarting Readcount table" echo -e "$(date)\t$scriptname\tParameters: table_generator.sh $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.Viruses.annotated RAP Y Y Y Y" From b56f0b8e9573ea0961cc5fc7e0eec7719b6e0c2a Mon Sep 17 00:00:00 2001 From: Simon Ye Date: Thu, 16 Apr 2015 14:05:14 -0400 Subject: [PATCH 03/18] Remove date naming on files in create_SURPI_data. Fix bugs with insufficient locationSize and depreacted 0factor in SNAP. --- create_SURPI_data.sh | 44 ++++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/create_SURPI_data.sh b/create_SURPI_data.sh index 8e13c42..9a4649d 100755 --- a/create_SURPI_data.sh +++ b/create_SURPI_data.sh @@ -23,8 +23,9 @@ red='\e[0;31m' endColor='\e[0m' DATE=$(date +%m%d%Y) -db_dir="NCBI_$DATE" -curated_dir="curated_$DATE" +echo $DATE > DATE.txt +db_dir="NCBI" +curated_dir="curated" cleanup_dir="rawdata" @@ -53,9 +54,6 @@ if [ ! -d "$reference_dir" ]; then mkdir "$reference_dir" fi -#set SNAP index Ofactor. See SNAP documentation for details. -Ofactor=1000 - #download NCBI data to $db_dir, curated data to $curated_dir echo -e "$(date)\t$scriptname\tdownload_SURPI_data.sh -d $db_dir -c $curated_dir" download_SURPI_data.sh -d "$db_dir" -c "$curated_dir" @@ -68,12 +66,10 @@ if [ ! 
-d "$reference_dir/$taxonomy_dir" ]; then fi create_taxonomy_db.sh -d "$db_dir" -mv gi_taxid_nucl.db "$reference_dir/$taxonomy_dir/gi_taxid_nucl_$DATE.db" -mv gi_taxid_prot.db "$reference_dir/$taxonomy_dir/gi_taxid_prot_$DATE.db" -mv names_nodes_scientific.db "$reference_dir/$taxonomy_dir/names_nodes_scientific_$DATE.db" -ln -s "gi_taxid_nucl_$DATE.db" "$reference_dir/$taxonomy_dir/gi_taxid_nucl.db" -ln -s "gi_taxid_prot_$DATE.db" "$reference_dir/$taxonomy_dir/gi_taxid_prot.db" -ln -s "names_nodes_scientific_$DATE.db" "$reference_dir/$taxonomy_dir/names_nodes_scientific.db" +mv gi_taxid_nucl.db "$reference_dir/$taxonomy_dir/gi_taxid_nucl.db" +mv gi_taxid_prot.db "$reference_dir/$taxonomy_dir/gi_taxid_prot.db" +mv names_nodes_scientific.db "$reference_dir/$taxonomy_dir/names_nodes_scientific.db" + # ##create RAPSearch nr db and move into $RAPSearch_dir # @@ -85,10 +81,10 @@ echo -e "$(date)\t$scriptname\tDecompressing nr..." pigz -dc -k "$db_dir/nr.gz" > nr echo -e "$(date)\t$scriptname\tStarting prerapsearch on nr..." -prerapsearch -d nr -n "rapsearch_nr_${DATE}_db_v2.12" +prerapsearch -d nr -n "rapsearch_nr_db_v2.12" echo -e "$(date)\t$scriptname\tCompleted prerapsearch on nr." -mv rapsearch_nr_${DATE}_db_v2.12 "$reference_dir/$RAPSearch_dir" -mv rapsearch_nr_${DATE}_db_v2.12.info "$reference_dir/$RAPSearch_dir" +mv rapsearch_nr_db_v2.12 "$reference_dir/$RAPSearch_dir" +mv rapsearch_nr_db_v2.12.info "$reference_dir/$RAPSearch_dir" # ## Index curated data @@ -111,13 +107,13 @@ pigz -dc -k "$curated_dir/rdp_typed_iso_goodq_9210seqs.fa.gz" > rdp_typed_iso_go echo -e "$(date)\t$scriptname\tIndexing curated data..." echo -e "$(date)\t$scriptname\tSNAP indexing hg19_rRNA_mito_Hsapiens_rna.fa..." 
-snap index hg19_rRNA_mito_Hsapiens_rna.fa snap_index_hg19_rRNA_mito_Hsapiens_rna -hg19 -O$Ofactor +snap index hg19_rRNA_mito_Hsapiens_rna.fa snap_index_hg19_rRNA_mito_Hsapiens_rna -hg19 -locationSize 5 echo -e "$(date)\t$scriptname\tSNAP indexing Bacterial_Refseq_05172012.CLEAN.LenFiltered.uniq.fa..." -snap index Bacterial_Refseq_05172012.CLEAN.LenFiltered.uniq.fa snap_index_Bacterial_Refseq_05172012.CLEAN.LenFiltered.uniq_s16 -s 16 -O$Ofactor +snap index Bacterial_Refseq_05172012.CLEAN.LenFiltered.uniq.fa snap_index_Bacterial_Refseq_05172012.CLEAN.LenFiltered.uniq_s16 -s 16 -locationSize 5 echo -e "$(date)\t$scriptname\tStarting prerapsearch on rapsearch_viral_aa_130628_db_v2.12.fasta..." prerapsearch -d rapsearch_viral_aa_130628_db_v2.12.fasta -n "rapsearch_viral_aa_130628_db_v2.12" echo -e "$(date)\t$scriptname\tSNAP indexing viruses-5-2012_trimmedgi-MOD_addedgi.fa..." -snap index viruses-5-2012_trimmedgi-MOD_addedgi.fa snap_index_viruses-5-2012_trimmedgi-MOD_addedgi -O$Ofactor +snap index viruses-5-2012_trimmedgi-MOD_addedgi.fa snap_index_viruses-5-2012_trimmedgi-MOD_addedgi -locationSize 5 #RiboClean additions if [ ! -d "$reference_dir/$RiboClean_dir" ]; then @@ -125,13 +121,13 @@ if [ ! -d "$reference_dir/$RiboClean_dir" ]; then fi echo -e "$(date)\t$scriptname\tSNAP indexing 18s_rRNA_gene_not_partial.fa..." -snap index 18s_rRNA_gene_not_partial.fa snap_index_18s_rRNA_gene_not_partial.fa -O$Ofactor +snap index 18s_rRNA_gene_not_partial.fa snap_index_18s_rRNA_gene_not_partial.fa -locationSize 5 echo -e "$(date)\t$scriptname\tSNAP indexing viruses-5-2012_trimmedgi-MOD_addedgi.fa..." -snap index 23s.fa snap_index_23sRNA -O$Ofactor +snap index 23s.fa snap_index_23sRNA -locationSize 5 echo -e "$(date)\t$scriptname\tSNAP indexing rdp_typed_iso_goodq_9210seqs.fa..." 
-snap index 28s_rRNA_gene_NOT_partial_18s_spacer_5.8s.fa snap_index_28s_rRNA_gene_NOT_partial_18s_spacer_5.8s.fa -O$Ofactor +snap index 28s_rRNA_gene_NOT_partial_18s_spacer_5.8s.fa snap_index_28s_rRNA_gene_NOT_partial_18s_spacer_5.8s.fa -locationSize 5 echo -e "$(date)\t$scriptname\tSNAP indexing viruses-5-2012_trimmedgi-MOD_addedgi.fa..." -snap index rdp_typed_iso_goodq_9210seqs.fa snap_index_rdp_typed_iso_goodq_9210seqs -O$Ofactor +snap index rdp_typed_iso_goodq_9210seqs.fa snap_index_rdp_typed_iso_goodq_9210seqs -locationSize 5 if [ ! -d "$reference_dir/$FAST_dir" ]; then mkdir "$reference_dir/$FAST_dir" @@ -160,14 +156,14 @@ else fi echo -e "$(date)\t$scriptname\tStarting indexing of SNAP-nt..." echo -e "$(date)\t$scriptname\tcreate_snap_to_nt.sh -n $SNAP_nt_chunks -f nt" -create_snap_to_nt.sh -n "$SNAP_nt_chunks" -f "nt" -p "$DATE" +create_snap_to_nt.sh -n "$SNAP_nt_chunks" -f "nt" -p "prefix" echo -e "$(date)\t$scriptname\tCompleted indexing of SNAP-nt." if [ ! -d "$reference_dir/$SNAP_nt_dir" ]; then mkdir "$reference_dir/$SNAP_nt_dir" fi -mv "snap_index_${DATE}.nt."* "$reference_dir/$SNAP_nt_dir" +mv "snap_index_prefix.nt."* "$reference_dir/$SNAP_nt_dir" # ## Cleanup @@ -181,6 +177,6 @@ mv *.fa "$cleanup_dir" mv *.fasta "$cleanup_dir" mv nr "$cleanup_dir" mv nt "$cleanup_dir" -mv "$DATE"* "$cleanup_dir" +mv "prefix"* "$cleanup_dir" echo -e "$(date)\t$scriptname\t${green}Completed creation of SURPI reference data.${endColor}" From e9268e3fd5377afba10ab9646516cf05b4ce4375 Mon Sep 17 00:00:00 2001 From: Simon Ye Date: Thu, 16 Apr 2015 14:07:27 -0400 Subject: [PATCH 04/18] Actually make -p option in create_snap_to_nt work. 
--- create_snap_to_nt.sh | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/create_snap_to_nt.sh b/create_snap_to_nt.sh index edb297e..3ea42a1 100755 --- a/create_snap_to_nt.sh +++ b/create_snap_to_nt.sh @@ -15,7 +15,7 @@ # Copyright (C) 2014 Scot Federman - All Rights Reserved # SURPI has been released under a modified BSD license. # Please see license file for details. -# Last revised 7/2/2014 +# Last revised 7/2/2014 scriptname=${0##*/} bold=$(tput bold) @@ -25,10 +25,7 @@ red='\e[0;31m' endColor='\e[0m' prefix=$(date +%m%d%Y) -#set SNAP index Ofactor. See SNAP documentation for details -Ofactor=1000 - -while getopts ":n:hs:f:" option; do +while getopts ":n:hp:s:f:" option; do case "${option}" in f) fasta_file_to_index=${OPTARG};; n) num_chunks=${OPTARG};; @@ -44,20 +41,20 @@ done if [[ ${HELP} -eq 1 || $# -lt 1 ]] then cat < Date: Thu, 16 Apr 2015 14:08:47 -0400 Subject: [PATCH 05/18] Remove --parallel from sort in taxonomy lookup. --- taxonomy_lookup.pl | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/taxonomy_lookup.pl b/taxonomy_lookup.pl index 79d0eac..d8e5ba2 100755 --- a/taxonomy_lookup.pl +++ b/taxonomy_lookup.pl @@ -80,12 +80,11 @@ # sort/uniq gi file my $startsort = [gettimeofday()]; -system ("sort --parallel=$cores -u $basef_inputfile.gi > $basef_inputfile.gi.uniq"); - +system ("sort -u $basef_inputfile.gi > $basef_inputfile.gi.uniq"); my $sorttime = tv_interval($startsort); print localtime()."\t$0\tTime to sort -u gi file: $sorttime seconds\n"; -# Parallelization can occur at this point in the code. Since the file in now sorted, it can be split into n chunks +# Parallelization can occur at this point in the code. Since the file in now sorted, it can be split into n chunks # with no overlap. 
my %taxonomy; @@ -110,19 +109,19 @@ chomp; my ($family, $genus, $species, $lineage) = ("") x 4; my $result = taxonomy_fgsl($_, $seq_type); - + if ($result =~ /family--(.*?)\t/) { $family = $1; } - + if ($result =~ /genus--(.*?)\t/) { $genus = $1; } - + if ($result =~ /species--(.*?)\t/) { $species = $1; } - + if ($result =~ /lineage--(.*)$/) { $lineage = $1; } @@ -228,7 +227,7 @@ sub taxonomy_fgsl { $sth->execute(($taxid)); if (@noderow = $sth->fetchrow_array) { (my $tid, my $parent, my $rank) = @noderow; - + if ($namerow = $name_return->fetchrow_array) { $name = trim($namerow); } From 27e11f3f7a10846d36aabdb9a5a02194fd7bba46 Mon Sep 17 00:00:00 2001 From: Simon Ye Date: Thu, 16 Apr 2015 19:42:42 -0400 Subject: [PATCH 06/18] Remove trailing whitespace. --- abyss_minimus.sh | 12 ++++++------ compare_multiple_sam.py | 4 ++-- compare_sam.py | 12 ++++++------ coveragePlot.py | 10 +++++----- coverage_generator_bp.sh | 2 +- create_SURPI_data.sh | 4 ++-- create_tab_delimited_table.pl | 8 ++++---- create_taxonomy_db.py | 10 +++++----- create_taxonomy_db.sh | 10 +++++----- crop_reads.csh | 8 ++++---- cutadapt_quality.csh | 6 +++--- download_SURPI_data.sh | 8 ++++---- extractAlltoFast.sh | 8 ++++---- extractHeaderFromFastq.csh | 2 +- extractSamFromSam.sh | 6 +++--- fastq-extractBarcodedSRA.sh | 4 ++-- get_genbankfasta.pl | 6 +++--- mapPerfectBLASTtoGenome.py | 2 +- plot_reads_to_gi.sh | 10 +++++----- preprocess.sh | 6 +++--- preprocess_ncores.sh | 6 +++--- ribo_snap_bac_euk.sh | 4 ++-- slave/setup_slave.sh | 2 +- snap_nt.sh | 12 ++++++------ split_fasta.pl | 2 +- start_slaves.sh | 8 ++++---- table_generator.sh | 12 ++++++------ taxonomy_lookup_embedded.pl | 10 +++++----- tweet.pl | 4 ++-- update_sam.py | 2 +- 30 files changed, 100 insertions(+), 100 deletions(-) diff --git a/abyss_minimus.sh b/abyss_minimus.sh index ed14d85..cc924f5 100755 --- a/abyss_minimus.sh +++ b/abyss_minimus.sh @@ -77,11 +77,11 @@ for f in `cat $inputfile.barcodes` ; do END_SPLIT=$(date +%s) 
diff_SPLIT=$(( $END_SPLIT - $START_SPLIT )) echo -e "$(date)\t$scriptname\tSplitting FASTA took $diff_SPLIT s." - ### run abyss (deBruijn assembler) on each 100,000 read demultiplexed fasta file, including the unsplit demultiplexed file + ### run abyss (deBruijn assembler) on each 100,000 read demultiplexed fasta file, including the unsplit demultiplexed file echo -e "$(date)\t$scriptname\tRunning abyss on each $split_FASTA_size chunk..." START_ABYSS=$(date +%s) - for d in bar$f.${inputfile}_* ; do + for d in bar$f.${inputfile}_* ; do abyss-pe k=$kmer name=$d.f se=$d np=$cores >& $d.abyss.log done @@ -115,10 +115,10 @@ for f in `cat $inputfile.barcodes` ; do ########### echo -e "$(date)\t$scriptname\tStarting cat barcode addition and cutoff of minimo output" START_MIN_PROCESS=$(date +%s) - # Minimo output gives more than one line per sequence, so here we linearize sequences (linearization protocol from here http://seqanswers.com/forums/showthread.php?t=27567 ) + # Minimo output gives more than one line per sequence, so here we linearize sequences (linearization protocol from here http://seqanswers.com/forums/showthread.php?t=27567 ) # then we add the relevant barcode to the end of the header contig. 
Contigs that survive untouched from abyss already have a barcode at the end of them, so that extra barcode is taken away - cat all-contigs.fa | awk '{if (substr($0,1,1)==">"){if (p){print "\n";} print $0} else printf("%s",$0);p++;}END{print "\n"}' | sed '/^$/d' | sed 's/ /_/g' | sed "s/$/"$f"/g;n" | sed "s/"$f""$f"/"$f"/g" | sed 's/#/#@/g' | sed 's/^>/>contig_/g' > all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss-minim.fa - # change generic name all-contigs.fa + cat all-contigs.fa | awk '{if (substr($0,1,1)==">"){if (p){print "\n";} print $0} else printf("%s",$0);p++;}END{print "\n"}' | sed '/^$/d' | sed 's/ /_/g' | sed "s/$/"$f"/g;n" | sed "s/"$f""$f"/"$f"/g" | sed 's/#/#@/g' | sed 's/^>/>contig_/g' > all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss-minim.fa + # change generic name all-contigs.fa mv all-contigs.fa all-contigs.fa.$f # only contigs larger than $cutoff_post_Minimo are retained cat all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss-minim.fa | awk 'NR%2==1 {x = $0} NR%2==0 { if (length($0) >= '$3') printf("%s\n%s\n",x,$0)}' > all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss.${cutoff_post_Minimo}-mini.fa @@ -148,7 +148,7 @@ fi rm -f all.$inputfile.contigs.abyssmini.cut$cutoff_post_Abyss.$cutoff_post_Minimo.e1.NR.RAPSearch.aln rm -f all.$inputfile.contigs.abyssmini.e1.NR.RAPSearch.m8.noheader rm -f all.$inputfile.contigs.abyssmini.e1.NR.RAPSearch.m8.noheader.seq -rm -f all.$inputfile.contigs.abyssmini.e1.NR.RAPSearch.m8.noheader +rm -f all.$inputfile.contigs.abyssmini.e1.NR.RAPSearch.m8.noheader rm -f all.$inputfile.contigs.abyssmini.e1.NR.RAPSearch.m8.noheader.ex.fa rm -f all.$inputfile.unitigs.cut$cutoff_post_Abyss-contigs.sortlen.seq rm -f all-contigs* diff --git a/compare_multiple_sam.py b/compare_multiple_sam.py index 1af36c8..334536b 100755 --- a/compare_multiple_sam.py +++ b/compare_multiple_sam.py @@ -9,7 +9,7 @@ # # Copyright (C) 2014 Charles Y Chiu - All Rights Reserved # Permission to copy and modify is granted under the BSD license -# 
Last revised 3/21/2014 +# Last revised 3/21/2014 import sys @@ -61,7 +61,7 @@ lineList = [] for file in fileObjList: lineList.append(file.readline()) - + for file in fileObjList: file.close() outputFile.close() diff --git a/compare_sam.py b/compare_sam.py index 2bd65af..7ced0a3 100755 --- a/compare_sam.py +++ b/compare_sam.py @@ -42,17 +42,17 @@ def logHeader(): while line1 != '': data1 = line1.split() - + # check to see whether line is in the SAM header or alignment section if (data1[0][0]!="@"): #We are in the SAM alignment section edit_distance=int(data1[12].split(":")[2]) - + #using old snap (<=v0.15.4), could use length of line to determine if it was a hit or miss. #new snap (>=v1.0) instead uses an edit distance of -1 to signify a miss. - - if (edit_distance >= 0): + + if (edit_distance >= 0): #this is a hit new_hits += 1 sum_edistance_new += int(data1[12].split(":")[2]) @@ -61,10 +61,10 @@ def logHeader(): # this has been previously flagged as a hit, and is also now a hit. Compare edit distances to find best hit. edit_distance_previous=firstentry[1] edit_distance_current = data1[12].split(":")[2] - + existing_hits += 1 sum_edistance_existing += int(edit_distance_previous) - + if (int(edit_distance_current) <= int(edit_distance_previous)): replacements += 1 sum_data1 += int(edit_distance_current) diff --git a/coveragePlot.py b/coveragePlot.py index c95981e..62a7930 100755 --- a/coveragePlot.py +++ b/coveragePlot.py @@ -2,7 +2,7 @@ # # coveragePlot.py # -# This program generates genomic coverage plots +# This program generates genomic coverage plots # Chiu Laboratory # University of California, San Francisco # January, 2014 @@ -47,7 +47,7 @@ def smart_truncate1(text, max_length=100, suffix='...'): # print "Installed version is: %s." 
% mpl_version -#load function is deprecated as of matplotlib v1.3.1, replaced with +#load function is deprecated as of matplotlib v1.3.1, replaced with if (LooseVersion(mpl_version) >= LooseVersion('1.3.1') ): data = np.loadtxt(dataFile) else: @@ -78,7 +78,7 @@ def smart_truncate1(text, max_length=100, suffix='...'): plot(data[:,0],data[:,1],color) xlabel("base position",fontsize=8) ylabel("fold coverage",fontsize=8) - title_text = sys.argv[2] + title_text = sys.argv[2] suptitle(title_text,fontsize=9) xMin, xMax, yMin, yMax = min(data[:,0]),max(data[:,0]),min(data[:,1]),max(data[:,1]) # add a 10% buffer to yMax @@ -105,7 +105,7 @@ def smart_truncate1(text, max_length=100, suffix='...'): plot(data[:,0],data[:,1],color) xlabel("base position",fontsize=8) ylabel("fold coverage",fontsize=8) - title_text = sys.argv[2] + title_text = sys.argv[2] suptitle(title_text,fontsize=9) xMin, xMax, yMin, yMax = min(data[:,0]),max(data[:,0]),min(data[:,1]),max(data[:,1]) yMax *= 1.1 @@ -122,7 +122,7 @@ def smart_truncate1(text, max_length=100, suffix='...'): plot(data[:,0],data[:,1],color) xlabel("base position",fontsize=8) ylabel("fold coverage",fontsize=8) - title_text = sys.argv[2] + title_text = sys.argv[2] suptitle(title_text,fontsize=9) xMin, xMax, yMin, yMax = min(data[:,0]),max(data[:,0]),min(data[:,1]),max(data[:,1]) yMax *= 1.1 diff --git a/coverage_generator_bp.sh b/coverage_generator_bp.sh index 9510a8d..6f13feb 100755 --- a/coverage_generator_bp.sh +++ b/coverage_generator_bp.sh @@ -93,7 +93,7 @@ do # create list of curated GIs $SNAP_file.$genus.gi.list.curatedgenome # First retain only gis that are complete genomes if no GIs with "complete genomes" are returned, we'll go to GIs "complete sequences", if no complete sequences, then we just go with the GIs we have - get_genbankfasta.pl -i bar.$bar.$SNAP_file.$genus.gi.list > bar.$bar.$SNAP_file.$genus.gi.headers + get_genbankfasta.pl -i bar.$bar.$SNAP_file.$genus.gi.list > bar.$bar.$SNAP_file.$genus.gi.headers egrep 
">gi.*[Cc]omplete [Gg]enome" bar.$bar.$SNAP_file.$genus.gi.headers | awk -F "|" '{print$2}' | head -n $top_gis > bar.$bar.$SNAP_file.$genus.gi if [ -s bar.$bar.$SNAP_file.$genus.gi ] diff --git a/create_SURPI_data.sh b/create_SURPI_data.sh index 9a4649d..e79a01b 100755 --- a/create_SURPI_data.sh +++ b/create_SURPI_data.sh @@ -15,7 +15,7 @@ # Copyright (C) 2014 Scot Federman - All Rights Reserved # SURPI has been released under a modified BSD license. # Please see license file for details. -# Last revised 7/7/2014 +# Last revised 7/7/2014 scriptname=${0##*/} green='\e[0;32m' @@ -42,7 +42,7 @@ SNAP_nt_dir="COMP_SNAP" RiboClean_dir="RiboClean_SNAP" # SNAP_nt_chunks= # of chunks to split nt into, SNAP index will be created for each chunk. -# Currently, the minimum number of chunks is around 17-20 +# Currently, the minimum number of chunks is around 17-20 # SNAP 0.15.4 will not successfully index nt with less than 17 chunks, though this will vary a bit with each NT release. # I picked 20 here as a safe default. # This will likely have to be increased when using SNAP 1.0, which has different indexing characteristics, diff --git a/create_tab_delimited_table.pl b/create_tab_delimited_table.pl index 77a1fc5..f0a0710 100755 --- a/create_tab_delimited_table.pl +++ b/create_tab_delimited_table.pl @@ -10,7 +10,7 @@ # Copyright (C) 2014 Scot Federman - All Rights Reserved # SURPI has been released under a modified BSD license. # Please see license file for details. -# Last revised 1/26/2014 +# Last revised 1/26/2014 use Getopt::Std; use strict; @@ -21,14 +21,14 @@ if ($opt_h) { print < gi_taxid_nucl.dmp pigz -dc -k "$db_directory/gi_taxid_prot.dmp.gz" > gi_taxid_prot.dmp # the below grep "fixes" the issue whereby aliases, mispellings, and other alternate names are returned. 
-# We could simply look for a name that is a "scientific name", +# We could simply look for a name that is a "scientific name", # but this shrinks the db a bit, speeding up lookups, and removes data we do not need at this time. echo -e "$(date)\t$scriptname\tRetaining scientific names..." grep "scientific name" names.dmp > names_scientificname.dmp diff --git a/crop_reads.csh b/crop_reads.csh index 4b5a56d..29fc65b 100755 --- a/crop_reads.csh +++ b/crop_reads.csh @@ -9,7 +9,7 @@ # # # will go from position 10 to 85 (or whatever size is available) -# +# # # $1 = file to crop # $2 = position to start crop ($start_nt) @@ -18,11 +18,11 @@ # Copyright (C) 2014 Charles Y Chiu - All Rights Reserved # SURPI has been released under a modified BSD license. # Please see license file for details. -# Last revised 1/26/2014 - +# Last revised 1/26/2014 + if ($#argv != 3) then echo "Usage: crop_reads.csh " exit(1) endif -cat $argv[1] | awk '(NR%2==1){print $0} (NR%2==0){print substr($0,'"$argv[2]"','"$argv[3]"')}' +cat $argv[1] | awk '(NR%2==1){print $0} (NR%2==0){print substr($0,'"$argv[2]"','"$argv[3]"')}' diff --git a/cutadapt_quality.csh b/cutadapt_quality.csh index 9198d9a..f29c2de 100755 --- a/cutadapt_quality.csh +++ b/cutadapt_quality.csh @@ -3,7 +3,7 @@ # cutadapt_quality.csh # # runs cutadapt to remove primer sequences from Illumina files -# also accepts a quality argument for Illumina / Sanger quality +# also accepts a quality argument for Illumina / Sanger quality # user specifies length cutoff # user specifies whether short reads less than length cutoff are kept; if so, they are converted to size=1 # @@ -19,7 +19,7 @@ # Copyright (C) 2014 Charles Chiu - All Rights Reserved # SURPI has been released under a modified BSD license. # Please see license file for details. 
- + if ($#argv != 7) then echo "Usage: cutadapt_quality.csh " exit(1) @@ -113,6 +113,6 @@ endif #set numreads_end = `egrep -c "@HWI|@M00|@SRR" $inputfile:r.cutadapt.fastq` #@ reads_removed = $numreads_start - $numreads_end -#echo $reads_removed" reads removed by cutadapt" +#echo $reads_removed" reads removed by cutadapt" #echo $numreads_end" reads at end of cutadapt" rm -f $TEMPDIR{$$} diff --git a/download_SURPI_data.sh b/download_SURPI_data.sh index c2bf6a3..f20f32e 100755 --- a/download_SURPI_data.sh +++ b/download_SURPI_data.sh @@ -12,7 +12,7 @@ # Copyright (C) 2014 Scot Federman - All Rights Reserved # SURPI has been released under a modified BSD license. # Please see license file for details. -# Last revised 7/2/2014 +# Last revised 7/2/2014 scriptname=${0##*/} bold=$(tput bold) @@ -38,10 +38,10 @@ done if [[ ${HELP-} -eq 1 ]] then cat < " + echo "Usage: $scriptname " exit 65 fi @@ -36,7 +36,7 @@ if [ $inputfile_type = BLASTN ] then awk '{print$1}' $inputfile > $inputfile.header echo -e "$(date)\t$scriptname\tuniqued blastn file, replaced beginning with @" - + if [ $parentfile_type = FASTA ] then seqtk subseq $parentfile $inputfile.header > $outputfile @@ -59,12 +59,12 @@ then echo -e "$(date)\t$scriptname\tDone preparing input Fasta file " if [ $parentfile_type = FASTA ] - then + then seqtk subseq $parentfile $inputfile.header > $outputfile elif [ $parentfile_type = FASTQ ] then if [ $output_format = FASTQ ] - then + then cat $parentfile | fqextract $inputfile.header > $outputfile elif [ $output_format = FASTA ] then diff --git a/extractHeaderFromFastq.csh b/extractHeaderFromFastq.csh index d441aa2..5945279 100755 --- a/extractHeaderFromFastq.csh +++ b/extractHeaderFromFastq.csh @@ -14,7 +14,7 @@ # Copyright (C) 2014 Charles Y Chiu - All Rights Reserved # SURPI has been released under a modified BSD license. # Please see license file for details. -# Last revised 1/26/2014 +# Last revised 1/26/2014 if ($#argv != 4) then echo "Usage: extractHeaderFromFastq.csh
" diff --git a/extractSamFromSam.sh b/extractSamFromSam.sh index 31cfe1a..79b91e6 100755 --- a/extractSamFromSam.sh +++ b/extractSamFromSam.sh @@ -1,7 +1,7 @@ #!/bin/bash -# +# # extactSamFromSam.sh -# +# # extract SAM reads corresponding to a SAM header file from another SAM reference file and writes to # SAM output file # Chiu Laboratory @@ -40,7 +40,7 @@ else let "numlines = `wc -l basef | awk '{print $1}'`" let "LinesPerCore = numlines / $cores" echo -e "$(date)\t$scriptname\twill use $cores cores with $LinesPerCore entries per core" - + split -l $LinesPerCore $basef echo -e "$(date)\t$scriptname\textracting reads from $baseg using headers from $basef" diff --git a/fastq-extractBarcodedSRA.sh b/fastq-extractBarcodedSRA.sh index f990a01..1b2fd58 100755 --- a/fastq-extractBarcodedSRA.sh +++ b/fastq-extractBarcodedSRA.sh @@ -18,7 +18,7 @@ if [ $# -lt 1 ]; then exit fi -# +# # gets absolute path to SRA file # SRA_PATH=$(readlink -e $1) @@ -34,7 +34,7 @@ files=($header*.fastq) # restore barcode and R1/R2 designation to FASTQ headers for f in "${files[@]}" -do +do barcode=$(echo "$f" | sed 's/_[12]//g' | sed 's/.fastq//g' | sed 's/.*_//g') readnum=$(echo "$f" | sed 's/.*_\([12]\).fastq/\1/g' | sed '/'$header'/d') echo -e "$(date)\t$scriptname\tRestoring barcode # $barcode and R1/R2 designation to $f" diff --git a/get_genbankfasta.pl b/get_genbankfasta.pl index c4c2f22..991a385 100755 --- a/get_genbankfasta.pl +++ b/get_genbankfasta.pl @@ -33,7 +33,7 @@ if ($opt_h) { print < " + echo "Usage: $scriptname <.fasta> " echo "not uniquing Blastn anymore 9/13/13 given a fasta file and the GI or FASTA of the reference to assemble to -> makes an assembly. blastn e value in 1e-8 format . 
jedi has 16 cores available" exit fi @@ -59,7 +59,7 @@ done cat $1*.$2.$3.$4.blastn > $1.$2.$3.$4.Blastn -uniq_blastn_nopipes.sh $1.$2.$3.$4.Blastn +uniq_blastn_nopipes.sh $1.$2.$3.$4.Blastn extractAlltoFast.sh $1.$2.$3.$4.Blastn.uniq BLASTN $1 FASTA $1.$2.$3.$4.Blastn.uniq.ex.fa FASTA ####figuring out length of sequence in gi####### @@ -79,7 +79,7 @@ echo -e -n "$(date)\t$scriptname\t%Coverage = " echo "scale=6;100*$coverage/$gilength" | bc let "sumofcolumntwo = `awk '{print$2}' $1.$2.$3.$4.map | awk '{sum += $1} END{print sum}'`" -echo -e -n "$(date)\t$scriptname\tAverage depth of coverage (x) = " +echo -e -n "$(date)\t$scriptname\tAverage depth of coverage (x) = " echo "scale=6;$sumofcolumntwo/$gilength" | bc let "numberBlastnReads = `egrep -c "^SCS|^HWI|gi|^M00|^kmer|^SRR" $1.$2.$3.$4.Blastn.uniq`" echo -e "$(date)\t$scriptname\tNumber of reads contributing to assembly $numberBlastnReads" @@ -94,7 +94,7 @@ echo "Reference sequence length = $gilength bp" >> $1.$2.$3.$4.report echo "Coverage in bp = $coverage" >> $1.$2.$3.$4.report echo -n "%Coverage = " >> $1.$2.$3.$4.report echo "scale=6;100*$coverage/$gilength" | bc >> $1.$2.$3.$4.report -echo -n "Average depth of coverage = " >> $1.$2.$3.$4.report +echo -n "Average depth of coverage = " >> $1.$2.$3.$4.report echo "scale=6;$sumofcolumntwo/$gilength" | bc >> $1.$2.$3.$4.report #echo "Average depth of coverage = $averagedepthcoverage x " >> $1.$2.$3.$4.report echo "Number of reads contributing to assembly = $numberBlastnReads" >> $1.$2.$3.$4.report @@ -106,7 +106,7 @@ ps2pdf14 $1.$2.$3.$4.ps $1.$2.$3.$4.pdf #### Cleanup ######## rm -f $1*$2.$3.$4.blastn rm -f $1*$2.$3.$4.error -rm -f $3.$2.fasta.nhr +rm -f $3.$2.fasta.nhr rm -f $3.$2.fasta.nin rm -f $3.$2.fasta.nsq #rm -f $1.$2.$3.$4.Blastn diff --git a/preprocess.sh b/preprocess.sh index 443ba3d..1791961 100755 --- a/preprocess.sh +++ b/preprocess.sh @@ -8,7 +8,7 @@ # January, 2014 # # cutadapt=>crop->dust removal (no uniq) *** -# +# # 12/20/12 - modified to 
switch to cutadapt for trimming # 12/31/12 - modified from Cshell to BASH version for timing # @@ -100,7 +100,7 @@ fi echo -e "$(date)\t$scriptname\t********** running crop, Read1 **********" START1=$(date +%s) -if [ $run_uniq == "Y" ] +if [ $run_uniq == "Y" ] then echo -e "$(date)\t$scriptname\tWe will be using $crop_length as the length of the cropped read" crop_reads.csh $basef.cutadapt.uniq.fastq $start_nt $crop_length > $basef.cutadapt.uniq.cropped.fastq @@ -117,7 +117,7 @@ echo -e "$(date)\t$scriptname\tDone crop: CROP took $diff seconds" echo -e "$(date)\t$scriptname\t********** running dust, Read1 **********" START1=$(date +%s) -if [ $run_uniq == "Y" ] +if [ $run_uniq == "Y" ] then prinseq-lite.pl -fastq $basef.cutadapt.uniq.cropped.fastq -out_format 3 -out_good $basef.cutadapt.uniq.cropped.dusted -out_bad $basef.cutadapt.uniq.cropped.dusted.bad -log -lc_method dust -lc_threshold 7 mv -f $basef.cutadapt.uniq.cropped.dusted.fastq $basef.preprocessed.fastq diff --git a/preprocess_ncores.sh b/preprocess_ncores.sh index 820e50a..89760a5 100755 --- a/preprocess_ncores.sh +++ b/preprocess_ncores.sh @@ -1,5 +1,5 @@ #!/bin/bash -# +# # preprocess_ncores.sh # # This script runs preprocessing across multiple cores (FASTA/FASTQ header modification, quality filtering, adapter trimming, and low-complexity filtering) @@ -108,13 +108,13 @@ do cat $basef.preprocessed.fastq >> $basef2.preprocessed.fastq rm -f $basef.preprocessed.fastq cat $basef.cutadapt.cropped.dusted.bad.fastq >> $basef2.cutadapt.cropped.dusted.bad.fastq - rm -f $basef.cutadapt.cropped.dusted.bad.fastq + rm -f $basef.cutadapt.cropped.dusted.bad.fastq rm -f $f rm -f $basef.modheader.fastq rm -f $basef.cutadapt.summary.log rm -f $basef.adapterinfo.log - rm -f $basef.cutadapt.cropped.fastq + rm -f $basef.cutadapt.cropped.fastq done echo -e "$(date)\t$scriptname\tDone concatenating output..." 
diff --git a/ribo_snap_bac_euk.sh b/ribo_snap_bac_euk.sh index 261e3de..97aea22 100755 --- a/ribo_snap_bac_euk.sh +++ b/ribo_snap_bac_euk.sh @@ -58,8 +58,8 @@ awk '{print$1}' $inputfile.noSmallS_LargeS.sam | sed '/^@/d' > $inputfile.noSmal # retrieve reads from original $inputfile extractSamFromSam.sh $inputfile.noSmallS_LargeS.header.sam $inputfile $basef.noRibo.annotated -echo -e "$(date)\t$scriptname\tCreated $inputfile.noRibo.annotated" -table_generator.sh $basef.noRibo.annotated SNAP N Y N N +echo -e "$(date)\t$scriptname\tCreated $inputfile.noRibo.annotated" +table_generator.sh $basef.noRibo.annotated SNAP N Y N N rm -f $inputfile.noLargeS.sam rm -f $inputfile.noLargeS.matched.sam diff --git a/slave/setup_slave.sh b/slave/setup_slave.sh index e9a52a8..58a5261 100755 --- a/slave/setup_slave.sh +++ b/slave/setup_slave.sh @@ -22,7 +22,7 @@ if [ ! -d $bin_folder ] then mkdir $bin_folder fi - + CWD=$(pwd) # set timezone diff --git a/snap_nt.sh b/snap_nt.sh index 50c31b3..f0fe92c 100755 --- a/snap_nt.sh +++ b/snap_nt.sh @@ -1,16 +1,16 @@ #!/bin/bash -# +# # snap_nt.sh # # This script runs SNAP against the NT database # Chiu Laboratory # University of California, San Francisco # -# +# # Copyright (C) 2014 Charles Chiu - All Rights Reserved # SURPI has been released under a modified BSD license. # Please see license file for details. - + expected_args=6 scriptname=${0##*/} @@ -72,12 +72,12 @@ for snap_index in $SNAP_NT_index_directory/* ; do cat $basef.snap.log >> $basef.snapNT.log cat $basef.time.log >> $basef.timeNT.log - + compare_sam.py $basef.tmp.sam $basef.prev.sam cat $basef.prev.sam | egrep -v "^@" | awk '{print "@"$1"\n"$10"\n""+"$1"\n"$11}' > $basef.tmp.fastq counter=1 - + END2=$(date +%s) echo -e "$(date)\t$scriptname\tDone with $snap_index." 
@@ -86,7 +86,7 @@ for snap_index in $SNAP_NT_index_directory/* ; do done # need to restore the hits -update_sam.py $basef.prev.sam $basef.NT.sam +update_sam.py $basef.prev.sam $basef.NT.sam rm -f $basef.tmp.sam rm -f $basef.tmp.fastq diff --git a/split_fasta.pl b/split_fasta.pl index 060cb92..e040512 100755 --- a/split_fasta.pl +++ b/split_fasta.pl @@ -80,7 +80,7 @@ #write record to file print (OUTFILE ">$sequenceTitle\n"); print (OUTFILE "$sequenceEntry\n"); - $seqCount++; + $seqCount++; $seqThisFile++; if ($seqThisFile == $numberToCopy) { diff --git a/start_slaves.sh b/start_slaves.sh index 622df47..e428d77 100755 --- a/start_slaves.sh +++ b/start_slaves.sh @@ -33,14 +33,14 @@ file_with_slave_ips=$7 placement_group=$8 ### -#This is the time (in seconds) this script waits after starting up the AWS machines. +#This is the time (in seconds) this script waits after starting up the AWS machines. #It allows the instances time to start up. In practice, 120s appears to be sufficient. WAIT_TIME=120 #folder where slave scripts are located slave_folder="/usr/local/bin/surpi/slave" #script located on slave to run -slave_script="slave_setup.sh" +slave_script="slave_setup.sh" #this parameter is currently tied to the $keypair used during slave_setup.sh. should be cleaned up prior to release pemkey="/home/ubuntu/.ssh/surpi.pem" @@ -106,7 +106,7 @@ do done # instance_id_list=( $result ) # instance_id_size=${#instance_id_list[@]} -# +# # if [ "$instance_id_size" != "$number_of_instances" ]; then # echo "ERROR: could not create $number_of_instances instances" # echo "$number_of_instances instances were created." @@ -150,7 +150,7 @@ do echo -e "$(date)\t$scriptname\t$attached" >> slave.$COUNTER.log 2>&1 #verify $attached here to verify that EBS was attached - + echo -e "$(date)\t$scriptname\tconnecting and running $slave_script script..." 
>> slave.$COUNTER.log 2>&1 ssh -o StrictHostKeyChecking=no -i $pemkey ubuntu@$host "/home/ubuntu/$slave_script" >> slave.$COUNTER.log 2>&1 & let COUNTER=COUNTER+1 diff --git a/table_generator.sh b/table_generator.sh index 240d3b1..4491367 100755 --- a/table_generator.sh +++ b/table_generator.sh @@ -7,7 +7,7 @@ # University of California, San Francisco # January, 2014 # -# input file: annotated sam or annotated -m 8 file with taxonomies provided in the following format at the end of the .sam and -m 8 file: +# input file: annotated sam or annotated -m 8 file with taxonomies provided in the following format at the end of the .sam and -m 8 file: # "gi# --family --genus --species" # Output files are tab delimited files ending in .counttable whereby rows represent taxonomic annotations at various levels (family, genus, species, gi) # Columns represent individual barcodes found in the dataset, and cells contain the number of reads @@ -20,7 +20,7 @@ scriptname=${0##*/} if [ $# -lt 6 ]; then - echo "Usage: $scriptname " + echo "Usage: $scriptname " exit fi @@ -33,7 +33,7 @@ genus=$5 family=$6 ### -###substitute forward slash with @_ because forward slash in species name makes it ungreppable. using @_ because @ is used inside the contig barcode (ie. #@13 is barcode 13, contig generated) +###substitute forward slash with @_ because forward slash in species name makes it ungreppable. using @_ because @ is used inside the contig barcode (ie. 
#@13 is barcode 13, contig generated) create_tab_delimited_table.pl -f $file_type $inputfile | sed 's/ /_/g' | sed 's/,/_/g' | sed 's/\//@_/g' > $inputfile.tempsorted echo -e "$(date)\t$scriptname\tdone creating $inputfile.tempsorted" @@ -61,7 +61,7 @@ then done echo -e "GI\tSpecies\tGenus\tFamily(@=contigbarcode)" > $inputfile.header cat $inputfile.header $inputfile.gi.uniq.columntable > $inputfile.gi.counttable_temp - paste $inputfile.gi.counttable_temp bar*.$inputfile.gi.output > $inputfile.gi.counttable + paste $inputfile.gi.counttable_temp bar*.$inputfile.gi.output > $inputfile.gi.counttable sed -i 's/@_/ /g' $inputfile.gi.counttable echo -e "$(date)\t$scriptname\tdone generating gi counttable" fi @@ -84,7 +84,7 @@ then done echo -e "Species\tGenus\tFamily(@=contigbarcode)" > $inputfile.header cat $inputfile.header $inputfile.species.uniq.columntable > $inputfile.species.counttable_temp - paste $inputfile.species.counttable_temp bar*.$inputfile.species.output > $inputfile.species.counttable + paste $inputfile.species.counttable_temp bar*.$inputfile.species.output > $inputfile.species.counttable sed -i 's/@_/ /g' $inputfile.species.counttable echo -e "$(date)\t$scriptname\tdone generating species counttable" fi @@ -106,7 +106,7 @@ then done echo -e "Genus\tFamily(@=contigbarcode)" > $inputfile.header cat $inputfile.header $inputfile.genus.uniq.columntable > $inputfile.genus.counttable_temp - paste $inputfile.genus.counttable_temp bar*.$inputfile.genus.output > $inputfile.genus.counttable + paste $inputfile.genus.counttable_temp bar*.$inputfile.genus.output > $inputfile.genus.counttable sed -i 's/@_/ /g' $inputfile.genus.counttable echo -e "$(date)\t$scriptname\tdone generating genus counttable" fi diff --git a/taxonomy_lookup_embedded.pl b/taxonomy_lookup_embedded.pl index 5709a89..7184152 100755 --- a/taxonomy_lookup_embedded.pl +++ b/taxonomy_lookup_embedded.pl @@ -9,7 +9,7 @@ # Copyright (C) 2014 Scot Federman - All Rights Reserved # SURPI has been released 
under a modified BSD license. # Please see license file for details. -# Last revised 1/26/2014 +# Last revised 1/26/2014 use DBI; use strict; @@ -36,12 +36,12 @@ if ($opt_h) { print < Date: Thu, 16 Apr 2015 19:47:05 -0400 Subject: [PATCH 07/18] Fix shebangs for python/perl scripts. --- compare_multiple_sam.py | 2 +- compare_sam.py | 2 +- coveragePlot.py | 2 +- create_tab_delimited_table.pl | 2 +- create_taxonomy_db.py | 2 +- get_genbankfasta.pl | 2 +- mapPerfectBLASTtoGenome.py | 2 +- split_fasta.pl | 2 +- taxonomy_lookup.pl | 2 +- taxonomy_lookup_embedded.pl | 3 ++- tweet.pl | 2 +- update_sam.py | 2 +- 12 files changed, 13 insertions(+), 12 deletions(-) diff --git a/compare_multiple_sam.py b/compare_multiple_sam.py index 334536b..ce151ed 100755 --- a/compare_multiple_sam.py +++ b/compare_multiple_sam.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # # compare_multiple_sam.py # diff --git a/compare_sam.py b/compare_sam.py index 7ced0a3..07e8703 100755 --- a/compare_sam.py +++ b/compare_sam.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # # compare_sam.py # diff --git a/coveragePlot.py b/coveragePlot.py index 62a7930..0c48713 100755 --- a/coveragePlot.py +++ b/coveragePlot.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # # coveragePlot.py # diff --git a/create_tab_delimited_table.pl b/create_tab_delimited_table.pl index f0a0710..a1be09e 100755 --- a/create_tab_delimited_table.pl +++ b/create_tab_delimited_table.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # # create_tab_delimited_table.pl # diff --git a/create_taxonomy_db.py b/create_taxonomy_db.py index bc8fac5..09fe402 100755 --- a/create_taxonomy_db.py +++ b/create_taxonomy_db.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # # create_taxonomy_db.py # diff --git a/get_genbankfasta.pl b/get_genbankfasta.pl index 991a385..995872e 100755 --- a/get_genbankfasta.pl +++ b/get_genbankfasta.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # # 
get_genbankfasta.pl # diff --git a/mapPerfectBLASTtoGenome.py b/mapPerfectBLASTtoGenome.py index 64ebf17..cf3da59 100755 --- a/mapPerfectBLASTtoGenome.py +++ b/mapPerfectBLASTtoGenome.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # # Simple script to iterate over the -m 8 results from a # BLAST and spits out the number of hits at each base of the diff --git a/split_fasta.pl b/split_fasta.pl index e040512..f67f660 100755 --- a/split_fasta.pl +++ b/split_fasta.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl #split_fasta.pl version 1.0 #This script accepts a file consisting of multiple FASTA formatted sequence records. #It splits the file into multiple new files, each consisting of a subset of the original records. diff --git a/taxonomy_lookup.pl b/taxonomy_lookup.pl index d8e5ba2..79c0f6e 100755 --- a/taxonomy_lookup.pl +++ b/taxonomy_lookup.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl -w # # taxonomy_lookup.pl # diff --git a/taxonomy_lookup_embedded.pl b/taxonomy_lookup_embedded.pl index 7184152..d8e7a51 100755 --- a/taxonomy_lookup_embedded.pl +++ b/taxonomy_lookup_embedded.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl # # taxonomy_lookup_embedded.pl # @@ -12,6 +12,7 @@ # Last revised 1/26/2014 use DBI; +use warnings; use strict; use Getopt::Std; use Time::HiRes qw[gettimeofday tv_interval]; diff --git a/tweet.pl b/tweet.pl index 27127ad..255896b 100755 --- a/tweet.pl +++ b/tweet.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # # tweet.pl # diff --git a/update_sam.py b/update_sam.py index 395886d..29cc545 100755 --- a/update_sam.py +++ b/update_sam.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # # update_sam.py # this script reannotates the SAM file based on the better hit identified in "compare_sam.py" From 8256bb3079ad6856d4f4fef5d507482d9356e509 Mon Sep 17 00:00:00 2001 From: Simon Ye Date: Sat, 18 Apr 2015 11:07:16 -0400 Subject: [PATCH 08/18] Append to $PATH in config file. 
--- SURPI.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SURPI.sh b/SURPI.sh index 3c4264a..84804fe 100755 --- a/SURPI.sh +++ b/SURPI.sh @@ -91,7 +91,7 @@ fi if [[ $create_config_file ]] then - echo "PATH=/usr/local/bin/surpi:/usr/local/bin:/usr/bin/:/bin" > go_$configprefix + echo "PATH=/usr/local/bin/surpi:/usr/local/bin:/usr/bin/:/bin:$PATH" > go_$configprefix echo "nohup $scriptname -f $configprefix.config > SURPI.$configprefix.log 2> SURPI.$configprefix.err" >> go_$configprefix chmod +x go_$configprefix From c2dcc66a2b240df181c72a5941cede63b1b8aed6 Mon Sep 17 00:00:00 2001 From: Simon Ye Date: Sat, 18 Apr 2015 12:02:26 -0400 Subject: [PATCH 09/18] Remove ABYSS-P from hard dependencies. --- SURPI.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SURPI.sh b/SURPI.sh index 84804fe..e0bc6ed 100755 --- a/SURPI.sh +++ b/SURPI.sh @@ -487,7 +487,7 @@ nopathf=${FASTQ_file##*/} # remove the path to file basef=${nopathf%.fastq} #verify that all software dependencies are properly installed -declare -a dependency_list=("gt" "seqtk" "fastq" "fqextract" "cutadapt" "prinseq-lite.pl" "dropcache" "$snap" "rapsearch" "fastQValidator" "abyss-pe" "ABYSS-P" "Minimo") +declare -a dependency_list=("gt" "seqtk" "fastq" "fqextract" "cutadapt" "prinseq-lite.pl" "dropcache" "$snap" "rapsearch" "fastQValidator" "abyss-pe" "Minimo") echo "-----------------------------------------------------------------------------------------" echo "DEPENDENCY VERIFICATION" echo "-----------------------------------------------------------------------------------------" From dd5d6e0321e175eb94a45f6b633791cb857c93d8 Mon Sep 17 00:00:00 2001 From: Simon Ye Date: Sat, 18 Apr 2015 12:07:45 -0400 Subject: [PATCH 10/18] Add back default value for -d in SURPI.sh. 
--- SURPI.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SURPI.sh b/SURPI.sh index e0bc6ed..1193cbe 100755 --- a/SURPI.sh +++ b/SURPI.sh @@ -26,7 +26,7 @@ scriptname=${0##*/} while getopts "$optspec" option; do case "${option}" in f) config_file=${OPTARG};; # get parameters from config file if specified - d) ref_path=${OPTARG};; # Path to reference db directory. + d) ref_path=${OPTARG:-/reference};; # Path to reference db directory. h) HELP=1;; v) VERIFICATION=1;; z) create_config_file=${OPTARG} From 52b56755ac7c9bf73e2aecbb105fb6bcd29485d6 Mon Sep 17 00:00:00 2001 From: Simon Ye Date: Wed, 20 May 2015 16:49:02 -0400 Subject: [PATCH 11/18] Use 0x4 FLAGS bit for sam merging, not edit distance. --- compare_sam.py | 125 +++++++++++++++++++++++++------------------------ 1 file changed, 64 insertions(+), 61 deletions(-) diff --git a/compare_sam.py b/compare_sam.py index 07e8703..a8feb98 100755 --- a/compare_sam.py +++ b/compare_sam.py @@ -14,13 +14,13 @@ import os def logHeader(): - import os.path, sys, time - return "%s\t%s\t" % (time.strftime("%a %b %d %H:%M:%S %Z %Y"), os.path.basename(sys.argv[0])) + import os.path, sys, time + return "%s\t%s\t" % (time.strftime("%a %b %d %H:%M:%S %Z %Y"), os.path.basename(sys.argv[0])) usage = "compare_sam.py " if len(sys.argv) != 3: - print usage - sys.exit(0) + print usage + sys.exit(0) SAMfile1 = sys.argv[1] outputFile1 = sys.argv[2] @@ -41,63 +41,66 @@ def logHeader(): linenum += 1 while line1 != '': - data1 = line1.split() - - # check to see whether line is in the SAM header or alignment section - if (data1[0][0]!="@"): - #We are in the SAM alignment section - - edit_distance=int(data1[12].split(":")[2]) - - #using old snap (<=v0.15.4), could use length of line to determine if it was a hit or miss. - #new snap (>=v1.0) instead uses an edit distance of -1 to signify a miss. 
- - if (edit_distance >= 0): - #this is a hit - new_hits += 1 - sum_edistance_new += int(data1[12].split(":")[2]) - firstentry = data1[0].split("|") - if (len(firstentry)>=2): - # this has been previously flagged as a hit, and is also now a hit. Compare edit distances to find best hit. - edit_distance_previous=firstentry[1] - edit_distance_current = data1[12].split(":")[2] - - existing_hits += 1 - sum_edistance_existing += int(edit_distance_previous) - - if (int(edit_distance_current) <= int(edit_distance_previous)): - replacements += 1 - sum_data1 += int(edit_distance_current) - # need to pick the new gi, not the old gi, bug fix 1/27/13 - gi=data1[2].split("|")[1] - replacement_line = line1.replace("|" + firstentry[1] + "|" + firstentry[2],"|" + str(edit_distance_current) + "|" + str(gi),1) - outputFile.write(replacement_line) - else: - sum_data1 += int(edit_distance_previous) - outputFile.write(line1) - else: - edit_distance_current = data1[12].split(":")[2] - existing_hits += 1 - sum_data1 += int(edit_distance_current) - gi=data1[2].split("|")[1] - replacement_line = line1.replace(data1[0],data1[0] + "|" + str(edit_distance_current) + "|" + str(gi),1) - outputFile.write(replacement_line) - replacements+=1 - else: - #this is a miss - outputFile.write(line1) - firstentry = data1[0].split("|") - if (len(firstentry)>=2): - edit_distance_previous=firstentry[1] - existing_hits += 1 - sum_data1 += int(edit_distance_previous) - sum_edistance_existing += int(edit_distance_previous) - else: - #We are in the SAM header section - outputFile.write(line1) - - line1 = file1.readline() - linenum += 1 + data1 = line1.split() + + # check to see whether line is in the SAM header or alignment section + if (data1[0][0]!="@"): + #We are in the SAM alignment section + + edit_distance=int(data1[12].split(":")[2]) + mapped = not bool(int(data1[1]) & 0x4) + + #using old snap (<=v0.15.4), could use length of line to determine if it was a hit or miss. 
+ #new snap (>=v1.0) instead uses an edit distance of -1 to signify a miss. + + # Not sure what above is saying. + # Use 0x4 mask in FLAG field to check for unmapped reads. + if (mapped): + #this is a hit + new_hits += 1 + sum_edistance_new += edit_distance + firstentry = data1[0].split("|") + if (len(firstentry)>=2): + # this has been previously flagged as a hit, and is also now a hit. Compare edit distances to find best hit. + edit_distance_previous=firstentry[1] + edit_distance_current = data1[12].split(":")[2] + + existing_hits += 1 + sum_edistance_existing += int(edit_distance_previous) + + if (int(edit_distance_current) <= int(edit_distance_previous)): + replacements += 1 + sum_data1 += int(edit_distance_current) + # need to pick the new gi, not the old gi, bug fix 1/27/13 + gi=data1[2].split("|")[1] + replacement_line = line1.replace("|" + firstentry[1] + "|" + firstentry[2],"|" + str(edit_distance_current) + "|" + str(gi),1) + outputFile.write(replacement_line) + else: + sum_data1 += int(edit_distance_previous) + outputFile.write(line1) + else: + edit_distance_current = data1[12].split(":")[2] + existing_hits += 1 + sum_data1 += int(edit_distance_current) + gi=data1[2].split("|")[1] + replacement_line = line1.replace(data1[0],data1[0] + "|" + str(edit_distance_current) + "|" + str(gi),1) + outputFile.write(replacement_line) + replacements+=1 + else: + #this is a miss + outputFile.write(line1) + firstentry = data1[0].split("|") + if (len(firstentry)>=2): + edit_distance_previous=firstentry[1] + existing_hits += 1 + sum_data1 += int(edit_distance_previous) + sum_edistance_existing += int(edit_distance_previous) + else: + #We are in the SAM header section + outputFile.write(line1) + + line1 = file1.readline() + linenum += 1 file1.close() outputFile.close() From 3b60a93b6d7d7b6f6331dbe92440f740ba1e6511 Mon Sep 17 00:00:00 2001 From: Simon Ye Date: Thu, 21 May 2015 11:26:47 -0400 Subject: [PATCH 12/18] Add debug/logging scripts. 
--- debug.sh | 41 +++++++++++++++++++++++++++++++++++++++++ logging.sh | 29 +++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100755 debug.sh create mode 100755 logging.sh diff --git a/debug.sh b/debug.sh new file mode 100755 index 0000000..8a07c32 --- /dev/null +++ b/debug.sh @@ -0,0 +1,41 @@ +#!/bin/bash +set -e +trap _exit_trap EXIT +trap _err_trap ERR +_showed_traceback=f + +function _exit_trap +{ + local _ec="$?" + if [[ $_ec != 0 && "${_showed_traceback}" != t ]]; then + traceback 1 + fi +} + +function _err_trap +{ + local _ec="$?" + local _cmd="${BASH_COMMAND:-unknown}" + traceback 1 + _showed_traceback=t + echo "The command ${_cmd} exited with exit code ${_ec}." 1>&2 +} + +function traceback +{ + # Hide the traceback() call. + local -i start=$(( ${1:-0} + 1 )) + local -i end=${#BASH_SOURCE[@]} + local -i i=0 + local -i j=0 + + echo "Traceback (last called is first):" 1>&2 + for ((i=${start}; i < ${end}; i++)); do + j=$(( $i - 1 )) + local function="${FUNCNAME[$i]}" + local file="${BASH_SOURCE[$i]}" + local line="${BASH_LINENO[$j]}" + echo " ${function}() in ${file}:${line}" 1>&2 + done +} + diff --git a/logging.sh b/logging.sh new file mode 100755 index 0000000..8168821 --- /dev/null +++ b/logging.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +########################################################################## +# library +########################################################################## + +# via http://stackoverflow.com/a/12199798/18706 +format_date() { + ((h=${1}/3600)) + ((m=(${1}%3600)/60)) + ((s=${1}%60)) + printf "%02d:%02d:%02d\n" $h $m $s +} + +start=$(date +"%s") +age() { + now=$(date +"%s") + the_age=$(($now-$start)) + format_date $the_age +} + +announce() { + echo `age`" ${FUNCNAME[ 1 ]} $1" +} + +log() { + echo -e "$(age)\t$scriptname\t $1" +} + From 1320ce26fec835e942e9f56c9e4b251fdbb7bbcd Mon Sep 17 00:00:00 2001 From: Simon Ye Date: Thu, 21 May 2015 21:41:51 -0400 Subject: [PATCH 13/18] Normalize subtraction snap 
directory name. --- create_SURPI_data.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/create_SURPI_data.sh b/create_SURPI_data.sh index e79a01b..93a1cd9 100755 --- a/create_SURPI_data.sh +++ b/create_SURPI_data.sh @@ -35,6 +35,7 @@ cleanup_dir="rawdata" #You should likely stick with the default values here. If changing them, then the corresponding #values will also need to be changed within the SURPI config file. reference_dir="reference" +subtraction_dir="Subtraction_SNAP" taxonomy_dir="taxonomy" RAPSearch_dir="RAPSearch" FAST_dir="FAST_SNAP" @@ -133,8 +134,12 @@ if [ ! -d "$reference_dir/$FAST_dir" ]; then mkdir "$reference_dir/$FAST_dir" fi +if [ ! -d "$reference_dir/$subtraction_dir" ]; then + mkdir "$reference_dir/$subtraction_dir" +fi + echo -e "$(date)\t$scriptname\tMoving curated data into place..." -mv snap_index_hg19_rRNA_mito_Hsapiens_rna "$reference_dir" +mv snap_index_hg19_rRNA_mito_Hsapiens_rna "$reference_dir/$subtraction_dir" mv snap_index_Bacterial_Refseq_05172012.CLEAN.LenFiltered.uniq_s16 "$reference_dir/$FAST_dir" mv rapsearch_viral_aa_130628_db_v2.12 "$reference_dir/$RAPSearch_dir" mv rapsearch_viral_aa_130628_db_v2.12.info "$reference_dir/$RAPSearch_dir" From a521bfc785581b2eaae2c71a249d31ebc9de12c0 Mon Sep 17 00:00:00 2001 From: Simon Ye Date: Thu, 21 May 2015 21:42:15 -0400 Subject: [PATCH 14/18] Taxonomy lookup use warnings not shebang. --- taxonomy_lookup.pl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/taxonomy_lookup.pl b/taxonomy_lookup.pl index 79c0f6e..35d7b85 100755 --- a/taxonomy_lookup.pl +++ b/taxonomy_lookup.pl @@ -1,4 +1,4 @@ -#!/usr/bin/env perl -w +#!/usr/bin/env perl # # taxonomy_lookup.pl # @@ -12,6 +12,7 @@ # Please see license file for details. 
use strict; +use warnings; #use diagnostics; use Time::HiRes qw[gettimeofday tv_interval]; use DBI; From 225d4c1bbf954e52b4f4b3e3dfbfdfb43f6a2069 Mon Sep 17 00:00:00 2001 From: Simon Ye Date: Thu, 21 May 2015 21:42:59 -0400 Subject: [PATCH 15/18] Specify fastq format in ribo snap. --- ribo_snap_bac_euk.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ribo_snap_bac_euk.sh b/ribo_snap_bac_euk.sh index 97aea22..5a8a080 100755 --- a/ribo_snap_bac_euk.sh +++ b/ribo_snap_bac_euk.sh @@ -44,12 +44,12 @@ fasta_to_fastq $inputfile.fasta > $inputfile.fakq # if Bacteria.annotated file h crop_reads.csh $inputfile.fakq 10 75 > $inputfile.fakq.crop # snap against large ribosomal subunit -snap single $SNAP_index_Large $inputfile.fakq.crop -o $inputfile.noLargeS.unmatched.sam -t $cores -x -f -h 250 -d 18 -n 200 -F u +snap single $SNAP_index_Large -fastq $inputfile.fakq.crop -o $inputfile.noLargeS.unmatched.sam -t $cores -x -f -h 250 -d 18 -n 200 -F u egrep -v "^@" $inputfile.noLargeS.unmatched.sam | awk '{if($3 == "*") print "@"$1"\n"$10"\n""+"$1"\n"$11}' > $(echo "$inputfile".noLargeS.unmatched.sam | sed 's/\(.*\)\..*/\1/').fastq echo -e "$(date)\t$scriptname\tDone: first snap alignment" # snap against small ribosomal subunit -snap single $SNAP_index_Small $inputfile.noLargeS.unmatched.fastq -o $inputfile.noSmallS_LargeS.sam -t $cores -h 250 -d 18 -n 200 -F u +snap single $SNAP_index_Small -fastq $inputfile.noLargeS.unmatched.fastq -o $inputfile.noSmallS_LargeS.sam -t $cores -h 250 -d 18 -n 200 -F u echo -e "$(date)\t$scriptname\tDone: second snap alignment" # convert snap unmatched to ribo output to header format From 911a905fcd601fb4a194761eed3c60ee3465f5ab Mon Sep 17 00:00:00 2001 From: Simon Ye Date: Thu, 21 May 2015 21:43:45 -0400 Subject: [PATCH 16/18] No dropcache in snap_nt_combine. 
--- snap_nt_combine.sh | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/snap_nt_combine.sh b/snap_nt_combine.sh index 08b71d7..cb700aa 100755 --- a/snap_nt_combine.sh +++ b/snap_nt_combine.sh @@ -19,7 +19,7 @@ # Copyright (C) 2014 Charles Chiu - All Rights Reserved # Permission to copy and modify is granted under the BSD license -expected_args=6 +expected_args=5 scriptname=${0##*/} if [ $# -lt $expected_args ] @@ -32,9 +32,8 @@ fi inputfile=$1 SNAP_NT_index_directory=$2 cores=$3 -free_cache_cutoff=$4 -SNAP_d_cutoff=$5 -simultaneous_SNAPs=$6 +SNAP_d_cutoff=$4 +simultaneous_SNAPs=$5 ### echo -e "$(date)\t$scriptname\tStarting SNAP to NT" @@ -55,14 +54,6 @@ for snap_index in $SNAP_NT_index_directory/*; do nopathsnap_index=${snap_index##*/} # remove the path to file echo -e "$(date)\t$scriptname\tStarting SNAP on $nopathsnap_index" - freemem=$(free -g | awk '{print $4}' | head -n 2 | tail -1) - echo -e "$(date)\t$scriptname\tThere is $freemem GB available free memory...[cutoff=$free_cache_cutoff GB]" - if [ $freemem -lt $free_cache_cutoff ] - then - echo -e "$(date)\t$scriptname\tClearing cache..." - dropcache - fi - START_SNAP=$(date +%s) /usr/bin/time -o $basef.$nopathsnap_index.snap.log snap single $snap_index $basef.fastq -o $basef.$nopathsnap_index.sam -t $cores -x -f -h 250 -d $SNAP_d_cutoff -n 25 > $basef.$nopathsnap_index.time.log SNAP_DONE=$(date +%s) From b998dc5f93337628751415b9d4ea7da23fedf952 Mon Sep 17 00:00:00 2001 From: Simon Ye Date: Thu, 21 May 2015 21:46:21 -0400 Subject: [PATCH 17/18] Cleanup and no dropcache. 
--- abyss_minimus.sh | 53 +++++++++++++----------- extractHeaderFromFastq_ncores.sh | 2 +- preprocess.sh | 28 +++++++------ preprocess_ncores.sh | 61 ++++++++++++--------------- snap_nt.sh | 71 ++++++++++++++++---------------- 5 files changed, 108 insertions(+), 107 deletions(-) diff --git a/abyss_minimus.sh b/abyss_minimus.sh index cc924f5..99d62d7 100755 --- a/abyss_minimus.sh +++ b/abyss_minimus.sh @@ -15,8 +15,9 @@ # Copyright (C) 2014 Samia N Naccache - All Rights Reserved # SURPI has been released under a modified BSD license. # Please see license file for details. - scriptname=${0##*/} +source debug.sh +source logging.sh if [ $# -lt 5 ] then @@ -34,7 +35,7 @@ ignore_barcodes=$6 split_FASTA_size=100000 #### create list of barcodes in the fasta file in the following format: #barcode -echo -e "$(date)\t$scriptname\tDemultiplexing barcodes..." +log "Demultiplexing barcodes..." START_DEMULTIPLEX=$(date +%s) #The sed '/^#$/d' removes a blank #. This is needed in cases where barcodes are present along with reads that have no barcodes @@ -54,10 +55,10 @@ else #$inputfile.barcodes doesn't exist, or size=0, then we only have 1 barcode, echo "#" > $inputfile.barcodes fi -echo -e "$(date)\t$scriptname\tCompleted demultiplexing barcodes." +log "Completed demultiplexing barcodes." END_DEMULTIPLEX=$(date +%s) diff_DEMULTIPLEX=$(( END_DEMULTIPLEX - START_DEMULTIPLEX )) -echo -e "$(date)\t$scriptname\tBarcode demultiplex took $diff_DEMULTIPLEX s." +log "Barcode demultiplex took $diff_DEMULTIPLEX s." ### generate fasta file for every separate barcode (demultiplex) for f in `cat $inputfile.barcodes` ; do @@ -68,30 +69,33 @@ for f in `cat $inputfile.barcodes` ; do grep -E "$f(/|$)" $inputfile -A 1 --no-group-separator > bar$f.$inputfile fi # split demultiplexed fasta file into 100,000 read sub-fastas - echo -e "$(date)\t$scriptname\tSplitting FASTA into chunks of size: $split_FASTA_size." + log "Splitting FASTA into chunks of size: $split_FASTA_size." 
START_SPLIT=$(date +%s) cp bar$f.$inputfile bar$f.${inputfile}_n # So that the unsplit demultiplexed file is also denovo assembled # split_fasta.pl -i bar$f.$inputfile -o bar$f.$inputfile -n $split_FASTA_size - echo -e "$(date)\t$scriptname\tCompleted splitting FASTA file." + log "Completed splitting FASTA file." END_SPLIT=$(date +%s) diff_SPLIT=$(( $END_SPLIT - $START_SPLIT )) - echo -e "$(date)\t$scriptname\tSplitting FASTA took $diff_SPLIT s." + log "Splitting FASTA took $diff_SPLIT s." ### run abyss (deBruijn assembler) on each 100,000 read demultiplexed fasta file, including the unsplit demultiplexed file - echo -e "$(date)\t$scriptname\tRunning abyss on each $split_FASTA_size chunk..." + log "Running abyss on each $split_FASTA_size chunk..." START_ABYSS=$(date +%s) for d in bar$f.${inputfile}_* ; do - abyss-pe k=$kmer name=$d.f se=$d np=$cores >& $d.abyss.log + echo $d + log "Command: abyss-pe k=$kmer name=$d.f se=$d np=$cores >& $d.abyss.log" + #abyss-pe k=$kmer name=$d.f se=$d np=$cores >& $d.abyss.log + abyss-pe k=$kmer name=$d.f se=$d >& $d.abyss.log done - echo -e "$(date)\t$scriptname\tCompleted running abyss on each $split_FASTA_size chunk." + log "Completed running abyss on each $split_FASTA_size chunk." END_ABYSS=$(date +%s) diff_ABYSS=$(( END_ABYSS - START_ABYSS )) - echo -e "$(date)\t$scriptname\tAbyss took $diff_ABYSS s." + log "Abyss took $diff_ABYSS s." 
### ### contigs from split files concatenated, after which reads smaller than the cutoff value are eliminated - echo -e "$(date)\t$scriptname\tStarting concatenating and cutoff of abyss output" + log "Starting concatenating and cutoff of abyss output" START_CATCONTIGS=$(date +%s) #concatenating contig files from different fasta splits, and adding kmer infromation and barcode information to contig headers @@ -99,21 +103,22 @@ for f in `cat $inputfile.barcodes` ; do # only contigs larger than $cutoff_post_Abyss are retained cat all.bar$f.$inputfile.unitigs.fa | awk 'NR%2==1 {x = $0} NR%2==0 { if (length($0) >= '$2') printf("%s\n%s\n",x,$0)}' > all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss.fa - echo -e "$(date)\t$scriptname\tDone concatenating and cutoff of abyss output" + log "Done concatenating and cutoff of abyss output" END_CATCONTIGS=$(date +%s) diff_CATCONTIGS=$(( END_CATCONTIGS - START_CATCONTIGS )) - echo -e "$(date)\t$scriptname\tConcatenating and cutoff of abyss output took $diff_CATCONTIGS s." + log "Concatenating and cutoff of abyss output took $diff_CATCONTIGS s." ### ### run Minimo (OLC assembler) - echo -e "$(date)\t$scriptname\tStarting Minimo..." + log "Starting Minimo..." START_MINIMO=$(date +%s) + log "Command: Minimo all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss.fa -D FASTA_EXP=1" Minimo all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss.fa -D FASTA_EXP=1 - echo -e "$(date)\t$scriptname\tCompleted Minimo." + log "Completed Minimo." END_MINIMO=$(date +%s) diff_MINIMO=$(( END_MINIMO - START_MINIMO )) - echo -e "$(date)\t$scriptname\tMinimo took $diff_MINIMO s." + log "Minimo took $diff_MINIMO s." 
########### - echo -e "$(date)\t$scriptname\tStarting cat barcode addition and cutoff of minimo output" + log "Starting cat barcode addition and cutoff of minimo output" START_MIN_PROCESS=$(date +%s) # Minimo output gives more than one line per sequence, so here we linearize sequences (linearization protocol from here http://seqanswers.com/forums/showthread.php?t=27567 ) # then we add the relevant barcode to the end of the header contig. Contigs that survive untouched from abyss already have a barcode at the end of them, so that extra barcode is taken away @@ -123,23 +128,23 @@ for f in `cat $inputfile.barcodes` ; do # only contigs larger than $cutoff_post_Minimo are retained cat all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss-minim.fa | awk 'NR%2==1 {x = $0} NR%2==0 { if (length($0) >= '$3') printf("%s\n%s\n",x,$0)}' > all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss.${cutoff_post_Minimo}-mini.fa - echo -e "$(date)\t$scriptname\tDone cat barcode addition and cutoff of minimo output" + log "Done cat barcode addition and cutoff of minimo output" END_MIN_PROCESS=$(date +%s) diff_MIN_PROCESS=$(( END_MIN_PROCESS - START_MIN_PROCESS )) - echo -e "$(date)\t$scriptname\tcat barcode addition and cutoff of minimo output took $diff_MIN_PROCESS s" + log "cat barcode addition and cutoff of minimo output took $diff_MIN_PROCESS s" done ### ### concatenate deBruijn -> OLC contigs from all barcodes together -echo -e "$(date)\t$scriptname\tConcatenating all barcodes together..." +log "Concatenating all barcodes together..." START_CAT=$(date +%s) cat all.bar*.$inputfile.unitigs.cut$cutoff_post_Abyss.${cutoff_post_Minimo}-mini.fa > all.$inputfile.unitigs.cut$cutoff_post_Abyss.${cutoff_post_Minimo}-mini.fa -echo -e "$(date)\t$scriptname\tCompleted concatenating all barcodes." +log "Completed concatenating all barcodes." END_CAT=$(date +%s) diff_CAT=$(( END_CAT - START_CAT )) -echo -e "$(date)\t$scriptname\tBarcode concatenation took $diff_CAT s." 
+log "Barcode concatenation took $diff_CAT s." # cleaning up files by organizing directories, moving files into directories, and removing temporary files -mkdir $inputfile.dir +mkdir --parents $inputfile.dir mv bar*$inputfile*.fa $inputfile.dir if [ -e all.$inputfile.contigs.abyssmini.cut$cutoff_post_Abyss.$cutoff_post_Minimo.e1.NR.RAPSearch.m8 ] then diff --git a/extractHeaderFromFastq_ncores.sh b/extractHeaderFromFastq_ncores.sh index c0e63eb..f95e4ee 100755 --- a/extractHeaderFromFastq_ncores.sh +++ b/extractHeaderFromFastq_ncores.sh @@ -50,7 +50,7 @@ split -l $LinesPerCore -a 3 $parentfile $parentfile.SplitXS & awk '{print$1}' $queryfile > $queryfile.header & echo -e "$(date)\t$scriptname\textracting header from $queryfile" -for job in `jobs -p` +for job in $(jobs -p) do wait $job done diff --git a/preprocess.sh b/preprocess.sh index 1791961..032ba42 100755 --- a/preprocess.sh +++ b/preprocess.sh @@ -17,6 +17,8 @@ # Please see license file for details. scriptname=${0##*/} +source debug.sh +source logging.sh if [ $# != 10 ]; then echo "Usage: $scriptname " @@ -44,15 +46,15 @@ fi if [ $quality = "S" ] then - echo -e "$(date)\t$scriptname\tselected Sanger quality" + log "selected Sanger quality" else - echo -e "$(date)\t$scriptname\tselected Illumina quality" + log "selected Illumina quality" fi # fix header if space is present s=`head -1 $inputfile | awk '{if ($0 ~ / /) {print "SPACE"} else {print "NOSPACE"}}'` -echo -e "$(date)\t$scriptname\t$s in header" +log "$s in header" nopathf=${1##*/} basef=${nopathf%.fastq} @@ -60,7 +62,7 @@ basef=${nopathf%.fastq} #################### START OF PREPROCESSING, READ1 ######################### # run cutadapt, Read1 -echo -e "$(date)\t$scriptname\t********** running cutadapt, Read1 **********" +log "********** running cutadapt, Read1 **********" if [ $s == "SPACE" ] then sed "s/\([@HWI|@M00135|@SRR][^ ]*\) \(.\):.:0:\(.*\)/\1#\3\/\2/g" $inputfile > $basef.modheader.fastq @@ -76,12 +78,12 @@ fi END1=$(date +%s) diff=$(( $END1 
- $START1 )) -echo -e "$(date)\t$scriptname\tDone cutadapt: CUTADAPT took $diff seconds" +log "Done cutadapt: CUTADAPT took $diff seconds" # run uniq, Read1 if [ $run_uniq == "Y" ] then - echo -e "$(date)\t$scriptname\t********** running uniq, Read1 **********" + log "********** running uniq, Read1 **********" START1=$(date +%s) if [ $quality = "S" ] @@ -93,28 +95,28 @@ then END1=$(date +%s) diff=$(( $END1 - $START1 )) - echo -e "$(date)\t$scriptname\tDone uniq: UNIQ took $diff seconds" + log "Done uniq: UNIQ took $diff seconds" fi # run crop, Read 1 -echo -e "$(date)\t$scriptname\t********** running crop, Read1 **********" +log "********** running crop, Read1 **********" START1=$(date +%s) if [ $run_uniq == "Y" ] then - echo -e "$(date)\t$scriptname\tWe will be using $crop_length as the length of the cropped read" + log "We will be using $crop_length as the length of the cropped read" crop_reads.csh $basef.cutadapt.uniq.fastq $start_nt $crop_length > $basef.cutadapt.uniq.cropped.fastq else - echo -e "$(date)\t$scriptname\tWe will be using $crop_length as the length of the cropped read" + log "We will be using $crop_length as the length of the cropped read" crop_reads.csh $basef.cutadapt.fastq $start_nt $crop_length > $basef.cutadapt.cropped.fastq fi END1=$(date +%s) diff=$(( $END1 - $START1 )) -echo -e "$(date)\t$scriptname\tDone crop: CROP took $diff seconds" +log "Done crop: CROP took $diff seconds" # run dust, Read1 -echo -e "$(date)\t$scriptname\t********** running dust, Read1 **********" +log "********** running dust, Read1 **********" START1=$(date +%s) if [ $run_uniq == "Y" ] @@ -128,4 +130,4 @@ fi END1=$(date +%s) diff=$(( $END1 - $START1 )) -echo -e "$(date)\t$scriptname\tDone dust: DUST took $diff seconds" +log "Done dust: DUST took $diff seconds" diff --git a/preprocess_ncores.sh b/preprocess_ncores.sh index 89760a5..1b8ba03 100755 --- a/preprocess_ncores.sh +++ b/preprocess_ncores.sh @@ -13,8 +13,9 @@ # Please see license file for details. 
scriptname=${0##*/} +source logging.sh -if [ $# != 12 ]; then +if [ $# != 11 ]; then echo "Usage: $scriptname <# of cores> " exit fi @@ -25,13 +26,12 @@ quality=$2 run_uniq=$3 length_cutoff=$4 cores=$5 -cache_reset=$6 -keep_short_reads=$7 -adapter_set=$8 -start_nt=$9 -crop_length=${10} -temporary_files_directory=${11} -quality_cutoff=${12} +keep_short_reads=$6 +adapter_set=$7 +start_nt=$8 +crop_length=${9} +temporary_files_directory=${10} +quality_cutoff=${11} ### if [ ! -f $inputfile ] @@ -40,40 +40,33 @@ then exit fi -freemem=$(free -g | awk '{print $4}' | head -n 2 | tail -1) -echo -e "$(date)\t$scriptname\tThere is $freemem GB available free memory...[cutoff=$free_cache_cutoff GB]" -if [ $freemem -lt $free_cache_cutoff ] -then - echo -e "$(date)\t$scriptname\tClearing cache..." - dropcache -fi - START=$(date +%s) -echo -e "$(date)\t$scriptname\tSplitting $inputfile..." +log "Splitting $inputfile..." let "numlines = `wc -l $inputfile | awk '{print $1}'`" let "FASTQentries = numlines / 4" -echo -e "$(date)\t$scriptname\tThere are $FASTQentries FASTQ entries in $inputfile" +log "There are $FASTQentries FASTQ entries in $inputfile" let "LinesPerCore = numlines / $cores" -let "FASTQperCore = LinesPerCore / 4" +# Prevent leaving a small remainder file after split by adding 1. +let "FASTQperCore = LinesPerCore / 4 + 1" let "SplitPerCore = FASTQperCore * 4" -echo -e "$(date)\t$scriptname\twill use $cores cores with $FASTQperCore entries per core" +log "will use $cores cores with $FASTQperCore entries per core" split -l $SplitPerCore $inputfile END_SPLIT=$(date +%s) diff_SPLIT=$(( END_SPLIT - START )) -echo -e "$(date)\t$scriptname\tDone splitting: " -echo -e "$(date)\t$scriptname\tSPLITTING took $diff_SPLIT seconds" +log "Done splitting: " +log "SPLITTING took $diff_SPLIT seconds" -echo -e "$(date)\t$scriptname\tRunning preprocess script for each chunk..." +log "Running preprocess script for each chunk..." for f in x?? 
do mv $f $f.fastq - echo -e "$(date)\t$scriptname\tpreprocess.sh $f.fastq $quality N $length_cutoff $keep_short_reads $adapter_set $start_nt $crop_length $temporary_files_directory >& $f.preprocess.log &" + log "preprocess.sh $f.fastq $quality N $length_cutoff $keep_short_reads $adapter_set $start_nt $crop_length $temporary_files_directory >& $f.preprocess.log &" preprocess.sh $f.fastq $quality N $length_cutoff $keep_short_reads $adapter_set $start_nt $crop_length $temporary_files_directory $quality_cutoff >& $f.preprocess.log & done @@ -82,7 +75,7 @@ do wait $job done -echo -e "$(date)\t$scriptname\tDone preprocessing for each chunk..." +log "Done preprocessing for each chunk..." nopathf2=${1##*/} basef2=${nopathf2%.fastq} @@ -117,11 +110,11 @@ do rm -f $basef.cutadapt.cropped.fastq done -echo -e "$(date)\t$scriptname\tDone concatenating output..." +log "Done concatenating output..." if [ $run_uniq == "Y" ]; # selecting unique reads then - echo -e "$(date)\t$scriptname\tSelecting unique reads" + log "Selecting unique reads" START_UNIQ=$(date +%s) # selecting unique reads sed "n;n;n;d" $basef2.preprocessed.fastq | sed "n;n;d" | sed "s/^@/>/g" > $basef2.preprocessed.fasta @@ -130,28 +123,28 @@ then cp -f $basef2.uniq.fastq $basef2.preprocessed.fastq END_UNIQ=$(date +%s) diff_UNIQ=$(( END_UNIQ - START_UNIQ )) - echo -e "$(date)\t$scriptname\tUNIQ took $diff_UNIQ seconds" + log "UNIQ took $diff_UNIQ seconds" else - echo -e "$(date)\t$scriptname\tIncluding duplicates (did not run UNIQ)" + log "Including duplicates (did not run UNIQ)" fi END=$(date +%s) diff_TOTAL=$(( END - START )) let "avgtime1=`grep CUTADAPT $basef2.preprocess.log | awk '{print $12}' | sort -n | awk '{ a[i++]=$1} END {print a[int(i/2)];}'`" -echo -e "$(date)\t$scriptname\tmedian CUTADAPT time per core: $avgtime1 seconds" +log "median CUTADAPT time per core: $avgtime1 seconds" if [ $run_uniq = "Y" ]; then let "avgtime2 = $diff_UNIQ" - echo -e "$(date)\t$scriptname\tUNIQ time: $diff_UNIQ seconds" + log 
"UNIQ time: $diff_UNIQ seconds" else let "avgtime2=0" fi let "avgtime3=`grep DUST $basef2.preprocess.log | awk '{print $12}' | sort -n | awk '{ a[i++]=$1} END {print a[int(i/2)];}'`" -echo -e "$(date)\t$scriptname\tmedian DUST time per core: $avgtime3 seconds" +log "median DUST time per core: $avgtime3 seconds" let "totaltime = diff_SPLIT + avgtime1 + avgtime2 + avgtime3" -echo -e "$(date)\t$scriptname\tTOTAL TIME: $totaltime seconds" +log "TOTAL TIME: $totaltime seconds" -echo -e "$(date)\t$scriptname\tTOTAL CLOCK TIME (INCLUDING OVERHEAD): $diff_TOTAL seconds" +log "TOTAL CLOCK TIME (INCLUDING OVERHEAD): $diff_TOTAL seconds" diff --git a/snap_nt.sh b/snap_nt.sh index f0fe92c..c010cff 100755 --- a/snap_nt.sh +++ b/snap_nt.sh @@ -10,13 +10,14 @@ # Copyright (C) 2014 Charles Chiu - All Rights Reserved # SURPI has been released under a modified BSD license. # Please see license file for details. - -expected_args=6 scriptname=${0##*/} +source debug.sh +source logging.sh +expected_args=5 if [ $# -lt $expected_args ] then - echo "Usage: $scriptname " + echo "Usage: $scriptname " exit 65 fi @@ -24,49 +25,48 @@ fi inputfile=$1 SNAP_NT_index_directory=$2 cores=$3 -free_cache_cutoff=$4 -SNAP_d_cutoff=$5 -snap=$6 +SNAP_d_cutoff=$4 +snap=$5 ### -echo -e "$(date)\t$scriptname\tStarting SNAP to NT" +log "Starting SNAP to NT" START1=$(date +%s) -echo -e "$(date)\t$scriptname\tInput file: $inputfile" +log "Input file: $inputfile" nopathf=${inputfile##*/} # remove the path to file -echo -e "$(date)\t$scriptname\tAfter removing path: $nopathf" +log "After removing path: $nopathf" basef=${nopathf%.fastq} # remove FASTQextension -echo -e "$(date)\t$scriptname\tAfter removing FASTQ extension: $basef" - -echo -e "$(date)\t$scriptname\tMapping $basef to NT..." - -rm -f $basef.prev.sam -rm -f $basef.tmp.sam -rm -f $basef.tmp2.sam -rm -f $basef.NT.sam - +log "After removing FASTQ extension: $basef" + +log "Mapping $basef to NT..." 
+ +cleanup() { + rm -f $basef.prev.sam || true + rm -f $basef.tmp.sam || true + rm -f $basef.tmp2.sam || true + rm -f $basef.NT.sam || true + rm $basef.snapNT.log || true + rm $basef.timeNT.log || true +} +cleanup counter=0 -for snap_index in $SNAP_NT_index_directory/* ; do - freemem=$(free -g | awk '{print $4}' | head -n 2 | tail -1) - echo -e "$(date)\t$scriptname\tThere is $freemem GB available free memory...[cutoff=$free_cache_cutoff GB]" - if [ $freemem -lt $free_cache_cutoff ] - then - echo -e "$(date)\t$scriptname\tClearing cache..." - dropcache - fi - nopathsnap_index=${snap_index##*/} # remove the path to file - echo -e "$(date)\t$scriptname\tFound $snap_index ... processing ..." +for snap_index_basename in $(ls -1v "$SNAP_NT_index_directory") ; do + nopathsnap_index=$snap_index_basename # remove the path to file + snap_index="$SNAP_NT_index_directory/$snap_index_basename" + log "Found $snap_index_basename ... processing ..." START2=$(date +%s) if [[ $counter -eq 0 ]] then #running first SNAP chunk - /usr/bin/time -o $basef.snap.log $snap single $snap_index $basef.fastq -o $basef.tmp.sam -t $cores -x -f -h 250 -d $SNAP_d_cutoff -n 25 > $basef.time.log + /usr/bin/time -o $basef.time.log $snap single $snap_index $basef.fastq -o $basef.$nopathsnap_index.sam -t $cores -x -f -h 250 -d $SNAP_d_cutoff -n 25 > $basef.snap.log + ln --symbolic --force $basef.$nopathsnap_index.sam $basef.tmp.sam # cp $basef.tmp.sam temp.sam else #running 2nd SNAP chunk through last SNAP chunk - /usr/bin/time -o $basef.snap.log $snap single $snap_index $basef.tmp.fastq -o $basef.tmp.sam -t $cores -x -f -h 250 -d $SNAP_d_cutoff -n 25 > $basef.time.log + /usr/bin/time -o $basef.time.log $snap single $snap_index $basef.tmp.fastq -o $basef.$nopathsnap_index.sam -t $cores -x -f -h 250 -d $SNAP_d_cutoff -n 25 > $basef.snap.log + ln --symbolic --force $basef.$nopathsnap_index.sam $basef.tmp.sam fi cat $basef.snap.log >> $basef.snapNT.log @@ -80,9 +80,9 @@ for snap_index in 
$SNAP_NT_index_directory/* ; do END2=$(date +%s) - echo -e "$(date)\t$scriptname\tDone with $snap_index." + log "Done with $snap_index." diff=$(( $END2 - $START2 )) - echo -e "$(date)\t$scriptname\tMapping of $snap_index took $diff seconds." + log "Mapping of $snap_index took $diff seconds." done # need to restore the hits @@ -91,9 +91,10 @@ update_sam.py $basef.prev.sam $basef.NT.sam rm -f $basef.tmp.sam rm -f $basef.tmp.fastq rm -f $basef.prev.sam +rm -f $basef.$nopathsnap_index.* END1=$(date +%s) -echo -e "$(date)\t$scriptname\tDone with SNAP_NT" +log "Done with SNAP_NT" diff=$(( $END1 - $START1 )) -echo -e "$(date)\t$scriptname\toutput written to $basef.NT.sam" -echo -e "$(date)\t$scriptname\tSNAP_NT took $diff seconds" +log "output written to $basef.NT.sam" +log "SNAP_NT took $diff seconds" From f60acf68a3403cb22c517ed68cd0bb190467111f Mon Sep 17 00:00:00 2001 From: Simon Ye Date: Thu, 21 May 2015 21:46:58 -0400 Subject: [PATCH 18/18] Cleanup SURPI No snap-dev, just snap No req on ABYSS-P No dropcache Add specific checkpointing file --- SURPI.sh | 519 ++++++++++++++++++++++++++----------------------------- 1 file changed, 245 insertions(+), 274 deletions(-) diff --git a/SURPI.sh b/SURPI.sh index 1193cbe..fe9a19c 100755 --- a/SURPI.sh +++ b/SURPI.sh @@ -14,14 +14,16 @@ SURPI_version="1.0.22" optspec=":f:d:hvz:" -bold=$(tput bold) -normal=$(tput sgr0) +bold=$(tput bold 2> /dev/null || true) +normal=$(tput sgr0 2> /dev/null || true) green='\e[0;32m' red='\e[0;31m' endColor='\e[0m' host=$(hostname) scriptname=${0##*/} +source debug.sh +source logging.sh while getopts "$optspec" option; do case "${option}" in @@ -29,10 +31,10 @@ while getopts "$optspec" option; do d) ref_path=${OPTARG:-/reference};; # Path to reference db directory. h) HELP=1;; v) VERIFICATION=1;; - z) create_config_file=${OPTARG} + z) create_config_file=${OPTARG} configprefix=${create_config_file%.fastq} ;; - :) echo "Option -$OPTARG requires an argument." 
>&2 + :) echo "Option -$OPTARG requires an argument." >&2 exit 1 ;; esac @@ -40,58 +42,59 @@ done if [[ $HELP -eq 1 || $# -lt 1 ]] then - cat < go_$configprefix + echo "PATH=$PATH:/usr/local/bin/surpi:/usr/local/bin:/usr/bin/:/bin" > go_$configprefix echo "nohup $scriptname -f $configprefix.config > SURPI.$configprefix.log 2> SURPI.$configprefix.err" >> go_$configprefix chmod +x go_$configprefix @@ -153,7 +156,8 @@ run_mode="Comprehensive" # $basef.cutadapt.fastq # $basef.preprocessed.fastq #preprocess="skip" - +#human_mapping="skip" +#alignment="skip" ########################## # Preprocessing @@ -178,7 +182,7 @@ quality_cutoff=18 ########################## #SNAP executable -snap="/usr/local/bin/snap-dev" +snap="snap" #SNAP edit distance for Computational Subtraction of host genome [Highly recommended default: d_human=12] #see Section 3.1.2 MaxDist description: http://snap.cs.berkeley.edu/downloads/snap-1.0beta-manual.pdf @@ -247,7 +251,7 @@ eBLASTn="1e-15" # SURPI will subtract all SNAP databases found in this directory from the input sequence # Useful if you want to subtract multiple genomes (without combining SNAP databases) # or, if you need to split a db if it is larger than available RAM. 
-SNAP_subtraction_db="$ref_path/hg19" +SNAP_subtraction_folder="$ref_path/Subtraction_SNAP" # directory for SNAP-indexed databases of NCBI NT (for mapping phase in comprehensive mode) # directory must ONLY contain snap indexed databases @@ -266,11 +270,11 @@ taxonomy_db_directory="$ref_path/taxonomy" #RAPSearch viral database name: indexed protein dataset (all of Viruses) #make sure that directory also includes the .info file -RAPSearch_VIRUS_db="$ref_path/RAPSearch/rapsearch_viral_db" +RAPSearch_VIRUS_db="$ref_path/RAPSearch/rapsearch_viral_aa_130628_db_v2.12" #RAPSearch nr database name: indexed protein dataset (all of NR) #make sure that directory also includes the .info file -RAPSearch_NR_db="$ref_path/RAPSearch/rapsearch_nr_db" +RAPSearch_NR_db="$ref_path/RAPSearch/rapsearch_nr_db_v2.12" ribo_snap_bac_euk_directory="$ref_path/RiboClean_SNAP" @@ -287,11 +291,6 @@ ribo_snap_bac_euk_directory="$ref_path/RiboClean_SNAP" #This folder will not be created by SURPI, so be sure it already exists with proper permissions. temporary_files_directory="/tmp/" -#This parameter controls whether dropcache is used throughout the pipeline. If free RAM is less than cache_reset, -# then dropcache. If cache_reset=0, then dropcache will never be used. -cache_reset="0" - - ########################## # AWS related values ########################## @@ -403,25 +402,6 @@ then exit 65 fi -#set cache_reset. if none specified: -# >500GB -> 200GB -# >200GB -> 150GB -# otherwise -> 50GB -# note: this may cause problems on a machine with <50GB RAM -if [ ! $cache_reset ] -then - total_ram=$(grep MemTotal /proc/meminfo | awk '{print $2}') - if [ "$total_ram" -gt "500000000" ] #500GiB - then - cache_reset=200 # This is 200GB - elif [ "$total_ram" -gt "200000000" ] #200 GiB - then - cache_reset=150 - else - cache_reset=50 - fi -fi - #these 2 parameters are for cropping prior to snap in the preprocessing stage if [ ! 
$start_nt ] then @@ -512,10 +492,8 @@ seqtk_version=$(seqtk 2>&1 | head -3 | tail -1 | awk '{print $2}') cutadapt_version=$(cutadapt --version) prinseqlite_version=$(prinseq-lite.pl --version 2>&1 | awk '{print $2}') snap_version=$(snap 2>&1 | grep version | awk '{print $5}') -snap_dev_version=$(snap-dev 2>&1 | grep version | awk '{print $5}') rapsearch_version=$(rapsearch 2>&1 | head -2 | tail -1 | awk '{print $2}') abyss_pe_version=$(abyss-pe version | head -2 | tail -1 | awk '{print $3}') -ABYSS_P_version=$(ABYSS-P --version | head -1 | awk '{print $3}') Minimo_version=$(Minimo -h | tail -2 | awk '{print $2}') echo -e "SURPI version: $SURPI_version" echo -e "config file version: $config_file_version" @@ -524,10 +502,8 @@ echo -e "seqtk: $seqtk_version" echo -e "cutadapt: $cutadapt_version" echo -e "prinseq-lite: $prinseqlite_version${endColor}" echo -e "snap: $snap_version${endColor}" -echo -e "snap-dev: $snap_dev_version${endColor}" echo -e "RAPSearch: $rapsearch_version" echo -e "abyss-pe: $abyss_pe_version" -echo -e "ABYSS-P: $ABYSS_P_version" echo -e "Minimo: $Minimo_version" echo "-----------------------------------------------------------------------------------------" @@ -547,9 +523,10 @@ do done echo -e "SNAP Comprehensive Mode database" -for f in $SNAP_COMPREHENSIVE_db_dir/* +for f_basename in $(ls -1v "$SNAP_COMPREHENSIVE_db_dir") do - if [ -f $f/Genome ] + f="$SNAP_COMPREHENSIVE_db_dir/$f_basename" + if [ -f $f/Genome ] then echo -e "\t$f: ${green}OK${endColor}" else @@ -578,7 +555,8 @@ done #verify taxonomy is functioning properly result=$( taxonomy_lookup_embedded.pl -d nucl -q $taxonomy_db_directory 149408158 ) -if [ $result = "149408158" ] +echo "taxonomy result: ${result}" +if [ $result == "149408158" ] then echo -e "taxonomy: ${green}OK${endColor}" else @@ -679,7 +657,6 @@ echo "contigcutoff for abyss assembly unitigs: $contigcutoff" echo "abysskmer length: $abysskmer" echo "Ignore barcodes for assembly? 
$ignore_barcodes_for_de_novo" -echo "cache_reset (if 0, then dropcache will never be used): $cache_reset" echo "start_nt: $start_nt" echo "crop_length: $crop_length" @@ -736,10 +713,10 @@ then exit fi ########################################################### -echo -e "$(date)\t$scriptname\t########## STARTING SURPI PIPELINE ##########" +log "########## STARTING SURPI PIPELINE ##########" START_PIPELINE=$(date +%s) -echo -e "$(date)\t$scriptname\tFound file $FASTQ_file" -echo -e "$(date)\t$scriptname\tAfter removing path: $nopathf" +log "Found file $FASTQ_file" +log "After removing path: $nopathf" ############ Start up AWS slave machines ################## file_with_slave_ips="slave_list.txt" @@ -747,49 +724,51 @@ file_with_slave_ips="slave_list.txt" if [ "$snap_nt_procedure" = "AWS_master_slave" ] then # start the slaves as a background process. They should be ready to run at the SNAP to NT step in the pipeline. - start_slaves.sh $ami_id $actual_slave_instances $instance_type $keypair $security_group $availability_zone $file_with_slave_ips $placement_group & # > $basef.AWS.log 2>&1 + start_slaves.sh $ami_id $actual_slave_instances $instance_type $keypair $security_group $availability_zone $file_with_slave_ips $placement_group & # > $basef.AWS.log "2>&1 fi ############ PREPROCESSING ################## -if [ "$preprocess" != "skip" ] +if grep -qs preprocess_complete "$basef.checkpoint"; then + preprocess_complete=1 +fi +if [[ "$preprocess" != "skip" ]] && [[ "$preprocess_complete" != 1 ]] then - echo -e "$(date)\t$scriptname\t############### PREPROCESSING ###############" - echo -e "$(date)\t$scriptname\tStarting: preprocessing using $cores cores " + if [[ -f $basef.cutadapt.fastq.done ]]; then + cp $basef.cutadapt.fastq + fi + log "############### PREPROCESSING ###############" + log "Starting: preprocessing using $cores cores " START_PREPROC=$(date +%s) - echo -e "$(date)\t$scriptname\tParameters: preprocess_ncores.sh $basef.fastq $quality N $length_cutoff $cores 
Y N $adapter_set $start_nt $crop_length $temporary_files_directory >& $basef.preprocess.log" - preprocess_ncores.sh $basef.fastq $quality N $length_cutoff $cores $cache_reset N $adapter_set $start_nt $crop_length $temporary_files_directory $quality_cutoff >& $basef.preprocess.log - echo -e "$(date)\t$scriptname\tDone: preprocessing " + log "Parameters: preprocess_ncores.sh $basef.fastq $quality N $length_cutoff $cores Y N $adapter_set $start_nt $crop_length $temporary_files_directory >& $basef.preprocess.log" + preprocess_ncores.sh $basef.fastq $quality N $length_cutoff $cores N $adapter_set $start_nt $crop_length $temporary_files_directory $quality_cutoff >& $basef.preprocess.log + log "Done: preprocessing " END_PREPROC=$(date +%s) diff_PREPROC=$(( END_PREPROC - START_PREPROC )) - echo -e "$(date)\t$scriptname\tPreprocessing took $diff_PREPROC seconds" | tee timing.$basef.log -fi -# verify preprocessing step -if [ ! -s $basef.cutadapt.fastq ] || [ ! -s $basef.preprocessed.fastq ] -then - echo -e "$(date)\t$scriptname\t${red}Preprocessing appears to have failed. One of the following files does not exist, or is of 0 size:${endColor}" - echo "$basef.cutadapt.fastq" - echo "$basef.preprocessed.fastq" - exit + log "Preprocessing took $diff_PREPROC seconds" | tee timing.$basef.log + + # verify preprocessing step + if [ ! -s $basef.cutadapt.fastq ] || [ ! -s $basef.preprocessed.fastq ] + then + log "${red}Preprocessing appears to have failed. 
One of the following files does not exist, or is of 0 size:${endColor}" + echo "$basef.cutadapt.fastq" + echo "$basef.preprocessed.fastq" + exit + else + echo "preprocess_complete" >> "$basef.checkpoint" + fi fi ############# BEGIN SNAP PIPELINE ################# -freemem=$(free -g | awk '{print $4}' | head -n 2 | tail -1) -echo -e "$(date)\t$scriptname\tThere is $freemem GB available free memory...[cutoff=$cache_reset GB]" -if [[ $dropcache == "Y" ]] -then - if [ "$freemem" -lt "$cache_reset" ] - then - echo -e "$(date)\t$scriptname\tClearing cache..." - dropcache - fi -fi +basef_h=${nopathf%.fastq}.preprocessed.s20.h250n25d${d_human}xfu # remove fastq extension ############# HUMAN MAPPING ################# -if [ "$human_mapping" != "skip" ] +if grep -qs human_mapping_complete "$basef.checkpoint"; then + human_mapping_complete=1 +fi +if [ "$human_mapping" != "skip" ] && [[ "$human_mapping_complete" != 1 ]] then - echo -e "$(date)\t$scriptname\t############### SNAP TO HUMAN ###############" - basef_h=${nopathf%.fastq}.preprocessed.s20.h250n25d${d_human}xfu # remove fastq extension - echo -e "$(date)\t$scriptname\tBase file: $basef_h" - echo -e "$(date)\t$scriptname\tStarting: $basef_h human mapping" + log "############### SNAP TO HUMAN ###############" + log "Base file: $basef_h" + log "Starting: $basef_h human mapping" file_to_subtract="$basef.preprocessed.fastq" subtracted_output_file="$basef_h.human.snap.unmatched.sam" @@ -799,50 +778,43 @@ then for SNAP_subtraction_db in $SNAP_subtraction_folder/*; do SUBTRACTION_COUNTER=$[$SUBTRACTION_COUNTER +1] # check if SNAP db is cached in RAM, use optimal parameters depending on result - SNAP_db_cached=$(vmtouch -m500G -f "$SNAP_subtraction_db" | grep 'Resident Pages' | awk '{print $5}') - if [[ "$SNAP_db_cached" == "100%" ]] - then - echo -e "$(date)\t$scriptname\tSNAP database is cached ($SNAP_db_cached)." 
- SNAP_cache_option=" -map " - else - echo -e "$(date)\t$scriptname\tSNAP database is not cached ($SNAP_db_cached)." - SNAP_cache_option=" -pre -map " - fi - echo -e "$(date)\t$scriptname\tParameters: snap-dev single $SNAP_subtraction_db $file_to_subtract -o -sam $subtracted_output_file.$SUBTRACTION_COUNTER.sam -t $cores -x -f -h 250 -d ${d_human} -n 25 -F u $SNAP_cache_option" + #SNAP_db_cached=$(vmtouch -m500G -f "$SNAP_subtraction_db" | grep 'Resident Pages' | awk '{print $5}') + #if [[ "$SNAP_db_cached" == "100%" ]] + #then + #log "SNAP database is cached ($SNAP_db_cached)." + #SNAP_cache_option=" -map " + #else + #log "SNAP database is not cached ($SNAP_db_cached)." + #SNAP_cache_option=" -pre -map " + #fi + log "Parameters: snap single $SNAP_subtraction_db $file_to_subtract -o -sam $subtracted_output_file.$SUBTRACTION_COUNTER.sam -t $cores -x -f -h 250 -d ${d_human} -n 25 -F u $SNAP_cache_option" START_SUBTRACTION_STEP=$(date +%s) - snap-dev single "$SNAP_subtraction_db" "$file_to_subtract" -o -sam "$subtracted_output_file.$SUBTRACTION_COUNTER.sam" -t $cores -x -f -h 250 -d ${d_human} -n 25 -F u $SNAP_cache_option + snap single "$SNAP_subtraction_db" "$file_to_subtract" -o -sam "$subtracted_output_file.$SUBTRACTION_COUNTER.sam" -t $cores -x -f -h 250 -d ${d_human} -n 25 -F u $SNAP_cache_option END_SUBTRACTION_STEP=$(date +%s) - echo -e "$(date)\t$scriptname\tDone: SNAP to human" + log "Done: SNAP to human" diff_SUBTRACTION_STEP=$(( END_SUBTRACTION_STEP - START_SUBTRACTION_STEP )) - echo -e "$(date)\t$scriptname\tSubtraction step: $SUBTRACTION_COUNTER took $diff_SUBTRACTION_STEP seconds" + log "Subtraction step: $SUBTRACTION_COUNTER took $diff_SUBTRACTION_STEP seconds" file_to_subtract="$subtracted_output_file.$SUBTRACTION_COUNTER.sam" done egrep -v "^@" "$subtracted_output_file.$SUBTRACTION_COUNTER.sam" | awk '{if($3 == "*") print "@"$1"\n"$10"\n""+"$1"\n"$11}' > $(echo "$basef_h".human.snap.unmatched.sam | sed 's/\(.*\)\..*/\1/').fastq END_SUBTRACTION=$(date 
+%s) diff_SUBTRACTION=$(( END_SUBTRACTION - START_SUBTRACTION )) - rm $subtracted_output_file.*.sam - echo -e "$(date)\t$scriptname\tSubtraction took $diff_SUBTRACTION seconds" | tee -a timing.$basef.log -fi -######dropcache?############# -freemem=$(free -g | awk '{print $4}' | head -n 2 | tail -1) -echo -e "$(date)\t$scriptname\tThere is $freemem GB available free memory...[cutoff=$cache_reset GB]" -if [[ $dropcache == "Y" ]] -then - if [ "$freemem" -lt "$cache_reset" ] - then - echo -e "$(date)\t$scriptname\tClearing cache..." - dropcache - fi + # rm $subtracted_output_file.*.sam + log "Subtraction took $diff_SUBTRACTION seconds" | tee -a timing.$basef.log + echo "human_mapping_complete" >> "$basef.checkpoint" fi + ############################# SNAP TO NT ############################## if [ "$alignment" != "skip" ] then - if [ ! -f $basef.NT.snap.sam ] + if [ -f $basef.NT.snap.sam ] # Cache SNAP NT hits. then - echo -e "$(date)\t$scriptname\t####### SNAP UNMATCHED SEQUENCES TO NT ######" - echo -e -n "$(date)\t$scriptname\tCalculating number of sequences to analyze using SNAP to NT: " - echo $(awk 'NR%4==1' "$basef_h".human.snap.unmatched.fastq | wc -l) - echo -e "$(date)\t$scriptname\tStarting: Mapping by SNAP to NT from $basef_h.human.snap.unmatched.fastq" + log "Using cached $basef.NT.snap.sam" + else + log "####### SNAP UNMATCHED SEQUENCES TO NT ######" + num_seq_snap_nt=$(awk 'NR%4==1' "$basef_h".human.snap.unmatched.fastq | wc -l) + log "Calculating number of sequences to analyze using SNAP to NT: $num_seq_snap_nt" + log "Starting: Mapping by SNAP to NT from $basef_h.human.snap.unmatched.fastq" START_SNAPNT=$(date +%s) # SNAP to NT for unmatched reads (d value threshold cutoff = 12) @@ -850,8 +822,8 @@ then then if [ $snap_integrator = "inline" ] then - echo -e "$(date)\t$scriptname\tParameters: snap_nt.sh $basef_h.human.snap.unmatched.fastq ${SNAP_COMPREHENSIVE_db_dir} $cores $cache_reset $d_NT_alignment $snap" - snap_nt.sh 
"$basef_h.human.snap.unmatched.fastq" "${SNAP_COMPREHENSIVE_db_dir}" "$cores" "$cache_reset" "$d_NT_alignment" "$snap" + log "Parameters: snap_nt.sh $basef_h.human.snap.unmatched.fastq ${SNAP_COMPREHENSIVE_db_dir} $cores $d_NT_alignment $snap" + snap_nt.sh "$basef_h.human.snap.unmatched.fastq" "${SNAP_COMPREHENSIVE_db_dir}" "$cores" "$d_NT_alignment" "$snap" elif [ $snap_integrator = "end" ] then if [ "$snap_nt_procedure" = "AWS_master_slave" ] @@ -866,38 +838,40 @@ then sleep 2 done echo - echo -e "$(date)\t$scriptname\tParameters: snap_on_slave.sh $basef_h.human.snap.unmatched.fastq $pemkey $file_with_slave_ips $incoming_dir ${basef}.NT.snap.sam $d_NT_alignment" + log "Parameters: snap_on_slave.sh $basef_h.human.snap.unmatched.fastq $pemkey $file_with_slave_ips $incoming_dir ${basef}.NT.snap.sam $d_NT_alignment" snap_on_slave.sh "$basef_h.human.snap.unmatched.fastq" "$pemkey" "$file_with_slave_ips" "$incoming_dir" "${basef}.NT.snap.sam" "$d_human"> "$basef.AWS.log" 2>&1 elif [ "$snap_nt_procedure" = "solo" ] then - echo -e "$(date)\t$scriptname\tParameters: snap_nt_combine.sh $basef_h.human.snap.unmatched.fastq ${SNAP_COMPREHENSIVE_db_dir} $cores $cache_reset $d_NT_alignment $num_simultaneous_SNAP_runs" - snap_nt_combine.sh "$basef_h.human.snap.unmatched.fastq" "${SNAP_COMPREHENSIVE_db_dir}" "$cores" "$cache_reset" "$d_NT_alignment" "$num_simultaneous_SNAP_runs" + log "Parameters: snap_nt_combine.sh $basef_h.human.snap.unmatched.fastq ${SNAP_COMPREHENSIVE_db_dir} $cores $d_NT_alignment $num_simultaneous_SNAP_runs" + snap_nt_combine.sh "$basef_h.human.snap.unmatched.fastq" "${SNAP_COMPREHENSIVE_db_dir}" "$cores" "$d_NT_alignment" "$num_simultaneous_SNAP_runs" fi fi elif [ $run_mode = "Fast" ] then - echo -e "$(date)\t$scriptname\tParameters: snap_nt.sh $basef_h.human.snap.unmatched.fastq ${SNAP_FAST_db_dir} $cores $cache_reset $d_NT_alignment $snap" - snap_nt.sh "$basef_h.human.snap.unmatched.fastq" "${SNAP_FAST_db_dir}" "$cores" "$cache_reset" "$d_NT_alignment" 
"$snap" + log "Parameters: snap_nt.sh $basef_h.human.snap.unmatched.fastq ${SNAP_FAST_db_dir} $cores $d_NT_alignment $snap" + snap_nt.sh "$basef_h.human.snap.unmatched.fastq" "${SNAP_FAST_db_dir}" "$cores" "$d_NT_alignment" "$snap" fi - echo -e "$(date)\t$scriptname\tDone: SNAP to NT" + log "Done: SNAP to NT" END_SNAPNT=$(date +%s) diff_SNAPNT=$(( END_SNAPNT - START_SNAPNT )) - echo -e "$(date)\t$scriptname\tSNAP to NT took $diff_SNAPNT seconds." | tee -a timing.$basef.log + log "SNAP to NT took $diff_SNAPNT seconds." | tee -a timing.$basef.log mv -f "$basef_h.human.snap.unmatched.NT.sam" "$basef.NT.snap.sam" fi - echo -e "$(date)\t$scriptname\tStarting: parsing $basef.NT.snap.sam" - echo -e "$(date)\t$scriptname\textract matched/unmatched $basef.NT.snap.sam" - egrep -v "^@" $basef.NT.snap.sam | awk '{if($3 != "*") print }' > $basef.NT.snap.matched.sam - egrep -v "^@" $basef.NT.snap.sam | awk '{if($3 == "*") print }' > $basef.NT.snap.unmatched.sam - echo -e "$(date)\t$scriptname\tconvert sam to fastq from $basef.NT.snap.sam" - echo -e "$(date)\t$scriptname\tDone: parsing $basef.NT.snap.unmatched.sam" - if [ ! -f "$basef.NT.snap.matched.all.annotated" ] + + if [ ! 
-f "$basef.NT.snap.matched.fulllength.all.annotated.sorted" ] then + log "Starting: parsing $basef.NT.snap.sam" + log "extract matched/unmatched $basef.NT.snap.sam" + egrep -v "^@" $basef.NT.snap.sam | awk '{if($3 != "*") print }' > $basef.NT.snap.matched.sam + egrep -v "^@" $basef.NT.snap.sam | awk '{if($3 == "*") print }' > $basef.NT.snap.unmatched.sam + + log "convert sam to fastq from $basef.NT.snap.sam" + log "Done: parsing $basef.NT.snap.unmatched.sam" ## convert to FASTQ and retrieve full-length sequences - echo -e "$(date)\t$scriptname\tconvert to FASTQ and retrieve full-length sequences for SNAP NT matched hits" - echo -e "$(date)\t$scriptname\tParameters: extractHeaderFromFastq_ncores.sh $cores $basef.cutadapt.fastq $basef.NT.snap.matched.sam $basef.NT.snap.matched.fulllength.fastq $basef.NT.snap.unmatched.sam $basef.NT.snap.unmatched.fulllength.fastq" + log "convert to FASTQ and retrieve full-length sequences for SNAP NT matched hits" + log "Parameters: extractHeaderFromFastq_ncores.sh $cores $basef.cutadapt.fastq $basef.NT.snap.matched.sam $basef.NT.snap.matched.fulllength.fastq $basef.NT.snap.unmatched.sam $basef.NT.snap.unmatched.fulllength.fastq" extractHeaderFromFastq_ncores.sh "$cores" "$basef.cutadapt.fastq" "$basef.NT.snap.matched.sam" "$basef.NT.snap.matched.fulllength.fastq" "$basef.NT.snap.unmatched.sam" "$basef.NT.snap.unmatched.fulllength.fastq" #SNN140507 sort -k1,1 "$basef.NT.snap.matched.sam" > "$basef.NT.snap.matched.sorted.sam" cut -f1-9 "$basef.NT.snap.matched.sorted.sam" > "$basef.NT.snap.matched.sorted.sam.tmp1" @@ -905,72 +879,77 @@ then awk '(NR%4==1) {printf("%s\t",$0)} (NR%4==2) {printf("%s\t", $0)} (NR%4==0) {printf("%s\n",$0)}' "$basef.NT.snap.matched.fulllength.fastq" | sort -k1,1 | awk '{print $2 "\t" $3}' > "$basef.NT.snap.matched.fulllength.sequence.txt" #SNN140507 change this to bring in quality lines as well paste "$basef.NT.snap.matched.sorted.sam.tmp1" "$basef.NT.snap.matched.fulllength.sequence.txt" 
"$basef.NT.snap.matched.sorted.sam.tmp2" > "$basef.NT.snap.matched.fulllength.sam" ###retrieve taxonomy matched to NT ### - echo -e "$(date)\t$scriptname\ttaxonomy retrieval for $basef.NT.snap.matched.fulllength.sam" - echo -e "$(date)\t$scriptname\tParameters: taxonomy_lookup.pl $basef.NT.snap.matched.fulllength.sam sam nucl $cores $taxonomy_db_directory" + log "taxonomy retrieval for $basef.NT.snap.matched.fulllength.sam" + log "Parameters: taxonomy_lookup.pl $basef.NT.snap.matched.fulllength.sam sam nucl $cores $taxonomy_db_directory" taxonomy_lookup.pl "$basef.NT.snap.matched.fulllength.sam" sam nucl $cores $taxonomy_db_directory sort -k 13.7n "$basef.NT.snap.matched.fulllength.all.annotated" > "$basef.NT.snap.matched.fulllength.all.annotated.sorted" # sam format is no longer disturbed rm -f "$basef.NT.snap.matched.fulllength.gi" "$basef.NT.snap.matched.fullength.gi.taxonomy" fi -# adjust filenames for FAST mode - grep "Viruses;" "$basef.NT.snap.matched.fulllength.all.annotated.sorted" > "$basef.NT.snap.matched.fl.Viruses.annotated" - grep "Bacteria;" "$basef.NT.snap.matched.fulllength.all.annotated.sorted" > "$basef.NT.snap.matched.fl.Bacteria.annotated" - - ##SNN140507 cleanup bacterial reads - echo -e "$(date)\t$scriptname\tParameters: ribo_snap_bac_euk.sh $basef.NT.snap.matched.fl.Bacteria.annotated BAC $cores $ribo_snap_bac_euk_directory" - ribo_snap_bac_euk.sh $basef.NT.snap.matched.fl.Bacteria.annotated BAC $cores $ribo_snap_bac_euk_directory #SNN140507 - if [ $run_mode = "Comprehensive" ] - then - grep "Primates;" "$basef.NT.snap.matched.fulllength.all.annotated.sorted" > "$basef.NT.snap.matched.fl.Primates.annotated" - grep -v "Primates" "$basef.NT.snap.matched.fulllength.all.annotated.sorted" | grep "Mammalia" > "$basef.NT.snap.matched.fl.nonPrimMammal.annotated" - grep -v "Mammalia" "$basef.NT.snap.matched.fulllength.all.annotated.sorted" | grep "Chordata" > "$basef.NT.snap.matched.fl.nonMammalChordat.annotated" - grep -v "Chordata" 
"$basef.NT.snap.matched.fulllength.all.annotated.sorted" | grep "Eukaryota" > "$basef.NT.snap.matched.fl.nonChordatEuk.annotated" - ribo_snap_bac_euk.sh $basef.NT.snap.matched.fl.nonChordatEuk.annotated EUK $cores $ribo_snap_bac_euk_directory - fi - echo -e "$(date)\t$scriptname\tDone taxonomy retrieval" - echo -e "$(date)\t$scriptname\tParameters: table_generator.sh $basef.NT.snap.matched.fl.Viruses.annotated SNAP Y Y Y Y>& $basef.table_generator_snap.matched.fl.log" - table_generator.sh "$basef.NT.snap.matched.fl.Viruses.annotated" SNAP Y Y Y Y>& "$basef.table_generator_snap.matched.fl.log" - if [ $run_mode = "Comprehensive" ] - then - ### convert to FASTQ and retrieve full-length sequences to add to unmatched SNAP for viral RAPSearch### - egrep -v "^@" "$basef.NT.snap.matched.fl.Viruses.annotated" | awk '{if($3 != "*") print "@"$1"\n"$10"\n""+"$1"\n"$11}' > $(echo "$basef.NT.snap.matched.fl.Viruses.annotated" | sed 's/\(.*\)\..*/\1/').fastq - echo -e "$(date)\t$scriptname\tDone: convert to FASTQ and retrieve full-length sequences for SNAP NT hits " - fi - echo -e "$(date)\t$scriptname\t############# SORTING unmatched to NT BY LENGTH AND UNIQ AND LOOKUP ORIGINAL SEQUENCES #################" - if [ $run_mode = "Comprehensive" ] - then - #SNN 140507 extractHeaderFromFastq.csh "$basef.NT.snap.unmatched.fastq" FASTQ "$basef.cutadapt.fastq" "$basef.NT.snap.unmatched.fulllength.fastq" - sed "n;n;n;d" "$basef.NT.snap.unmatched.fulllength.fastq" | sed "n;n;d" | sed "s/^@/>/g" > "$basef.NT.snap.unmatched.fulllength.fasta" - fi - cat "$basef.NT.snap.unmatched.fulllength.fasta" | perl -e 'while (<>) {$h=$_; $s=<>; $seqs{$h}=$s;} foreach $header (reverse sort {length($seqs{$a}) <=> length($seqs{$b})} keys %seqs) {print $header.$seqs{$header}}' > $basef.NT.snap.unmatched.fulllength.sorted.fasta - if [ $run_mode = "Comprehensive" ] - then - echo -e "$(date)\t$scriptname\twe will be using 50 as the length of the cropped read for removing unique and low-complexity reads" - echo 
-e "$(date)\t$scriptname\tParameters: crop_reads.csh $basef.NT.snap.unmatched.fulllength.sorted.fasta 25 50 > $basef.NT.snap.unmatched.fulllength.sorted.cropped.fasta" - crop_reads.csh "$basef.NT.snap.unmatched.fulllength.sorted.fasta" 25 50 > "$basef.NT.snap.unmatched.fulllength.sorted.cropped.fasta" - echo -e "$(date)\t$scriptname\t*** reads cropped ***" - echo -e "$(date)\t$scriptname\tParameters: gt sequniq -seqit -force -o $basef.NT.snap.unmatched.fulllength.sorted.cropped.uniq.fasta $basef.NT.snap.unmatched.fulllength.sorted.cropped.fasta" - gt sequniq -seqit -force -o "$basef.NT.snap.unmatched.fulllength.sorted.cropped.uniq.fasta" "$basef.NT.snap.unmatched.fulllength.sorted.cropped.fasta" - echo -e "$(date)\t$scriptname\tParameters: extractAlltoFast.sh $basef.NT.snap.unmatched.fulllength.sorted.cropped.uniq.fasta FASTA $basef.NT.snap.unmatched.fulllength.fasta FASTA $basef.NT.snap.unmatched.uniq.fl.fasta FASTA" - extractAlltoFast.sh "$basef.NT.snap.unmatched.fulllength.sorted.cropped.uniq.fasta" FASTA "$basef.NT.snap.unmatched.fulllength.fasta" FASTA "$basef.NT.snap.unmatched.uniq.fl.fasta" FASTA #SNN140507 - fi - echo -e "$(date)\t$scriptname\tDone uniquing full length sequences of unmatched to NT" + + if [ ! 
-f "$basef.NT.snap.unmatched.uniq.fl.fasta" ] + then + # adjust filenames for FAST mode + grep "Viruses;" "$basef.NT.snap.matched.fulllength.all.annotated.sorted" > "$basef.NT.snap.matched.fl.Viruses.annotated" + grep "Bacteria;" "$basef.NT.snap.matched.fulllength.all.annotated.sorted" > "$basef.NT.snap.matched.fl.Bacteria.annotated" + + ##SNN140507 cleanup bacterial reads + log "Parameters: ribo_snap_bac_euk.sh $basef.NT.snap.matched.fl.Bacteria.annotated BAC $cores $ribo_snap_bac_euk_directory" + ribo_snap_bac_euk.sh $basef.NT.snap.matched.fl.Bacteria.annotated BAC $cores $ribo_snap_bac_euk_directory #SNN140507 + if [ $run_mode = "Comprehensive" ] + then + grep "Primates;" "$basef.NT.snap.matched.fulllength.all.annotated.sorted" > "$basef.NT.snap.matched.fl.Primates.annotated" + grep -v "Primates" "$basef.NT.snap.matched.fulllength.all.annotated.sorted" | grep "Mammalia" > "$basef.NT.snap.matched.fl.nonPrimMammal.annotated" + grep -v "Mammalia" "$basef.NT.snap.matched.fulllength.all.annotated.sorted" | grep "Chordata" > "$basef.NT.snap.matched.fl.nonMammalChordat.annotated" + grep -v "Chordata" "$basef.NT.snap.matched.fulllength.all.annotated.sorted" | grep "Eukaryota" > "$basef.NT.snap.matched.fl.nonChordatEuk.annotated" + ribo_snap_bac_euk.sh $basef.NT.snap.matched.fl.nonChordatEuk.annotated EUK $cores $ribo_snap_bac_euk_directory + fi + log "Done taxonomy retrieval" + log "Parameters: table_generator.sh $basef.NT.snap.matched.fl.Viruses.annotated SNAP Y Y Y Y>& $basef.table_generator_snap.matched.fl.log" + table_generator.sh "$basef.NT.snap.matched.fl.Viruses.annotated" SNAP Y Y Y Y>& "$basef.table_generator_snap.matched.fl.log" + if [ $run_mode = "Comprehensive" ] + then + ### convert to FASTQ and retrieve full-length sequences to add to unmatched SNAP for viral RAPSearch### + egrep -v "^@" "$basef.NT.snap.matched.fl.Viruses.annotated" | awk '{if($3 != "*") print "@"$1"\n"$10"\n""+"$1"\n"$11}' > $(echo "$basef.NT.snap.matched.fl.Viruses.annotated" | sed 
's/\(.*\)\..*/\1/').fastq + log "Done: convert to FASTQ and retrieve full-length sequences for SNAP NT hits " + fi + log "############# SORTING unmatched to NT BY LENGTH AND UNIQ AND LOOKUP ORIGINAL SEQUENCES #################" + if [ $run_mode = "Comprehensive" ] + then + #SNN 140507 extractHeaderFromFastq.csh "$basef.NT.snap.unmatched.fastq" FASTQ "$basef.cutadapt.fastq" "$basef.NT.snap.unmatched.fulllength.fastq" + sed "n;n;n;d" "$basef.NT.snap.unmatched.fulllength.fastq" | sed "n;n;d" | sed "s/^@/>/g" > "$basef.NT.snap.unmatched.fulllength.fasta" + fi + cat "$basef.NT.snap.unmatched.fulllength.fasta" | perl -e 'while (<>) {$h=$_; $s=<>; $seqs{$h}=$s;} foreach $header (reverse sort {length($seqs{$a}) <=> length($seqs{$b})} keys %seqs) {print $header.$seqs{$header}}' > $basef.NT.snap.unmatched.fulllength.sorted.fasta + if [ $run_mode = "Comprehensive" ] + then + log "we will be using 50 as the length of the cropped read for removing unique and low-complexity reads" + log "Parameters: crop_reads.csh $basef.NT.snap.unmatched.fulllength.sorted.fasta 25 50 > $basef.NT.snap.unmatched.fulllength.sorted.cropped.fasta" + crop_reads.csh "$basef.NT.snap.unmatched.fulllength.sorted.fasta" 25 50 > "$basef.NT.snap.unmatched.fulllength.sorted.cropped.fasta" + log "*** reads cropped ***" + log "Parameters: gt sequniq -seqit -force -o $basef.NT.snap.unmatched.fulllength.sorted.cropped.uniq.fasta $basef.NT.snap.unmatched.fulllength.sorted.cropped.fasta" + gt sequniq -seqit -force -o "$basef.NT.snap.unmatched.fulllength.sorted.cropped.uniq.fasta" "$basef.NT.snap.unmatched.fulllength.sorted.cropped.fasta" + log "Parameters: extractAlltoFast.sh $basef.NT.snap.unmatched.fulllength.sorted.cropped.uniq.fasta FASTA $basef.NT.snap.unmatched.fulllength.fasta FASTA $basef.NT.snap.unmatched.uniq.fl.fasta FASTA" + extractAlltoFast.sh "$basef.NT.snap.unmatched.fulllength.sorted.cropped.uniq.fasta" FASTA "$basef.NT.snap.unmatched.fulllength.fasta" FASTA "$basef.NT.snap.unmatched.uniq.fl.fasta" 
FASTA #SNN140507 + fi + log "Done uniquing full length sequences of unmatched to NT" + fi fi + ####################### DENOVO CONTIG ASSEMBLY ##### if [ $run_mode = "Comprehensive" ] then - echo -e "$(date)\t$scriptname\t######### Running ABYSS and Minimus #########" + log "######### Running ABYSS and Minimus #########" START_deNovo=$(date +%s) - echo -e "$(date)\t$scriptname\tAdding matched viruses to NT unmatched" + log "Adding matched viruses to NT unmatched" sed "n;n;n;d" "$basef.NT.snap.matched.fl.Viruses.fastq" | sed "n;n;d" | sed "s/^@/>/g" | sed 's/>/>Vir/g' > "$basef.NT.snap.matched.fl.Viruses.fasta" gt sequniq -seqit -force -o "$basef.NT.snap.matched.fl.Viruses.uniq.fasta" "$basef.NT.snap.matched.fl.Viruses.fasta" cat "$basef.NT.snap.unmatched.uniq.fl.fasta" "$basef.NT.snap.matched.fl.Viruses.uniq.fasta" > "$basef.NT.snap.unmatched_addVir_uniq.fasta" - echo -e "$(date)\t$scriptname\tStarting deNovo assembly" - echo -e "$(date)\t$scriptname\tParameters: abyss_minimus.sh $basef.NT.snap.unmatched_addVir_uniq.fasta $length $contigcutoff $cores $abysskmer $ignore_barcodes_for_de_novo" + log "Starting deNovo assembly" + log "Parameters: abyss_minimus.sh $basef.NT.snap.unmatched_addVir_uniq.fasta $length $contigcutoff $cores $abysskmer $ignore_barcodes_for_de_novo" abyss_minimus.sh "$basef.NT.snap.unmatched_addVir_uniq.fasta" "$length" "$contigcutoff" "$cores" "$abysskmer" "$ignore_barcodes_for_de_novo" - echo -e "$(date)\t$scriptname\tCompleted deNovo assembly: generated all.$basef.NT.snap.unmatched_addVir_uniq.fasta.unitigs.cut${length}.${contigcutoff}-mini.fa" + log "Completed deNovo assembly: generated all.$basef.NT.snap.unmatched_addVir_uniq.fasta.unitigs.cut${length}.${contigcutoff}-mini.fa" END_deNovo=$(date +%s) diff_deNovo=$(( END_deNovo - START_deNovo )) - echo -e "$(date)\t$scriptname\tdeNovo Assembly took $diff_deNovo seconds." | tee -a timing.$basef.log + log "deNovo Assembly took $diff_deNovo seconds." 
| tee -a timing.$basef.log fi #######RAPSearch##### #################### RAPSearch to Vir ########### @@ -980,59 +959,55 @@ then then if [ -f "$basef.NT.snap.unmatched.uniq.fl.fasta" ] then - echo -e "$(date)\t$scriptname\t############# RAPSearch to ${RAPSearch_VIRUS_db} ON NT-UNMATCHED SEQUENCES #################" - if [[ $dropcache == "Y" ]] - then - dropcache - fi - echo -e "$(date)\t$scriptname\tStarting: RAPSearch $basef.NT.snap.unmatched.uniq.fl.fasta " + log "############# RAPSearch to ${RAPSearch_VIRUS_db} ON NT-UNMATCHED SEQUENCES #################" + log "Starting: RAPSearch $basef.NT.snap.unmatched.uniq.fl.fasta " START14=$(date +%s) - echo -e "$(date)\t$scriptname\tParameters: rapsearch -q $basef.NT.snap.unmatched.uniq.fl.fasta -d $RAPSearch_VIRUS_db -o $basef.$rapsearch_database.RAPSearch.e1 -z $cores -e $ecutoff_Vir -v 1 -b 1 -t N >& $basef.$rapsearch_database.RAPSearch.log" + log "Parameters: rapsearch -q $basef.NT.snap.unmatched.uniq.fl.fasta -d $RAPSearch_VIRUS_db -o $basef.$rapsearch_database.RAPSearch.e1 -z $cores -e $ecutoff_Vir -v 1 -b 1 -t N >& $basef.$rapsearch_database.RAPSearch.log" rapsearch -q "$basef.NT.snap.unmatched.uniq.fl.fasta" -d $RAPSearch_VIRUS_db -o $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir} -z "$cores" -e "$ecutoff_Vir" -v 1 -b 1 -t N >& $basef.$rapsearch_database.RAPSearch.log - echo -e "$(date)\t$scriptname\tDone RAPSearch" + log "Done RAPSearch" END14=$(date +%s) diff=$(( END14 - START14 )) - echo -e "$(date)\t$scriptname\tRAPSearch to Vir Took $diff seconds" - echo -e "$(date)\t$scriptname\tStarting: add FASTA sequences to RAPSearch m8 output file " + log "RAPSearch to Vir Took $diff seconds" + log "Starting: add FASTA sequences to RAPSearch m8 output file " START15=$(date +%s) sed -i '/^#/d' $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8 seqtk subseq $basef.NT.snap.unmatched.uniq.fl.fasta $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8 > 
$basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta sed '/>/d' $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta > $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta.seq paste $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8 $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta.seq > $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.addseq.m8 - echo -e "$(date)\t$scriptname\tParameters: taxonomy_lookup.pl $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.addseq.m8 blast prot $cores $taxonomy_db_directory" + log "Parameters: taxonomy_lookup.pl $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.addseq.m8 blast prot $cores $taxonomy_db_directory" taxonomy_lookup.pl $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.addseq.m8 blast prot $cores $taxonomy_db_directory mv $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.addseq.all.annotated $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.annotated - echo -e "$(date)\t$scriptname\tParameters: table_generator.sh $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.annotated RAP N Y N N" + log "Parameters: table_generator.sh $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.annotated RAP N Y N N" table_generator.sh $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.annotated RAP N Y N N - echo -e "$(date)\t$scriptname\tDone: converting RAPSearch Vir output to fasta" + log "Done: converting RAPSearch Vir output to fasta" END15=$(date +%s) diff=$(( END15 - START15 )) - echo -e "$(date)\t$scriptname\tConverting RAPSearch Vir output to fasta sequences took $diff seconds." | tee -a timing.$basef.log + log "Converting RAPSearch Vir output to fasta sequences took $diff seconds." 
| tee -a timing.$basef.log cat "$basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta" "all.$basef.NT.snap.unmatched_addVir_uniq.fasta.unitigs.cut${length}.${contigcutoff}-mini.fa" > "$basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta" else - echo -e "$(date)\t$scriptname\tCannot run viral RAPSearch - necessary input file ($basef.$rapsearch_database.RAPSearch.e$ecutoff_Vir.m8) does not exist" - echo -e "$(date)\t$scriptname\tconcatenating RAPSearchvirus output and Contigs" + log "Cannot run viral RAPSearch - necessary input file ($basef.$rapsearch_database.RAPSearch.e$ecutoff_Vir.m8) does not exist" + log "concatenating RAPSearchvirus output and Contigs" fi - echo -e "$(date)\t$scriptname\t############# Cleanup RAPSearch Vir by RAPSearch to NR #################" + log "############# Cleanup RAPSearch Vir by RAPSearch to NR #################" if [ -f $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta ] then - echo -e "$(date)\t$scriptname\tStarting: RAPSearch to $RAPSearch_NR_db of $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta :" + log "Starting: RAPSearch to $RAPSearch_NR_db of $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta :" START16=$(date +%s) - echo -e "$(date)\t$scriptname\tParameters: rapsearch -q $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta -d $RAPSearch_NR_db -o $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR} -z $cores -e $ecutoff_NR -v 1 -b 1 -t N -a T" + log "Parameters: rapsearch -q $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta -d $RAPSearch_NR_db -o $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR} -z $cores -e $ecutoff_NR -v 1 -b 1 -t N -a T" rapsearch -q 
$basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta -d $RAPSearch_NR_db -o $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR} -z $cores -e $ecutoff_NR -v 1 -b 1 -t N -a T - echo -e "$(date)\t$scriptname\trapsearch to nr done" + log "rapsearch to nr done" sed -i '/^#/d' $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.m8 - echo -e "$(date)\t$scriptname\tremoved extra #" + log "removed extra #" END16=$(date +%s) diff=$(( END16 - START16 )) - echo -e "$(date)\t$scriptname\tRAPSearch to NR took $diff seconds." | tee -a timing.$basef.log - echo -e "$(date)\t$scriptname\tStarting: Seq retrieval and Taxonomy $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}" + log "RAPSearch to NR took $diff seconds." | tee -a timing.$basef.log + log "Starting: Seq retrieval and Taxonomy $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}" START17=$(date +%s) seqtk subseq $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta \ $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.m8 > \ $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.m8.fasta - echo -e "$(date)\t$scriptname\tretrieved sequences" + log "retrieved sequences" cat $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.m8.fasta | \ awk '{if (substr($0,1,1)==">"){if (p){print "\n";} print $0} else printf("%s",$0);p++;}END{print "\n"}' | \ sed '/^$/d' | \ @@ -1040,25 +1015,25 @@ then paste $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e$ecutoff_Vir.NR.e${ecutoff_NR}.m8 \ $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.m8.fasta.seq > \ 
$basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.addseq.m8 - echo -e "$(date)\t$scriptname\tmade addseq file" - echo -e "$(date)\t$scriptname\t############# RAPSearch Taxonomy" - echo -e "$(date)\t$scriptname\tParameters: taxonomy_lookup.pl $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.addseq.m8 blast prot $cores $taxonomy_db_directory" + log "made addseq file" + log "############# RAPSearch Taxonomy" + log "Parameters: taxonomy_lookup.pl $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.addseq.m8 blast prot $cores $taxonomy_db_directory" taxonomy_lookup.pl $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.addseq.m8 blast prot $cores $taxonomy_db_directory cp $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.addseq.all.annotated $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.annotated - echo -e "$(date)\t$scriptname\tretrieved taxonomy" + log "retrieved taxonomy" grep "Viruses" $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.annotated > $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.Viruses.annotated egrep "^contig" $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.annotated > $basef.Contigs.NR.RAPSearch.e${ecutoff_NR}.annotated - echo -e "$(date)\t$scriptname\textracted RAPSearch taxonomy" - echo -e "$(date)\t$scriptname\tStarting Readcount table" - echo -e "$(date)\t$scriptname\tParameters: table_generator.sh $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.Viruses.annotated RAP Y Y Y Y" + log "extracted RAPSearch taxonomy" + log "Starting Readcount table" + log "Parameters: table_generator.sh 
$basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.Viruses.annotated RAP Y Y Y Y" table_generator.sh $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.Viruses.annotated RAP Y Y Y Y - echo -e "$(date)\t$scriptname\tParameters: table_generator.sh $basef.Contigs.NR.RAPSearch.e${ecutoff_NR}.annotated RAP Y Y Y Y" + log "Parameters: table_generator.sh $basef.Contigs.NR.RAPSearch.e${ecutoff_NR}.annotated RAP Y Y Y Y" table_generator.sh $basef.Contigs.NR.RAPSearch.e${ecutoff_NR}.annotated RAP Y Y Y Y #allow contigs to be incorporated into coverage maps by making contig barcodes the same as non-contig barcodes (removing the @) sed 's/@//g' $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.Viruses.annotated > $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.Viruses.annotated.bar.inc - echo -e "$(date)\t$scriptname\tmaking coverage maps" + log "making coverage maps" # coverage_generator_bp.sh (divides each fasta file into $cores cores then runs BLASTn using one core each. 
- echo -e "$(date)\t$scriptname\tParameters: coverage_generator_bp.sh $basef.NT.snap.matched.fl.Viruses.annotated $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.Viruses.annotated.bar.inc $eBLASTn $cores 10 1 $basef" + log "Parameters: coverage_generator_bp.sh $basef.NT.snap.matched.fl.Viruses.annotated $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.Viruses.annotated.bar.inc $eBLASTn $cores 10 1 $basef" coverage_generator_bp.sh $basef.NT.snap.matched.fl.Viruses.annotated $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.Viruses.annotated.bar.inc $eBLASTn $cores 10 1 $basef awk '{print$1}' $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.annotated > $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.annotated.header @@ -1078,38 +1053,38 @@ then cat $basef.not.in.NR.[a-z][a-z][a-z][a-z][a-z][a-z].annotated > $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.annotated.not.in.NR.annotated rm -r $basef.not.in.NR.[a-z][a-z][a-z][a-z][a-z][a-z] rm -r $basef.not.in.NR.[a-z][a-z][a-z][a-z][a-z][a-z].annotated - echo -e "$(date)\t$scriptname\tParameters: table_generator.sh $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.annotated.not.in.NR.annotated RAP N Y N N" + log "Parameters: table_generator.sh $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.annotated.not.in.NR.annotated RAP N Y N N" table_generator.sh $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.annotated.not.in.NR.annotated RAP N Y N N END17=$(date +%s) diff=$(( END17 - START17 )) - echo -e "$(date)\t$scriptname\tRAPSearch seq retrieval, taxonomy and readcount took $diff seconds" | tee -a timing.$basef.log + log "RAPSearch seq retrieval, taxonomy and readcount took $diff seconds" | tee -a timing.$basef.log else - echo -e "$(date)\t$scriptname\tCannot run RAPSearch to NR - necessary input file 
($basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta) does not exist" + log "Cannot run RAPSearch to NR - necessary input file ($basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta) does not exist" fi fi ##################RAPSearch to NR ####### if [ "$rapsearch_database" == "NR" ] then - echo -e "$(date)\t$scriptname\t#################### RAPSearch to NR ###########" + log "#################### RAPSearch to NR ###########" cat "$basef.NT.snap.unmatched.uniq.fl.fasta" "all.$basef.NT.snap.unmatched_addVir_uniq.fasta.unitigs.cut${length}.${contigcutoff}-mini.fa" > "$basef.Contigs.NT.snap.unmatched.uniq.fl.fasta" if [ -f $basef.Contigs.NT.snap.unmatched.uniq.fl.fasta ] then - echo -e "$(date)\t$scriptname\t############# RAPSearch to NR #################" - echo -e "$(date)\t$scriptname\tStarting: RAPSearch to NR $basef.Contigs.NT.snap.unmatched.uniq.fl.fasta" + log "############# RAPSearch to NR #################" + log "Starting: RAPSearch to NR $basef.Contigs.NT.snap.unmatched.uniq.fl.fasta" START16=$(date +%s) - echo -e "$(date)\t$scriptname\tParameters:rapsearch -q $basef.Contigs.NT.snap.unmatched.uniq.fl.fasta -d $RAPSearch_NR_db -o $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR} -z $cores -e $ecutoff_NR -v 1 -b 1 -t N -a $RAPSearch_NR_fast_mode" + log "Parameters:rapsearch -q $basef.Contigs.NT.snap.unmatched.uniq.fl.fasta -d $RAPSearch_NR_db -o $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR} -z $cores -e $ecutoff_NR -v 1 -b 1 -t N -a $RAPSearch_NR_fast_mode" rapsearch -q "$basef.Contigs.NT.snap.unmatched.uniq.fl.fasta" -d "$RAPSearch_NR_db" -o "$basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}" -z $cores -e $ecutoff_NR -v 1 -b 1 -t N -a $RAPSearch_NR_fast_mode - echo -e "$(date)\t$scriptname\tRAPSearch to NR done" + log "RAPSearch to NR done" sed -i '/^#/d' 
$basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.m8 - echo -e "$(date)\t$scriptname\tremoved extra #" + log "removed extra #" END16=$(date +%s) diff=$(( END16 - START16 )) - echo -e "$(date)\t$scriptname\tRAPSearch to NR took $diff seconds" | tee -a timing.$basef.log - echo -e "$(date)\t$scriptname\tStarting: Seq retrieval and Taxonomy $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}" + log "RAPSearch to NR took $diff seconds" | tee -a timing.$basef.log + log "Starting: Seq retrieval and Taxonomy $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}" START17=$(date +%s) seqtk subseq $basef.Contigs.NT.snap.unmatched.uniq.fl.fasta $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.m8 > $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.m8.fasta - echo -e "$(date)\t$scriptname\tretrieved sequences" + log "retrieved sequences" cat $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.m8.fasta | \ awk '{if (substr($0,1,1)==">"){if (p){print "\n";} print $0} else printf("%s",$0);p++;}END{print "\n"}' | \ sed '/^$/d' | \ @@ -1119,40 +1094,36 @@ then paste $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.m8 \ $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.m8.fasta.seq > \ $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.addseq.m8 - echo -e "$(date)\t$scriptname\tmade addseq file" - echo -e "$(date)\t$scriptname\t############# RAPSearch Taxonomy" - echo -e "$(date)\t$scriptname\tParameters: taxonomy_lookup.pl $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.addseq.m8 blast prot $cores $taxonomy_db_directory" + log "made addseq file" + log "############# RAPSearch Taxonomy" + log "Parameters: taxonomy_lookup.pl $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.addseq.m8 blast prot $cores 
$taxonomy_db_directory" taxonomy_lookup.pl $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.addseq.m8 blast prot $cores $taxonomy_db_directory cp $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.addseq.all.annotated $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.annotated - echo -e "$(date)\t$scriptname\tretrieved taxonomy" + log "retrieved taxonomy" grep "Viruses" $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.annotated > $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.Viruses.annotated egrep "^contig" $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.annotated > $basef.Contigs.$rapsearch_database.RAPSearch.e${ecutoff_NR}.annotated - echo -e "$(date)\t$scriptname\textracted RAPSearch taxonomy" - echo -e "$(date)\t$scriptname\tStarting Readcount table" - echo -e "$(date)\t$scriptname\tParameters: table_generator.sh $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.Viruses.annotated RAP Y Y Y Y" + log "extracted RAPSearch taxonomy" + log "Starting Readcount table" + log "Parameters: table_generator.sh $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.Viruses.annotated RAP Y Y Y Y" table_generator.sh $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.Viruses.annotated RAP Y Y Y Y grep -v Viruses $basef.Contigs.$rapsearch_database.RAPSearch.e${ecutoff_NR}.annotated > $basef.Contigs.$rapsearch_database.RAPSearch.e${ecutoff_NR}.noVir.annotated - echo -e "$(date)\t$scriptname\tParameters: table_generator.sh $basef.Contigs.$rapsearch_database.RAPSearch.e${ecutoff_NR}.noVir.annotated RAP N Y N N" + log "Parameters: table_generator.sh $basef.Contigs.$rapsearch_database.RAPSearch.e${ecutoff_NR}.noVir.annotated RAP N Y N N" table_generator.sh $basef.Contigs.$rapsearch_database.RAPSearch.e${ecutoff_NR}.noVir.annotated RAP N Y N N sed 
's/@//g' $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.Viruses.annotated > $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.Viruses.annotated.bar.inc - echo -e "$(date)\t$scriptname\tmaking coverage maps" - echo -e "$(date)\t$scriptname\tParameters: coverage_generator_bp.sh $basef.NT.snap.matched.fl.Viruses.annotated $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.Viruses.annotated.bar.inc $eBLASTn 10 10 1 $basef" + log "making coverage maps" + log "Parameters: coverage_generator_bp.sh $basef.NT.snap.matched.fl.Viruses.annotated $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.Viruses.annotated.bar.inc $eBLASTn 10 10 1 $basef" coverage_generator_bp.sh $basef.NT.snap.matched.fl.Viruses.annotated $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.Viruses.annotated.bar.inc $eBLASTn 10 10 1 $basef END17=$(date +%s) diff=$(( END17 - START17 )) - echo -e "$(date)\t$scriptname\tRAPSearch seq retrieval, taxonomy and table readcount and coverage took $diff seconds." | tee -a timing.$basef.log + log "RAPSearch seq retrieval, taxonomy and table readcount and coverage took $diff seconds." 
| tee -a timing.$basef.log else - echo -e "$(date)\t$scriptname\tCannot run RAPSearch to NR - necessary input file ($basef.Contigs.NT.snap.unmatched.uniq.fl.fasta) does not exist" + log "Cannot run RAPSearch to NR - necessary input file ($basef.Contigs.NT.snap.unmatched.uniq.fl.fasta) does not exist" fi fi - if [[ $dropcache == "Y" ]] - then - dropcache - fi fi ############################# OUTPUT FINAL COUNTS ############################# -echo -e "$(date)\t$scriptname\tStarting: generating readcounts.$basef.log report" +log "Starting: generating readcounts.$basef.log report" START_READCOUNT=$(date +%s) headerid_top=$(head -1 $basef.fastq | cut -c1-4) @@ -1163,8 +1134,8 @@ if [ "$headerid_top" == "$headerid_bottom" ] # We should adjust this code to check that all headers are unique, rather than just the first and last then headerid=$(head -1 $basef.fastq | cut -c1-4 | sed 's/@//g') - echo -e "$(date)\t$scriptname\theaderid_top $headerid_top = headerid_bottom $headerid_bottom and headerid = $headerid" - echo -e "$(date)\t$scriptname\tParameters: readcount.sh $basef $headerid Y $basef.fastq $basef.preprocessed.fastq $basef.preprocessed.s20.h250n25d12xfu.human.snap.unmatched.fastq $basef.NT.snap.matched.fulllength.all.annotated.sorted $basef.NT.snap.matched.fl.Viruses.annotated $basef.NT.snap.matched.fl.Bacteria.annotated $basef.NT.snap.matched.fl.nonChordatEuk.annotated $basef.NT.snap.unmatched.sam $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.Viruses.annotated" + log "headerid_top $headerid_top = headerid_bottom $headerid_bottom and headerid = $headerid" + log "Parameters: readcount.sh $basef $headerid Y $basef.fastq $basef.preprocessed.fastq $basef.preprocessed.s20.h250n25d12xfu.human.snap.unmatched.fastq $basef.NT.snap.matched.fulllength.all.annotated.sorted $basef.NT.snap.matched.fl.Viruses.annotated $basef.NT.snap.matched.fl.Bacteria.annotated $basef.NT.snap.matched.fl.nonChordatEuk.annotated 
$basef.NT.snap.unmatched.sam $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.Viruses.annotated" readcount.sh $basef $headerid Y $basef.fastq \ $basef.preprocessed.fastq \ $basef.preprocessed.s20.h250n25d12xfu.human.snap.unmatched.fastq \ @@ -1174,18 +1145,18 @@ then $basef.NT.snap.matched.fl.nonChordatEuk.annotated \ $basef.NT.snap.unmatched.sam \ $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.Viruses.annotated - echo -e "$(date)\t$scriptname\tDone: generating readcounts.$basef.log report" + log "Done: generating readcounts.$basef.log report" END_READCOUNT=$(date +%s) diff_READCOUNT=$(( END_READCOUNT - START_READCOUNT )) - echo -e "$(date)\t$scriptname\tGenerating read count report Took $diff_READCOUNT seconds" | tee -a timing.$basef.log + log "Generating read count report Took $diff_READCOUNT seconds" | tee -a timing.$basef.log else - echo -e "$(date)\t$scriptname\treadcount.sh aborted due to non-unique header id" + log "readcount.sh aborted due to non-unique header id" fi -echo -e "$(date)\t$scriptname\t#################### SURPI PIPELINE COMPLETE ##################" +log "#################### SURPI PIPELINE COMPLETE ##################" END_PIPELINE=$(date +%s) diff_PIPELINE=$(( END_PIPELINE - START_PIPELINE )) -echo -e "$(date)\t$scriptname\tTotal run time of pipeline took $diff_PIPELINE seconds" | tee -a timing.$basef.log +log "Total run time of pipeline took $diff_PIPELINE seconds" | tee -a timing.$basef.log echo "Script and Parameters = $0 $@ " > $basef.pipeline_parameters.log echo "Raw Read quality = $quality" >> $basef.pipeline_parameters.log