From 57a0e5bad30f282c1ec909f66819ac957f5d005e Mon Sep 17 00:00:00 2001 From: Simon Ye Date: Thu, 16 Apr 2015 14:00:28 -0400 Subject: [PATCH 01/18] Fix perms on txt files. --- License.txt | 0 Readme.txt | 0 2 files changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 License.txt mode change 100755 => 100644 Readme.txt diff --git a/License.txt b/License.txt old mode 100755 new mode 100644 diff --git a/Readme.txt b/Readme.txt old mode 100755 new mode 100644 From b831cacc7ca9d9c8d65c3daf0392098db44fdc64 Mon Sep 17 00:00:00 2001 From: Simon Ye Date: Thu, 16 Apr 2015 14:01:32 -0400 Subject: [PATCH 02/18] Add -d option in SURPI.sh for changing full database path. --- SURPI.sh | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/SURPI.sh b/SURPI.sh index 293e7f5..3c4264a 100755 --- a/SURPI.sh +++ b/SURPI.sh @@ -13,7 +13,7 @@ # SURPI_version="1.0.22" -optspec=":f:hvz:" +optspec=":f:d:hvz:" bold=$(tput bold) normal=$(tput sgr0) green='\e[0;32m' @@ -26,6 +26,7 @@ scriptname=${0##*/} while getopts "$optspec" option; do case "${option}" in f) config_file=${OPTARG};; # get parameters from config file if specified + d) ref_path=${OPTARG};; # Path to reference db directory. h) HELP=1;; v) VERIFICATION=1;; z) create_config_file=${OPTARG} @@ -246,32 +247,32 @@ eBLASTn="1e-15" # SURPI will subtract all SNAP databases found in this directory from the input sequence # Useful if you want to subtract multiple genomes (without combining SNAP databases) # or, if you need to split a db if it is larger than available RAM. 
-SNAP_subtraction_folder="/reference/hg19" +SNAP_subtraction_db="$ref_path/hg19" # directory for SNAP-indexed databases of NCBI NT (for mapping phase in comprehensive mode) # directory must ONLY contain snap indexed databases -SNAP_COMPREHENSIVE_db_dir="/reference/COMP_SNAP" +SNAP_COMPREHENSIVE_db_dir="$ref_path/COMP_SNAP" # directory for SNAP-indexed databases for mapping phase in FAST mode # directory must ONLY contain snap indexed databases -SNAP_FAST_db_dir="/reference/FAST_SNAP" +SNAP_FAST_db_dir="$ref_path/FAST_SNAP" #Taxonomy Reference data directory #This folder should contain the 3 SQLite files created by the script "create_taxonomy_db.sh" #gi_taxid_nucl.db - nucleotide db of gi/taxonid #gi_taxid_prot.db - protein db of gi/taxonid #names_nodes_scientific.db - db of taxonid/taxonomy -taxonomy_db_directory="/reference/taxonomy" +taxonomy_db_directory="$ref_path/taxonomy" #RAPSearch viral database name: indexed protein dataset (all of Viruses) -#make sure that directory also includes the .info file -RAPSearch_VIRUS_db="/reference/RAPSearch/rapsearch_viral_db" +#make sure that directory also includes the .info file +RAPSearch_VIRUS_db="$ref_path/RAPSearch/rapsearch_viral_db" #RAPSearch nr database name: indexed protein dataset (all of NR) -#make sure that directory also includes the .info file -RAPSearch_NR_db="/reference/RAPSearch/rapsearch_nr_db" +#make sure that directory also includes the .info file +RAPSearch_NR_db="$ref_path/RAPSearch/rapsearch_nr_db" -ribo_snap_bac_euk_directory="/reference/RiboClean_SNAP" +ribo_snap_bac_euk_directory="$ref_path/RiboClean_SNAP" ########################## # Server related values @@ -296,7 +297,7 @@ cache_reset="0" ########################## # AWS_master_slave will start up a slave instance on AWS for each division of the nt database -# It will be more costly, but should run significantly faster than the solo method, which +# It will be more costly, but should run significantly faster than the solo method, which # runs 
each NT division through SNAP serially on a single machine. # If using the "AWS_master_slave" option, be sure that all parameters in the AWS section below are # set properly. @@ -306,7 +307,7 @@ cache_reset="0" #Which method to use for SNAP to nt [AWS_master_slave/solo] # AWS_master_slave will start up a slave instance on AWS for each division of the nt database -# It will be more costly, but should run significantly faster than the solo method, which +# It will be more costly, but should run significantly faster than the solo method, which # runs each NT division through SNAP serially on a single machine. # If using the "AWS_master_slave" option, be sure that all parameters in the AWS section below are # set properly. @@ -318,7 +319,7 @@ snap_nt_procedure="solo" ami_id="ami-5ef61936" #Number of slave instances will not exceed this value. Useful for testing, in order to restrict instance count. -#Otherwise, number of instances should be equal to number of SNAP-NT database divisions. This value is +#Otherwise, number of instances should be equal to number of SNAP-NT database divisions. This value is #automatically calculated by SURPI. 
max_slave_instances=29 @@ -959,7 +960,7 @@ if [ $run_mode = "Comprehensive" ] then echo -e "$(date)\t$scriptname\t######### Running ABYSS and Minimus #########" START_deNovo=$(date +%s) - echo -e "$(date)\t$scriptname\tAdding matched viruses to NT unmatched" + echo -e "$(date)\t$scriptname\tAdding matched viruses to NT unmatched" sed "n;n;n;d" "$basef.NT.snap.matched.fl.Viruses.fastq" | sed "n;n;d" | sed "s/^@/>/g" | sed 's/>/>Vir/g' > "$basef.NT.snap.matched.fl.Viruses.fasta" gt sequniq -seqit -force -o "$basef.NT.snap.matched.fl.Viruses.uniq.fasta" "$basef.NT.snap.matched.fl.Viruses.fasta" cat "$basef.NT.snap.unmatched.uniq.fl.fasta" "$basef.NT.snap.matched.fl.Viruses.uniq.fasta" > "$basef.NT.snap.unmatched_addVir_uniq.fasta" @@ -1062,7 +1063,7 @@ then awk '{print$1}' $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.annotated > $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.annotated.header awk '{print$1}' $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.annotated > $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.annotated.header - # find headers in viral rapsearch that are no longer found in rapsearch to nr + # find headers in viral rapsearch that are no longer found in rapsearch to nr sort $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.annotated.header \ $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.annotated.header | \ uniq -d | \ @@ -1125,7 +1126,7 @@ then cp $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.addseq.all.annotated $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.annotated echo -e "$(date)\t$scriptname\tretrieved taxonomy" grep "Viruses" $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.annotated > $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.Viruses.annotated - egrep "^contig" 
$basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.annotated > $basef.Contigs.$rapsearch_database.RAPSearch.e${ecutoff_NR}.annotated + egrep "^contig" $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.annotated > $basef.Contigs.$rapsearch_database.RAPSearch.e${ecutoff_NR}.annotated echo -e "$(date)\t$scriptname\textracted RAPSearch taxonomy" echo -e "$(date)\t$scriptname\tStarting Readcount table" echo -e "$(date)\t$scriptname\tParameters: table_generator.sh $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.Viruses.annotated RAP Y Y Y Y" From b56f0b8e9573ea0961cc5fc7e0eec7719b6e0c2a Mon Sep 17 00:00:00 2001 From: Simon Ye Date: Thu, 16 Apr 2015 14:05:14 -0400 Subject: [PATCH 03/18] Remove date naming on files in create_SURPI_data. Fix bugs with insufficient locationSize and depreacted 0factor in SNAP. --- create_SURPI_data.sh | 44 ++++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/create_SURPI_data.sh b/create_SURPI_data.sh index 8e13c42..9a4649d 100755 --- a/create_SURPI_data.sh +++ b/create_SURPI_data.sh @@ -23,8 +23,9 @@ red='\e[0;31m' endColor='\e[0m' DATE=$(date +%m%d%Y) -db_dir="NCBI_$DATE" -curated_dir="curated_$DATE" +echo $DATE > DATE.txt +db_dir="NCBI" +curated_dir="curated" cleanup_dir="rawdata" @@ -53,9 +54,6 @@ if [ ! -d "$reference_dir" ]; then mkdir "$reference_dir" fi -#set SNAP index Ofactor. See SNAP documentation for details. -Ofactor=1000 - #download NCBI data to $db_dir, curated data to $curated_dir echo -e "$(date)\t$scriptname\tdownload_SURPI_data.sh -d $db_dir -c $curated_dir" download_SURPI_data.sh -d "$db_dir" -c "$curated_dir" @@ -68,12 +66,10 @@ if [ ! 
-d "$reference_dir/$taxonomy_dir" ]; then fi create_taxonomy_db.sh -d "$db_dir" -mv gi_taxid_nucl.db "$reference_dir/$taxonomy_dir/gi_taxid_nucl_$DATE.db" -mv gi_taxid_prot.db "$reference_dir/$taxonomy_dir/gi_taxid_prot_$DATE.db" -mv names_nodes_scientific.db "$reference_dir/$taxonomy_dir/names_nodes_scientific_$DATE.db" -ln -s "gi_taxid_nucl_$DATE.db" "$reference_dir/$taxonomy_dir/gi_taxid_nucl.db" -ln -s "gi_taxid_prot_$DATE.db" "$reference_dir/$taxonomy_dir/gi_taxid_prot.db" -ln -s "names_nodes_scientific_$DATE.db" "$reference_dir/$taxonomy_dir/names_nodes_scientific.db" +mv gi_taxid_nucl.db "$reference_dir/$taxonomy_dir/gi_taxid_nucl.db" +mv gi_taxid_prot.db "$reference_dir/$taxonomy_dir/gi_taxid_prot.db" +mv names_nodes_scientific.db "$reference_dir/$taxonomy_dir/names_nodes_scientific.db" + # ##create RAPSearch nr db and move into $RAPSearch_dir # @@ -85,10 +81,10 @@ echo -e "$(date)\t$scriptname\tDecompressing nr..." pigz -dc -k "$db_dir/nr.gz" > nr echo -e "$(date)\t$scriptname\tStarting prerapsearch on nr..." -prerapsearch -d nr -n "rapsearch_nr_${DATE}_db_v2.12" +prerapsearch -d nr -n "rapsearch_nr_db_v2.12" echo -e "$(date)\t$scriptname\tCompleted prerapsearch on nr." -mv rapsearch_nr_${DATE}_db_v2.12 "$reference_dir/$RAPSearch_dir" -mv rapsearch_nr_${DATE}_db_v2.12.info "$reference_dir/$RAPSearch_dir" +mv rapsearch_nr_db_v2.12 "$reference_dir/$RAPSearch_dir" +mv rapsearch_nr_db_v2.12.info "$reference_dir/$RAPSearch_dir" # ## Index curated data @@ -111,13 +107,13 @@ pigz -dc -k "$curated_dir/rdp_typed_iso_goodq_9210seqs.fa.gz" > rdp_typed_iso_go echo -e "$(date)\t$scriptname\tIndexing curated data..." echo -e "$(date)\t$scriptname\tSNAP indexing hg19_rRNA_mito_Hsapiens_rna.fa..." 
-snap index hg19_rRNA_mito_Hsapiens_rna.fa snap_index_hg19_rRNA_mito_Hsapiens_rna -hg19 -O$Ofactor +snap index hg19_rRNA_mito_Hsapiens_rna.fa snap_index_hg19_rRNA_mito_Hsapiens_rna -hg19 -locationSize 5 echo -e "$(date)\t$scriptname\tSNAP indexing Bacterial_Refseq_05172012.CLEAN.LenFiltered.uniq.fa..." -snap index Bacterial_Refseq_05172012.CLEAN.LenFiltered.uniq.fa snap_index_Bacterial_Refseq_05172012.CLEAN.LenFiltered.uniq_s16 -s 16 -O$Ofactor +snap index Bacterial_Refseq_05172012.CLEAN.LenFiltered.uniq.fa snap_index_Bacterial_Refseq_05172012.CLEAN.LenFiltered.uniq_s16 -s 16 -locationSize 5 echo -e "$(date)\t$scriptname\tStarting prerapsearch on rapsearch_viral_aa_130628_db_v2.12.fasta..." prerapsearch -d rapsearch_viral_aa_130628_db_v2.12.fasta -n "rapsearch_viral_aa_130628_db_v2.12" echo -e "$(date)\t$scriptname\tSNAP indexing viruses-5-2012_trimmedgi-MOD_addedgi.fa..." -snap index viruses-5-2012_trimmedgi-MOD_addedgi.fa snap_index_viruses-5-2012_trimmedgi-MOD_addedgi -O$Ofactor +snap index viruses-5-2012_trimmedgi-MOD_addedgi.fa snap_index_viruses-5-2012_trimmedgi-MOD_addedgi -locationSize 5 #RiboClean additions if [ ! -d "$reference_dir/$RiboClean_dir" ]; then @@ -125,13 +121,13 @@ if [ ! -d "$reference_dir/$RiboClean_dir" ]; then fi echo -e "$(date)\t$scriptname\tSNAP indexing 18s_rRNA_gene_not_partial.fa..." -snap index 18s_rRNA_gene_not_partial.fa snap_index_18s_rRNA_gene_not_partial.fa -O$Ofactor +snap index 18s_rRNA_gene_not_partial.fa snap_index_18s_rRNA_gene_not_partial.fa -locationSize 5 echo -e "$(date)\t$scriptname\tSNAP indexing viruses-5-2012_trimmedgi-MOD_addedgi.fa..." -snap index 23s.fa snap_index_23sRNA -O$Ofactor +snap index 23s.fa snap_index_23sRNA -locationSize 5 echo -e "$(date)\t$scriptname\tSNAP indexing rdp_typed_iso_goodq_9210seqs.fa..." 
-snap index 28s_rRNA_gene_NOT_partial_18s_spacer_5.8s.fa snap_index_28s_rRNA_gene_NOT_partial_18s_spacer_5.8s.fa -O$Ofactor +snap index 28s_rRNA_gene_NOT_partial_18s_spacer_5.8s.fa snap_index_28s_rRNA_gene_NOT_partial_18s_spacer_5.8s.fa -locationSize 5 echo -e "$(date)\t$scriptname\tSNAP indexing viruses-5-2012_trimmedgi-MOD_addedgi.fa..." -snap index rdp_typed_iso_goodq_9210seqs.fa snap_index_rdp_typed_iso_goodq_9210seqs -O$Ofactor +snap index rdp_typed_iso_goodq_9210seqs.fa snap_index_rdp_typed_iso_goodq_9210seqs -locationSize 5 if [ ! -d "$reference_dir/$FAST_dir" ]; then mkdir "$reference_dir/$FAST_dir" @@ -160,14 +156,14 @@ else fi echo -e "$(date)\t$scriptname\tStarting indexing of SNAP-nt..." echo -e "$(date)\t$scriptname\tcreate_snap_to_nt.sh -n $SNAP_nt_chunks -f nt" -create_snap_to_nt.sh -n "$SNAP_nt_chunks" -f "nt" -p "$DATE" +create_snap_to_nt.sh -n "$SNAP_nt_chunks" -f "nt" -p "prefix" echo -e "$(date)\t$scriptname\tCompleted indexing of SNAP-nt." if [ ! -d "$reference_dir/$SNAP_nt_dir" ]; then mkdir "$reference_dir/$SNAP_nt_dir" fi -mv "snap_index_${DATE}.nt."* "$reference_dir/$SNAP_nt_dir" +mv "snap_index_prefix.nt."* "$reference_dir/$SNAP_nt_dir" # ## Cleanup @@ -181,6 +177,6 @@ mv *.fa "$cleanup_dir" mv *.fasta "$cleanup_dir" mv nr "$cleanup_dir" mv nt "$cleanup_dir" -mv "$DATE"* "$cleanup_dir" +mv "prefix"* "$cleanup_dir" echo -e "$(date)\t$scriptname\t${green}Completed creation of SURPI reference data.${endColor}" From e9268e3fd5377afba10ab9646516cf05b4ce4375 Mon Sep 17 00:00:00 2001 From: Simon Ye Date: Thu, 16 Apr 2015 14:07:27 -0400 Subject: [PATCH 04/18] Actually make -p option in create_snap_to_nt work. 
--- create_snap_to_nt.sh | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/create_snap_to_nt.sh b/create_snap_to_nt.sh index edb297e..3ea42a1 100755 --- a/create_snap_to_nt.sh +++ b/create_snap_to_nt.sh @@ -15,7 +15,7 @@ # Copyright (C) 2014 Scot Federman - All Rights Reserved # SURPI has been released under a modified BSD license. # Please see license file for details. -# Last revised 7/2/2014 +# Last revised 7/2/2014 scriptname=${0##*/} bold=$(tput bold) @@ -25,10 +25,7 @@ red='\e[0;31m' endColor='\e[0m' prefix=$(date +%m%d%Y) -#set SNAP index Ofactor. See SNAP documentation for details -Ofactor=1000 - -while getopts ":n:hs:f:" option; do +while getopts ":n:hp:s:f:" option; do case "${option}" in f) fasta_file_to_index=${OPTARG};; n) num_chunks=${OPTARG};; @@ -44,20 +41,20 @@ done if [[ ${HELP} -eq 1 || $# -lt 1 ]] then cat < Date: Thu, 16 Apr 2015 14:08:47 -0400 Subject: [PATCH 05/18] Remove --parallel from sort in taxonomy lookup. --- taxonomy_lookup.pl | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/taxonomy_lookup.pl b/taxonomy_lookup.pl index 79d0eac..d8e5ba2 100755 --- a/taxonomy_lookup.pl +++ b/taxonomy_lookup.pl @@ -80,12 +80,11 @@ # sort/uniq gi file my $startsort = [gettimeofday()]; -system ("sort --parallel=$cores -u $basef_inputfile.gi > $basef_inputfile.gi.uniq"); - +system ("sort -u $basef_inputfile.gi > $basef_inputfile.gi.uniq"); my $sorttime = tv_interval($startsort); print localtime()."\t$0\tTime to sort -u gi file: $sorttime seconds\n"; -# Parallelization can occur at this point in the code. Since the file in now sorted, it can be split into n chunks +# Parallelization can occur at this point in the code. Since the file in now sorted, it can be split into n chunks # with no overlap. 
my %taxonomy; @@ -110,19 +109,19 @@ chomp; my ($family, $genus, $species, $lineage) = ("") x 4; my $result = taxonomy_fgsl($_, $seq_type); - + if ($result =~ /family--(.*?)\t/) { $family = $1; } - + if ($result =~ /genus--(.*?)\t/) { $genus = $1; } - + if ($result =~ /species--(.*?)\t/) { $species = $1; } - + if ($result =~ /lineage--(.*)$/) { $lineage = $1; } @@ -228,7 +227,7 @@ sub taxonomy_fgsl { $sth->execute(($taxid)); if (@noderow = $sth->fetchrow_array) { (my $tid, my $parent, my $rank) = @noderow; - + if ($namerow = $name_return->fetchrow_array) { $name = trim($namerow); } From 27e11f3f7a10846d36aabdb9a5a02194fd7bba46 Mon Sep 17 00:00:00 2001 From: Simon Ye Date: Thu, 16 Apr 2015 19:42:42 -0400 Subject: [PATCH 06/18] Remove trailing whitespace. --- abyss_minimus.sh | 12 ++++++------ compare_multiple_sam.py | 4 ++-- compare_sam.py | 12 ++++++------ coveragePlot.py | 10 +++++----- coverage_generator_bp.sh | 2 +- create_SURPI_data.sh | 4 ++-- create_tab_delimited_table.pl | 8 ++++---- create_taxonomy_db.py | 10 +++++----- create_taxonomy_db.sh | 10 +++++----- crop_reads.csh | 8 ++++---- cutadapt_quality.csh | 6 +++--- download_SURPI_data.sh | 8 ++++---- extractAlltoFast.sh | 8 ++++---- extractHeaderFromFastq.csh | 2 +- extractSamFromSam.sh | 6 +++--- fastq-extractBarcodedSRA.sh | 4 ++-- get_genbankfasta.pl | 6 +++--- mapPerfectBLASTtoGenome.py | 2 +- plot_reads_to_gi.sh | 10 +++++----- preprocess.sh | 6 +++--- preprocess_ncores.sh | 6 +++--- ribo_snap_bac_euk.sh | 4 ++-- slave/setup_slave.sh | 2 +- snap_nt.sh | 12 ++++++------ split_fasta.pl | 2 +- start_slaves.sh | 8 ++++---- table_generator.sh | 12 ++++++------ taxonomy_lookup_embedded.pl | 10 +++++----- tweet.pl | 4 ++-- update_sam.py | 2 +- 30 files changed, 100 insertions(+), 100 deletions(-) diff --git a/abyss_minimus.sh b/abyss_minimus.sh index ed14d85..cc924f5 100755 --- a/abyss_minimus.sh +++ b/abyss_minimus.sh @@ -77,11 +77,11 @@ for f in `cat $inputfile.barcodes` ; do END_SPLIT=$(date +%s) 
diff_SPLIT=$(( $END_SPLIT - $START_SPLIT )) echo -e "$(date)\t$scriptname\tSplitting FASTA took $diff_SPLIT s." - ### run abyss (deBruijn assembler) on each 100,000 read demultiplexed fasta file, including the unsplit demultiplexed file + ### run abyss (deBruijn assembler) on each 100,000 read demultiplexed fasta file, including the unsplit demultiplexed file echo -e "$(date)\t$scriptname\tRunning abyss on each $split_FASTA_size chunk..." START_ABYSS=$(date +%s) - for d in bar$f.${inputfile}_* ; do + for d in bar$f.${inputfile}_* ; do abyss-pe k=$kmer name=$d.f se=$d np=$cores >& $d.abyss.log done @@ -115,10 +115,10 @@ for f in `cat $inputfile.barcodes` ; do ########### echo -e "$(date)\t$scriptname\tStarting cat barcode addition and cutoff of minimo output" START_MIN_PROCESS=$(date +%s) - # Minimo output gives more than one line per sequence, so here we linearize sequences (linearization protocol from here http://seqanswers.com/forums/showthread.php?t=27567 ) + # Minimo output gives more than one line per sequence, so here we linearize sequences (linearization protocol from here http://seqanswers.com/forums/showthread.php?t=27567 ) # then we add the relevant barcode to the end of the header contig. 
Contigs that survive untouched from abyss already have a barcode at the end of them, so that extra barcode is taken away - cat all-contigs.fa | awk '{if (substr($0,1,1)==">"){if (p){print "\n";} print $0} else printf("%s",$0);p++;}END{print "\n"}' | sed '/^$/d' | sed 's/ /_/g' | sed "s/$/"$f"/g;n" | sed "s/"$f""$f"/"$f"/g" | sed 's/#/#@/g' | sed 's/^>/>contig_/g' > all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss-minim.fa - # change generic name all-contigs.fa + cat all-contigs.fa | awk '{if (substr($0,1,1)==">"){if (p){print "\n";} print $0} else printf("%s",$0);p++;}END{print "\n"}' | sed '/^$/d' | sed 's/ /_/g' | sed "s/$/"$f"/g;n" | sed "s/"$f""$f"/"$f"/g" | sed 's/#/#@/g' | sed 's/^>/>contig_/g' > all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss-minim.fa + # change generic name all-contigs.fa mv all-contigs.fa all-contigs.fa.$f # only contigs larger than $cutoff_post_Minimo are retained cat all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss-minim.fa | awk 'NR%2==1 {x = $0} NR%2==0 { if (length($0) >= '$3') printf("%s\n%s\n",x,$0)}' > all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss.${cutoff_post_Minimo}-mini.fa @@ -148,7 +148,7 @@ fi rm -f all.$inputfile.contigs.abyssmini.cut$cutoff_post_Abyss.$cutoff_post_Minimo.e1.NR.RAPSearch.aln rm -f all.$inputfile.contigs.abyssmini.e1.NR.RAPSearch.m8.noheader rm -f all.$inputfile.contigs.abyssmini.e1.NR.RAPSearch.m8.noheader.seq -rm -f all.$inputfile.contigs.abyssmini.e1.NR.RAPSearch.m8.noheader +rm -f all.$inputfile.contigs.abyssmini.e1.NR.RAPSearch.m8.noheader rm -f all.$inputfile.contigs.abyssmini.e1.NR.RAPSearch.m8.noheader.ex.fa rm -f all.$inputfile.unitigs.cut$cutoff_post_Abyss-contigs.sortlen.seq rm -f all-contigs* diff --git a/compare_multiple_sam.py b/compare_multiple_sam.py index 1af36c8..334536b 100755 --- a/compare_multiple_sam.py +++ b/compare_multiple_sam.py @@ -9,7 +9,7 @@ # # Copyright (C) 2014 Charles Y Chiu - All Rights Reserved # Permission to copy and modify is granted under the BSD license -# 
Last revised 3/21/2014 +# Last revised 3/21/2014 import sys @@ -61,7 +61,7 @@ lineList = [] for file in fileObjList: lineList.append(file.readline()) - + for file in fileObjList: file.close() outputFile.close() diff --git a/compare_sam.py b/compare_sam.py index 2bd65af..7ced0a3 100755 --- a/compare_sam.py +++ b/compare_sam.py @@ -42,17 +42,17 @@ def logHeader(): while line1 != '': data1 = line1.split() - + # check to see whether line is in the SAM header or alignment section if (data1[0][0]!="@"): #We are in the SAM alignment section edit_distance=int(data1[12].split(":")[2]) - + #using old snap (<=v0.15.4), could use length of line to determine if it was a hit or miss. #new snap (>=v1.0) instead uses an edit distance of -1 to signify a miss. - - if (edit_distance >= 0): + + if (edit_distance >= 0): #this is a hit new_hits += 1 sum_edistance_new += int(data1[12].split(":")[2]) @@ -61,10 +61,10 @@ def logHeader(): # this has been previously flagged as a hit, and is also now a hit. Compare edit distances to find best hit. edit_distance_previous=firstentry[1] edit_distance_current = data1[12].split(":")[2] - + existing_hits += 1 sum_edistance_existing += int(edit_distance_previous) - + if (int(edit_distance_current) <= int(edit_distance_previous)): replacements += 1 sum_data1 += int(edit_distance_current) diff --git a/coveragePlot.py b/coveragePlot.py index c95981e..62a7930 100755 --- a/coveragePlot.py +++ b/coveragePlot.py @@ -2,7 +2,7 @@ # # coveragePlot.py # -# This program generates genomic coverage plots +# This program generates genomic coverage plots # Chiu Laboratory # University of California, San Francisco # January, 2014 @@ -47,7 +47,7 @@ def smart_truncate1(text, max_length=100, suffix='...'): # print "Installed version is: %s." 
% mpl_version -#load function is deprecated as of matplotlib v1.3.1, replaced with +#load function is deprecated as of matplotlib v1.3.1, replaced with if (LooseVersion(mpl_version) >= LooseVersion('1.3.1') ): data = np.loadtxt(dataFile) else: @@ -78,7 +78,7 @@ def smart_truncate1(text, max_length=100, suffix='...'): plot(data[:,0],data[:,1],color) xlabel("base position",fontsize=8) ylabel("fold coverage",fontsize=8) - title_text = sys.argv[2] + title_text = sys.argv[2] suptitle(title_text,fontsize=9) xMin, xMax, yMin, yMax = min(data[:,0]),max(data[:,0]),min(data[:,1]),max(data[:,1]) # add a 10% buffer to yMax @@ -105,7 +105,7 @@ def smart_truncate1(text, max_length=100, suffix='...'): plot(data[:,0],data[:,1],color) xlabel("base position",fontsize=8) ylabel("fold coverage",fontsize=8) - title_text = sys.argv[2] + title_text = sys.argv[2] suptitle(title_text,fontsize=9) xMin, xMax, yMin, yMax = min(data[:,0]),max(data[:,0]),min(data[:,1]),max(data[:,1]) yMax *= 1.1 @@ -122,7 +122,7 @@ def smart_truncate1(text, max_length=100, suffix='...'): plot(data[:,0],data[:,1],color) xlabel("base position",fontsize=8) ylabel("fold coverage",fontsize=8) - title_text = sys.argv[2] + title_text = sys.argv[2] suptitle(title_text,fontsize=9) xMin, xMax, yMin, yMax = min(data[:,0]),max(data[:,0]),min(data[:,1]),max(data[:,1]) yMax *= 1.1 diff --git a/coverage_generator_bp.sh b/coverage_generator_bp.sh index 9510a8d..6f13feb 100755 --- a/coverage_generator_bp.sh +++ b/coverage_generator_bp.sh @@ -93,7 +93,7 @@ do # create list of curated GIs $SNAP_file.$genus.gi.list.curatedgenome # First retain only gis that are complete genomes if no GIs with "complete genomes" are returned, we'll go to GIs "complete sequences", if no complete sequences, then we just go with the GIs we have - get_genbankfasta.pl -i bar.$bar.$SNAP_file.$genus.gi.list > bar.$bar.$SNAP_file.$genus.gi.headers + get_genbankfasta.pl -i bar.$bar.$SNAP_file.$genus.gi.list > bar.$bar.$SNAP_file.$genus.gi.headers egrep 
">gi.*[Cc]omplete [Gg]enome" bar.$bar.$SNAP_file.$genus.gi.headers | awk -F "|" '{print$2}' | head -n $top_gis > bar.$bar.$SNAP_file.$genus.gi if [ -s bar.$bar.$SNAP_file.$genus.gi ] diff --git a/create_SURPI_data.sh b/create_SURPI_data.sh index 9a4649d..e79a01b 100755 --- a/create_SURPI_data.sh +++ b/create_SURPI_data.sh @@ -15,7 +15,7 @@ # Copyright (C) 2014 Scot Federman - All Rights Reserved # SURPI has been released under a modified BSD license. # Please see license file for details. -# Last revised 7/7/2014 +# Last revised 7/7/2014 scriptname=${0##*/} green='\e[0;32m' @@ -42,7 +42,7 @@ SNAP_nt_dir="COMP_SNAP" RiboClean_dir="RiboClean_SNAP" # SNAP_nt_chunks= # of chunks to split nt into, SNAP index will be created for each chunk. -# Currently, the minimum number of chunks is around 17-20 +# Currently, the minimum number of chunks is around 17-20 # SNAP 0.15.4 will not successfully index nt with less than 17 chunks, though this will vary a bit with each NT release. # I picked 20 here as a safe default. # This will likely have to be increased when using SNAP 1.0, which has different indexing characteristics, diff --git a/create_tab_delimited_table.pl b/create_tab_delimited_table.pl index 77a1fc5..f0a0710 100755 --- a/create_tab_delimited_table.pl +++ b/create_tab_delimited_table.pl @@ -10,7 +10,7 @@ # Copyright (C) 2014 Scot Federman - All Rights Reserved # SURPI has been released under a modified BSD license. # Please see license file for details. -# Last revised 1/26/2014 +# Last revised 1/26/2014 use Getopt::Std; use strict; @@ -21,14 +21,14 @@ if ($opt_h) { print < gi_taxid_nucl.dmp pigz -dc -k "$db_directory/gi_taxid_prot.dmp.gz" > gi_taxid_prot.dmp # the below grep "fixes" the issue whereby aliases, mispellings, and other alternate names are returned. 
-# We could simply look for a name that is a "scientific name", +# We could simply look for a name that is a "scientific name", # but this shrinks the db a bit, speeding up lookups, and removes data we do not need at this time. echo -e "$(date)\t$scriptname\tRetaining scientific names..." grep "scientific name" names.dmp > names_scientificname.dmp diff --git a/crop_reads.csh b/crop_reads.csh index 4b5a56d..29fc65b 100755 --- a/crop_reads.csh +++ b/crop_reads.csh @@ -9,7 +9,7 @@ # # # will go from position 10 to 85 (or whatever size is available) -# +# # # $1 = file to crop # $2 = position to start crop ($start_nt) @@ -18,11 +18,11 @@ # Copyright (C) 2014 Charles Y Chiu - All Rights Reserved # SURPI has been released under a modified BSD license. # Please see license file for details. -# Last revised 1/26/2014 - +# Last revised 1/26/2014 + if ($#argv != 3) then echo "Usage: crop_reads.csh " exit(1) endif -cat $argv[1] | awk '(NR%2==1){print $0} (NR%2==0){print substr($0,'"$argv[2]"','"$argv[3]"')}' +cat $argv[1] | awk '(NR%2==1){print $0} (NR%2==0){print substr($0,'"$argv[2]"','"$argv[3]"')}' diff --git a/cutadapt_quality.csh b/cutadapt_quality.csh index 9198d9a..f29c2de 100755 --- a/cutadapt_quality.csh +++ b/cutadapt_quality.csh @@ -3,7 +3,7 @@ # cutadapt_quality.csh # # runs cutadapt to remove primer sequences from Illumina files -# also accepts a quality argument for Illumina / Sanger quality +# also accepts a quality argument for Illumina / Sanger quality # user specifies length cutoff # user specifies whether short reads less than length cutoff are kept; if so, they are converted to size=1 # @@ -19,7 +19,7 @@ # Copyright (C) 2014 Charles Chiu - All Rights Reserved # SURPI has been released under a modified BSD license. # Please see license file for details. 
- + if ($#argv != 7) then echo "Usage: cutadapt_quality.csh " exit(1) @@ -113,6 +113,6 @@ endif #set numreads_end = `egrep -c "@HWI|@M00|@SRR" $inputfile:r.cutadapt.fastq` #@ reads_removed = $numreads_start - $numreads_end -#echo $reads_removed" reads removed by cutadapt" +#echo $reads_removed" reads removed by cutadapt" #echo $numreads_end" reads at end of cutadapt" rm -f $TEMPDIR{$$} diff --git a/download_SURPI_data.sh b/download_SURPI_data.sh index c2bf6a3..f20f32e 100755 --- a/download_SURPI_data.sh +++ b/download_SURPI_data.sh @@ -12,7 +12,7 @@ # Copyright (C) 2014 Scot Federman - All Rights Reserved # SURPI has been released under a modified BSD license. # Please see license file for details. -# Last revised 7/2/2014 +# Last revised 7/2/2014 scriptname=${0##*/} bold=$(tput bold) @@ -38,10 +38,10 @@ done if [[ ${HELP-} -eq 1 ]] then cat < " + echo "Usage: $scriptname " exit 65 fi @@ -36,7 +36,7 @@ if [ $inputfile_type = BLASTN ] then awk '{print$1}' $inputfile > $inputfile.header echo -e "$(date)\t$scriptname\tuniqued blastn file, replaced beginning with @" - + if [ $parentfile_type = FASTA ] then seqtk subseq $parentfile $inputfile.header > $outputfile @@ -59,12 +59,12 @@ then echo -e "$(date)\t$scriptname\tDone preparing input Fasta file " if [ $parentfile_type = FASTA ] - then + then seqtk subseq $parentfile $inputfile.header > $outputfile elif [ $parentfile_type = FASTQ ] then if [ $output_format = FASTQ ] - then + then cat $parentfile | fqextract $inputfile.header > $outputfile elif [ $output_format = FASTA ] then diff --git a/extractHeaderFromFastq.csh b/extractHeaderFromFastq.csh index d441aa2..5945279 100755 --- a/extractHeaderFromFastq.csh +++ b/extractHeaderFromFastq.csh @@ -14,7 +14,7 @@ # Copyright (C) 2014 Charles Y Chiu - All Rights Reserved # SURPI has been released under a modified BSD license. # Please see license file for details. -# Last revised 1/26/2014 +# Last revised 1/26/2014 if ($#argv != 4) then echo "Usage: extractHeaderFromFastq.csh
" diff --git a/extractSamFromSam.sh b/extractSamFromSam.sh index 31cfe1a..79b91e6 100755 --- a/extractSamFromSam.sh +++ b/extractSamFromSam.sh @@ -1,7 +1,7 @@ #!/bin/bash -# +# # extactSamFromSam.sh -# +# # extract SAM reads corresponding to a SAM header file from another SAM reference file and writes to # SAM output file # Chiu Laboratory @@ -40,7 +40,7 @@ else let "numlines = `wc -l basef | awk '{print $1}'`" let "LinesPerCore = numlines / $cores" echo -e "$(date)\t$scriptname\twill use $cores cores with $LinesPerCore entries per core" - + split -l $LinesPerCore $basef echo -e "$(date)\t$scriptname\textracting reads from $baseg using headers from $basef" diff --git a/fastq-extractBarcodedSRA.sh b/fastq-extractBarcodedSRA.sh index f990a01..1b2fd58 100755 --- a/fastq-extractBarcodedSRA.sh +++ b/fastq-extractBarcodedSRA.sh @@ -18,7 +18,7 @@ if [ $# -lt 1 ]; then exit fi -# +# # gets absolute path to SRA file # SRA_PATH=$(readlink -e $1) @@ -34,7 +34,7 @@ files=($header*.fastq) # restore barcode and R1/R2 designation to FASTQ headers for f in "${files[@]}" -do +do barcode=$(echo "$f" | sed 's/_[12]//g' | sed 's/.fastq//g' | sed 's/.*_//g') readnum=$(echo "$f" | sed 's/.*_\([12]\).fastq/\1/g' | sed '/'$header'/d') echo -e "$(date)\t$scriptname\tRestoring barcode # $barcode and R1/R2 designation to $f" diff --git a/get_genbankfasta.pl b/get_genbankfasta.pl index c4c2f22..991a385 100755 --- a/get_genbankfasta.pl +++ b/get_genbankfasta.pl @@ -33,7 +33,7 @@ if ($opt_h) { print < " + echo "Usage: $scriptname <.fasta> " echo "not uniquing Blastn anymore 9/13/13 given a fasta file and the GI or FASTA of the reference to assemble to -> makes an assembly. blastn e value in 1e-8 format . 
jedi has 16 cores available" exit fi @@ -59,7 +59,7 @@ done cat $1*.$2.$3.$4.blastn > $1.$2.$3.$4.Blastn -uniq_blastn_nopipes.sh $1.$2.$3.$4.Blastn +uniq_blastn_nopipes.sh $1.$2.$3.$4.Blastn extractAlltoFast.sh $1.$2.$3.$4.Blastn.uniq BLASTN $1 FASTA $1.$2.$3.$4.Blastn.uniq.ex.fa FASTA ####figuring out length of sequence in gi####### @@ -79,7 +79,7 @@ echo -e -n "$(date)\t$scriptname\t%Coverage = " echo "scale=6;100*$coverage/$gilength" | bc let "sumofcolumntwo = `awk '{print$2}' $1.$2.$3.$4.map | awk '{sum += $1} END{print sum}'`" -echo -e -n "$(date)\t$scriptname\tAverage depth of coverage (x) = " +echo -e -n "$(date)\t$scriptname\tAverage depth of coverage (x) = " echo "scale=6;$sumofcolumntwo/$gilength" | bc let "numberBlastnReads = `egrep -c "^SCS|^HWI|gi|^M00|^kmer|^SRR" $1.$2.$3.$4.Blastn.uniq`" echo -e "$(date)\t$scriptname\tNumber of reads contributing to assembly $numberBlastnReads" @@ -94,7 +94,7 @@ echo "Reference sequence length = $gilength bp" >> $1.$2.$3.$4.report echo "Coverage in bp = $coverage" >> $1.$2.$3.$4.report echo -n "%Coverage = " >> $1.$2.$3.$4.report echo "scale=6;100*$coverage/$gilength" | bc >> $1.$2.$3.$4.report -echo -n "Average depth of coverage = " >> $1.$2.$3.$4.report +echo -n "Average depth of coverage = " >> $1.$2.$3.$4.report echo "scale=6;$sumofcolumntwo/$gilength" | bc >> $1.$2.$3.$4.report #echo "Average depth of coverage = $averagedepthcoverage x " >> $1.$2.$3.$4.report echo "Number of reads contributing to assembly = $numberBlastnReads" >> $1.$2.$3.$4.report @@ -106,7 +106,7 @@ ps2pdf14 $1.$2.$3.$4.ps $1.$2.$3.$4.pdf #### Cleanup ######## rm -f $1*$2.$3.$4.blastn rm -f $1*$2.$3.$4.error -rm -f $3.$2.fasta.nhr +rm -f $3.$2.fasta.nhr rm -f $3.$2.fasta.nin rm -f $3.$2.fasta.nsq #rm -f $1.$2.$3.$4.Blastn diff --git a/preprocess.sh b/preprocess.sh index 443ba3d..1791961 100755 --- a/preprocess.sh +++ b/preprocess.sh @@ -8,7 +8,7 @@ # January, 2014 # # cutadapt=>crop->dust removal (no uniq) *** -# +# # 12/20/12 - modified to 
switch to cutadapt for trimming # 12/31/12 - modified from Cshell to BASH version for timing # @@ -100,7 +100,7 @@ fi echo -e "$(date)\t$scriptname\t********** running crop, Read1 **********" START1=$(date +%s) -if [ $run_uniq == "Y" ] +if [ $run_uniq == "Y" ] then echo -e "$(date)\t$scriptname\tWe will be using $crop_length as the length of the cropped read" crop_reads.csh $basef.cutadapt.uniq.fastq $start_nt $crop_length > $basef.cutadapt.uniq.cropped.fastq @@ -117,7 +117,7 @@ echo -e "$(date)\t$scriptname\tDone crop: CROP took $diff seconds" echo -e "$(date)\t$scriptname\t********** running dust, Read1 **********" START1=$(date +%s) -if [ $run_uniq == "Y" ] +if [ $run_uniq == "Y" ] then prinseq-lite.pl -fastq $basef.cutadapt.uniq.cropped.fastq -out_format 3 -out_good $basef.cutadapt.uniq.cropped.dusted -out_bad $basef.cutadapt.uniq.cropped.dusted.bad -log -lc_method dust -lc_threshold 7 mv -f $basef.cutadapt.uniq.cropped.dusted.fastq $basef.preprocessed.fastq diff --git a/preprocess_ncores.sh b/preprocess_ncores.sh index 820e50a..89760a5 100755 --- a/preprocess_ncores.sh +++ b/preprocess_ncores.sh @@ -1,5 +1,5 @@ #!/bin/bash -# +# # preprocess_ncores.sh # # This script runs preprocessing across multiple cores (FASTA/FASTQ header modification, quality filtering, adapter trimming, and low-complexity filtering) @@ -108,13 +108,13 @@ do cat $basef.preprocessed.fastq >> $basef2.preprocessed.fastq rm -f $basef.preprocessed.fastq cat $basef.cutadapt.cropped.dusted.bad.fastq >> $basef2.cutadapt.cropped.dusted.bad.fastq - rm -f $basef.cutadapt.cropped.dusted.bad.fastq + rm -f $basef.cutadapt.cropped.dusted.bad.fastq rm -f $f rm -f $basef.modheader.fastq rm -f $basef.cutadapt.summary.log rm -f $basef.adapterinfo.log - rm -f $basef.cutadapt.cropped.fastq + rm -f $basef.cutadapt.cropped.fastq done echo -e "$(date)\t$scriptname\tDone concatenating output..." 
diff --git a/ribo_snap_bac_euk.sh b/ribo_snap_bac_euk.sh index 261e3de..97aea22 100755 --- a/ribo_snap_bac_euk.sh +++ b/ribo_snap_bac_euk.sh @@ -58,8 +58,8 @@ awk '{print$1}' $inputfile.noSmallS_LargeS.sam | sed '/^@/d' > $inputfile.noSmal # retrieve reads from original $inputfile extractSamFromSam.sh $inputfile.noSmallS_LargeS.header.sam $inputfile $basef.noRibo.annotated -echo -e "$(date)\t$scriptname\tCreated $inputfile.noRibo.annotated" -table_generator.sh $basef.noRibo.annotated SNAP N Y N N +echo -e "$(date)\t$scriptname\tCreated $inputfile.noRibo.annotated" +table_generator.sh $basef.noRibo.annotated SNAP N Y N N rm -f $inputfile.noLargeS.sam rm -f $inputfile.noLargeS.matched.sam diff --git a/slave/setup_slave.sh b/slave/setup_slave.sh index e9a52a8..58a5261 100755 --- a/slave/setup_slave.sh +++ b/slave/setup_slave.sh @@ -22,7 +22,7 @@ if [ ! -d $bin_folder ] then mkdir $bin_folder fi - + CWD=$(pwd) # set timezone diff --git a/snap_nt.sh b/snap_nt.sh index 50c31b3..f0fe92c 100755 --- a/snap_nt.sh +++ b/snap_nt.sh @@ -1,16 +1,16 @@ #!/bin/bash -# +# # snap_nt.sh # # This script runs SNAP against the NT database # Chiu Laboratory # University of California, San Francisco # -# +# # Copyright (C) 2014 Charles Chiu - All Rights Reserved # SURPI has been released under a modified BSD license. # Please see license file for details. - + expected_args=6 scriptname=${0##*/} @@ -72,12 +72,12 @@ for snap_index in $SNAP_NT_index_directory/* ; do cat $basef.snap.log >> $basef.snapNT.log cat $basef.time.log >> $basef.timeNT.log - + compare_sam.py $basef.tmp.sam $basef.prev.sam cat $basef.prev.sam | egrep -v "^@" | awk '{print "@"$1"\n"$10"\n""+"$1"\n"$11}' > $basef.tmp.fastq counter=1 - + END2=$(date +%s) echo -e "$(date)\t$scriptname\tDone with $snap_index." 
@@ -86,7 +86,7 @@ for snap_index in $SNAP_NT_index_directory/* ; do done # need to restore the hits -update_sam.py $basef.prev.sam $basef.NT.sam +update_sam.py $basef.prev.sam $basef.NT.sam rm -f $basef.tmp.sam rm -f $basef.tmp.fastq diff --git a/split_fasta.pl b/split_fasta.pl index 060cb92..e040512 100755 --- a/split_fasta.pl +++ b/split_fasta.pl @@ -80,7 +80,7 @@ #write record to file print (OUTFILE ">$sequenceTitle\n"); print (OUTFILE "$sequenceEntry\n"); - $seqCount++; + $seqCount++; $seqThisFile++; if ($seqThisFile == $numberToCopy) { diff --git a/start_slaves.sh b/start_slaves.sh index 622df47..e428d77 100755 --- a/start_slaves.sh +++ b/start_slaves.sh @@ -33,14 +33,14 @@ file_with_slave_ips=$7 placement_group=$8 ### -#This is the time (in seconds) this script waits after starting up the AWS machines. +#This is the time (in seconds) this script waits after starting up the AWS machines. #It allows the instances time to start up. In practice, 120s appears to be sufficient. WAIT_TIME=120 #folder where slave scripts are located slave_folder="/usr/local/bin/surpi/slave" #script located on slave to run -slave_script="slave_setup.sh" +slave_script="slave_setup.sh" #this parameter is currently tied to the $keypair used during slave_setup.sh. should be cleaned up prior to release pemkey="/home/ubuntu/.ssh/surpi.pem" @@ -106,7 +106,7 @@ do done # instance_id_list=( $result ) # instance_id_size=${#instance_id_list[@]} -# +# # if [ "$instance_id_size" != "$number_of_instances" ]; then # echo "ERROR: could not create $number_of_instances instances" # echo "$number_of_instances instances were created." @@ -150,7 +150,7 @@ do echo -e "$(date)\t$scriptname\t$attached" >> slave.$COUNTER.log 2>&1 #verify $attached here to verify that EBS was attached - + echo -e "$(date)\t$scriptname\tconnecting and running $slave_script script..." 
>> slave.$COUNTER.log 2>&1 ssh -o StrictHostKeyChecking=no -i $pemkey ubuntu@$host "/home/ubuntu/$slave_script" >> slave.$COUNTER.log 2>&1 & let COUNTER=COUNTER+1 diff --git a/table_generator.sh b/table_generator.sh index 240d3b1..4491367 100755 --- a/table_generator.sh +++ b/table_generator.sh @@ -7,7 +7,7 @@ # University of California, San Francisco # January, 2014 # -# input file: annotated sam or annotated -m 8 file with taxonomies provided in the following format at the end of the .sam and -m 8 file: +# input file: annotated sam or annotated -m 8 file with taxonomies provided in the following format at the end of the .sam and -m 8 file: # "gi# --family --genus --species" # Output files are tab delimited files ending in .counttable whereby rows represent taxonomic annotations at various levels (family, genus, species, gi) # Columns represent individual barcodes found in the dataset, and cells contain the number of reads @@ -20,7 +20,7 @@ scriptname=${0##*/} if [ $# -lt 6 ]; then - echo "Usage: $scriptname " + echo "Usage: $scriptname " exit fi @@ -33,7 +33,7 @@ genus=$5 family=$6 ### -###substitute forward slash with @_ because forward slash in species name makes it ungreppable. using @_ because @ is used inside the contig barcode (ie. #@13 is barcode 13, contig generated) +###substitute forward slash with @_ because forward slash in species name makes it ungreppable. using @_ because @ is used inside the contig barcode (ie. 
#@13 is barcode 13, contig generated) create_tab_delimited_table.pl -f $file_type $inputfile | sed 's/ /_/g' | sed 's/,/_/g' | sed 's/\//@_/g' > $inputfile.tempsorted echo -e "$(date)\t$scriptname\tdone creating $inputfile.tempsorted" @@ -61,7 +61,7 @@ then done echo -e "GI\tSpecies\tGenus\tFamily(@=contigbarcode)" > $inputfile.header cat $inputfile.header $inputfile.gi.uniq.columntable > $inputfile.gi.counttable_temp - paste $inputfile.gi.counttable_temp bar*.$inputfile.gi.output > $inputfile.gi.counttable + paste $inputfile.gi.counttable_temp bar*.$inputfile.gi.output > $inputfile.gi.counttable sed -i 's/@_/ /g' $inputfile.gi.counttable echo -e "$(date)\t$scriptname\tdone generating gi counttable" fi @@ -84,7 +84,7 @@ then done echo -e "Species\tGenus\tFamily(@=contigbarcode)" > $inputfile.header cat $inputfile.header $inputfile.species.uniq.columntable > $inputfile.species.counttable_temp - paste $inputfile.species.counttable_temp bar*.$inputfile.species.output > $inputfile.species.counttable + paste $inputfile.species.counttable_temp bar*.$inputfile.species.output > $inputfile.species.counttable sed -i 's/@_/ /g' $inputfile.species.counttable echo -e "$(date)\t$scriptname\tdone generating species counttable" fi @@ -106,7 +106,7 @@ then done echo -e "Genus\tFamily(@=contigbarcode)" > $inputfile.header cat $inputfile.header $inputfile.genus.uniq.columntable > $inputfile.genus.counttable_temp - paste $inputfile.genus.counttable_temp bar*.$inputfile.genus.output > $inputfile.genus.counttable + paste $inputfile.genus.counttable_temp bar*.$inputfile.genus.output > $inputfile.genus.counttable sed -i 's/@_/ /g' $inputfile.genus.counttable echo -e "$(date)\t$scriptname\tdone generating genus counttable" fi diff --git a/taxonomy_lookup_embedded.pl b/taxonomy_lookup_embedded.pl index 5709a89..7184152 100755 --- a/taxonomy_lookup_embedded.pl +++ b/taxonomy_lookup_embedded.pl @@ -9,7 +9,7 @@ # Copyright (C) 2014 Scot Federman - All Rights Reserved # SURPI has been released 
under a modified BSD license. # Please see license file for details. -# Last revised 1/26/2014 +# Last revised 1/26/2014 use DBI; use strict; @@ -36,12 +36,12 @@ if ($opt_h) { print < Date: Thu, 16 Apr 2015 19:47:05 -0400 Subject: [PATCH 07/18] Fix shebangs for python/perl scripts. --- compare_multiple_sam.py | 2 +- compare_sam.py | 2 +- coveragePlot.py | 2 +- create_tab_delimited_table.pl | 2 +- create_taxonomy_db.py | 2 +- get_genbankfasta.pl | 2 +- mapPerfectBLASTtoGenome.py | 2 +- split_fasta.pl | 2 +- taxonomy_lookup.pl | 2 +- taxonomy_lookup_embedded.pl | 3 ++- tweet.pl | 2 +- update_sam.py | 2 +- 12 files changed, 13 insertions(+), 12 deletions(-) diff --git a/compare_multiple_sam.py b/compare_multiple_sam.py index 334536b..ce151ed 100755 --- a/compare_multiple_sam.py +++ b/compare_multiple_sam.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # # compare_multiple_sam.py # diff --git a/compare_sam.py b/compare_sam.py index 7ced0a3..07e8703 100755 --- a/compare_sam.py +++ b/compare_sam.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # # compare_sam.py # diff --git a/coveragePlot.py b/coveragePlot.py index 62a7930..0c48713 100755 --- a/coveragePlot.py +++ b/coveragePlot.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # # coveragePlot.py # diff --git a/create_tab_delimited_table.pl b/create_tab_delimited_table.pl index f0a0710..a1be09e 100755 --- a/create_tab_delimited_table.pl +++ b/create_tab_delimited_table.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # # create_tab_delimited_table.pl # diff --git a/create_taxonomy_db.py b/create_taxonomy_db.py index bc8fac5..09fe402 100755 --- a/create_taxonomy_db.py +++ b/create_taxonomy_db.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # # create_taxonomy_db.py # diff --git a/get_genbankfasta.pl b/get_genbankfasta.pl index 991a385..995872e 100755 --- a/get_genbankfasta.pl +++ b/get_genbankfasta.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # # 
get_genbankfasta.pl # diff --git a/mapPerfectBLASTtoGenome.py b/mapPerfectBLASTtoGenome.py index 64ebf17..cf3da59 100755 --- a/mapPerfectBLASTtoGenome.py +++ b/mapPerfectBLASTtoGenome.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # # Simple script to iterate over the -m 8 results from a # BLAST and spits out the number of hits at each base of the diff --git a/split_fasta.pl b/split_fasta.pl index e040512..f67f660 100755 --- a/split_fasta.pl +++ b/split_fasta.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl #split_fasta.pl version 1.0 #This script accepts a file consisting of multiple FASTA formatted sequence records. #It splits the file into multiple new files, each consisting of a subset of the original records. diff --git a/taxonomy_lookup.pl b/taxonomy_lookup.pl index d8e5ba2..79c0f6e 100755 --- a/taxonomy_lookup.pl +++ b/taxonomy_lookup.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl -w # # taxonomy_lookup.pl # diff --git a/taxonomy_lookup_embedded.pl b/taxonomy_lookup_embedded.pl index 7184152..d8e7a51 100755 --- a/taxonomy_lookup_embedded.pl +++ b/taxonomy_lookup_embedded.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl # # taxonomy_lookup_embedded.pl # @@ -12,6 +12,7 @@ # Last revised 1/26/2014 use DBI; +use warnings; use strict; use Getopt::Std; use Time::HiRes qw[gettimeofday tv_interval]; diff --git a/tweet.pl b/tweet.pl index 27127ad..255896b 100755 --- a/tweet.pl +++ b/tweet.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # # tweet.pl # diff --git a/update_sam.py b/update_sam.py index 395886d..29cc545 100755 --- a/update_sam.py +++ b/update_sam.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # # update_sam.py # this script reannotates the SAM file based on the better hit identified in "compare_sam.py" From 8256bb3079ad6856d4f4fef5d507482d9356e509 Mon Sep 17 00:00:00 2001 From: Simon Ye Date: Sat, 18 Apr 2015 11:07:16 -0400 Subject: [PATCH 08/18] Append to $PATH in config file. 
--- SURPI.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SURPI.sh b/SURPI.sh index 3c4264a..84804fe 100755 --- a/SURPI.sh +++ b/SURPI.sh @@ -91,7 +91,7 @@ fi if [[ $create_config_file ]] then - echo "PATH=/usr/local/bin/surpi:/usr/local/bin:/usr/bin/:/bin" > go_$configprefix + echo "PATH=/usr/local/bin/surpi:/usr/local/bin:/usr/bin/:/bin:$PATH" > go_$configprefix echo "nohup $scriptname -f $configprefix.config > SURPI.$configprefix.log 2> SURPI.$configprefix.err" >> go_$configprefix chmod +x go_$configprefix From c2dcc66a2b240df181c72a5941cede63b1b8aed6 Mon Sep 17 00:00:00 2001 From: Simon Ye Date: Sat, 18 Apr 2015 12:02:26 -0400 Subject: [PATCH 09/18] Remove ABYSS-P from hard dependencies. --- SURPI.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SURPI.sh b/SURPI.sh index 84804fe..e0bc6ed 100755 --- a/SURPI.sh +++ b/SURPI.sh @@ -487,7 +487,7 @@ nopathf=${FASTQ_file##*/} # remove the path to file basef=${nopathf%.fastq} #verify that all software dependencies are properly installed -declare -a dependency_list=("gt" "seqtk" "fastq" "fqextract" "cutadapt" "prinseq-lite.pl" "dropcache" "$snap" "rapsearch" "fastQValidator" "abyss-pe" "ABYSS-P" "Minimo") +declare -a dependency_list=("gt" "seqtk" "fastq" "fqextract" "cutadapt" "prinseq-lite.pl" "dropcache" "$snap" "rapsearch" "fastQValidator" "abyss-pe" "Minimo") echo "-----------------------------------------------------------------------------------------" echo "DEPENDENCY VERIFICATION" echo "-----------------------------------------------------------------------------------------" From dd5d6e0321e175eb94a45f6b633791cb857c93d8 Mon Sep 17 00:00:00 2001 From: Simon Ye Date: Sat, 18 Apr 2015 12:07:45 -0400 Subject: [PATCH 10/18] Add back default value for -d in SURPI.sh. 
--- SURPI.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SURPI.sh b/SURPI.sh index e0bc6ed..1193cbe 100755 --- a/SURPI.sh +++ b/SURPI.sh @@ -26,7 +26,7 @@ scriptname=${0##*/} while getopts "$optspec" option; do case "${option}" in f) config_file=${OPTARG};; # get parameters from config file if specified - d) ref_path=${OPTARG};; # Path to reference db directory. + d) ref_path=${OPTARG:-/reference};; # Path to reference db directory. h) HELP=1;; v) VERIFICATION=1;; z) create_config_file=${OPTARG} From 52b56755ac7c9bf73e2aecbb105fb6bcd29485d6 Mon Sep 17 00:00:00 2001 From: Simon Ye Date: Wed, 20 May 2015 16:49:02 -0400 Subject: [PATCH 11/18] Use 0x4 FLAGS bit for sam merging, not edit distance. --- compare_sam.py | 125 +++++++++++++++++++++++++------------------------ 1 file changed, 64 insertions(+), 61 deletions(-) diff --git a/compare_sam.py b/compare_sam.py index 07e8703..a8feb98 100755 --- a/compare_sam.py +++ b/compare_sam.py @@ -14,13 +14,13 @@ import os def logHeader(): - import os.path, sys, time - return "%s\t%s\t" % (time.strftime("%a %b %d %H:%M:%S %Z %Y"), os.path.basename(sys.argv[0])) + import os.path, sys, time + return "%s\t%s\t" % (time.strftime("%a %b %d %H:%M:%S %Z %Y"), os.path.basename(sys.argv[0])) usage = "compare_sam.py " if len(sys.argv) != 3: - print usage - sys.exit(0) + print usage + sys.exit(0) SAMfile1 = sys.argv[1] outputFile1 = sys.argv[2] @@ -41,63 +41,66 @@ def logHeader(): linenum += 1 while line1 != '': - data1 = line1.split() - - # check to see whether line is in the SAM header or alignment section - if (data1[0][0]!="@"): - #We are in the SAM alignment section - - edit_distance=int(data1[12].split(":")[2]) - - #using old snap (<=v0.15.4), could use length of line to determine if it was a hit or miss. - #new snap (>=v1.0) instead uses an edit distance of -1 to signify a miss. 
- - if (edit_distance >= 0): - #this is a hit - new_hits += 1 - sum_edistance_new += int(data1[12].split(":")[2]) - firstentry = data1[0].split("|") - if (len(firstentry)>=2): - # this has been previously flagged as a hit, and is also now a hit. Compare edit distances to find best hit. - edit_distance_previous=firstentry[1] - edit_distance_current = data1[12].split(":")[2] - - existing_hits += 1 - sum_edistance_existing += int(edit_distance_previous) - - if (int(edit_distance_current) <= int(edit_distance_previous)): - replacements += 1 - sum_data1 += int(edit_distance_current) - # need to pick the new gi, not the old gi, bug fix 1/27/13 - gi=data1[2].split("|")[1] - replacement_line = line1.replace("|" + firstentry[1] + "|" + firstentry[2],"|" + str(edit_distance_current) + "|" + str(gi),1) - outputFile.write(replacement_line) - else: - sum_data1 += int(edit_distance_previous) - outputFile.write(line1) - else: - edit_distance_current = data1[12].split(":")[2] - existing_hits += 1 - sum_data1 += int(edit_distance_current) - gi=data1[2].split("|")[1] - replacement_line = line1.replace(data1[0],data1[0] + "|" + str(edit_distance_current) + "|" + str(gi),1) - outputFile.write(replacement_line) - replacements+=1 - else: - #this is a miss - outputFile.write(line1) - firstentry = data1[0].split("|") - if (len(firstentry)>=2): - edit_distance_previous=firstentry[1] - existing_hits += 1 - sum_data1 += int(edit_distance_previous) - sum_edistance_existing += int(edit_distance_previous) - else: - #We are in the SAM header section - outputFile.write(line1) - - line1 = file1.readline() - linenum += 1 + data1 = line1.split() + + # check to see whether line is in the SAM header or alignment section + if (data1[0][0]!="@"): + #We are in the SAM alignment section + + edit_distance=int(data1[12].split(":")[2]) + mapped = not bool(int(data1[1]) & 0x4) + + #using old snap (<=v0.15.4), could use length of line to determine if it was a hit or miss. 
+ #new snap (>=v1.0) instead uses an edit distance of -1 to signify a miss. + + # Not sure what above is saying. + # Use 0x4 mask in FLAG field to check for unmapped reads. + if (mapped): + #this is a hit + new_hits += 1 + sum_edistance_new += edit_distance + firstentry = data1[0].split("|") + if (len(firstentry)>=2): + # this has been previously flagged as a hit, and is also now a hit. Compare edit distances to find best hit. + edit_distance_previous=firstentry[1] + edit_distance_current = data1[12].split(":")[2] + + existing_hits += 1 + sum_edistance_existing += int(edit_distance_previous) + + if (int(edit_distance_current) <= int(edit_distance_previous)): + replacements += 1 + sum_data1 += int(edit_distance_current) + # need to pick the new gi, not the old gi, bug fix 1/27/13 + gi=data1[2].split("|")[1] + replacement_line = line1.replace("|" + firstentry[1] + "|" + firstentry[2],"|" + str(edit_distance_current) + "|" + str(gi),1) + outputFile.write(replacement_line) + else: + sum_data1 += int(edit_distance_previous) + outputFile.write(line1) + else: + edit_distance_current = data1[12].split(":")[2] + existing_hits += 1 + sum_data1 += int(edit_distance_current) + gi=data1[2].split("|")[1] + replacement_line = line1.replace(data1[0],data1[0] + "|" + str(edit_distance_current) + "|" + str(gi),1) + outputFile.write(replacement_line) + replacements+=1 + else: + #this is a miss + outputFile.write(line1) + firstentry = data1[0].split("|") + if (len(firstentry)>=2): + edit_distance_previous=firstentry[1] + existing_hits += 1 + sum_data1 += int(edit_distance_previous) + sum_edistance_existing += int(edit_distance_previous) + else: + #We are in the SAM header section + outputFile.write(line1) + + line1 = file1.readline() + linenum += 1 file1.close() outputFile.close() From 3b60a93b6d7d7b6f6331dbe92440f740ba1e6511 Mon Sep 17 00:00:00 2001 From: Simon Ye Date: Thu, 21 May 2015 11:26:47 -0400 Subject: [PATCH 12/18] Add debug/logging scripts. 
--- debug.sh | 41 +++++++++++++++++++++++++++++++++++++++++ logging.sh | 29 +++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100755 debug.sh create mode 100755 logging.sh diff --git a/debug.sh b/debug.sh new file mode 100755 index 0000000..8a07c32 --- /dev/null +++ b/debug.sh @@ -0,0 +1,41 @@ +#!/bin/bash +set -e +trap _exit_trap EXIT +trap _err_trap ERR +_showed_traceback=f + +function _exit_trap +{ + local _ec="$?" + if [[ $_ec != 0 && "${_showed_traceback}" != t ]]; then + traceback 1 + fi +} + +function _err_trap +{ + local _ec="$?" + local _cmd="${BASH_COMMAND:-unknown}" + traceback 1 + _showed_traceback=t + echo "The command ${_cmd} exited with exit code ${_ec}." 1>&2 +} + +function traceback +{ + # Hide the traceback() call. + local -i start=$(( ${1:-0} + 1 )) + local -i end=${#BASH_SOURCE[@]} + local -i i=0 + local -i j=0 + + echo "Traceback (last called is first):" 1>&2 + for ((i=${start}; i < ${end}; i++)); do + j=$(( $i - 1 )) + local function="${FUNCNAME[$i]}" + local file="${BASH_SOURCE[$i]}" + local line="${BASH_LINENO[$j]}" + echo " ${function}() in ${file}:${line}" 1>&2 + done +} + diff --git a/logging.sh b/logging.sh new file mode 100755 index 0000000..8168821 --- /dev/null +++ b/logging.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +########################################################################## +# library +########################################################################## + +# via http://stackoverflow.com/a/12199798/18706 +format_date() { + ((h=${1}/3600)) + ((m=(${1}%3600)/60)) + ((s=${1}%60)) + printf "%02d:%02d:%02d\n" $h $m $s +} + +start=$(date +"%s") +age() { + now=$(date +"%s") + the_age=$(($now-$start)) + format_date $the_age +} + +announce() { + echo `age`" ${FUNCNAME[ 1 ]} $1" +} + +log() { + echo -e "$(age)\t$scriptname\t $1" +} + From 1320ce26fec835e942e9f56c9e4b251fdbb7bbcd Mon Sep 17 00:00:00 2001 From: Simon Ye Date: Thu, 21 May 2015 21:41:51 -0400 Subject: [PATCH 13/18] Normalize subtraction snap 
directory name. --- create_SURPI_data.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/create_SURPI_data.sh b/create_SURPI_data.sh index e79a01b..93a1cd9 100755 --- a/create_SURPI_data.sh +++ b/create_SURPI_data.sh @@ -35,6 +35,7 @@ cleanup_dir="rawdata" #You should likely stick with the default values here. If changing them, then the corresponding #values will also need to be changed within the SURPI config file. reference_dir="reference" +subtraction_dir="Subtraction_SNAP" taxonomy_dir="taxonomy" RAPSearch_dir="RAPSearch" FAST_dir="FAST_SNAP" @@ -133,8 +134,12 @@ if [ ! -d "$reference_dir/$FAST_dir" ]; then mkdir "$reference_dir/$FAST_dir" fi +if [ ! -d "$reference_dir/$subtraction_dir" ]; then + mkdir "$reference_dir/$subtraction_dir" +fi + echo -e "$(date)\t$scriptname\tMoving curated data into place..." -mv snap_index_hg19_rRNA_mito_Hsapiens_rna "$reference_dir" +mv snap_index_hg19_rRNA_mito_Hsapiens_rna "$reference_dir/$subtraction_dir" mv snap_index_Bacterial_Refseq_05172012.CLEAN.LenFiltered.uniq_s16 "$reference_dir/$FAST_dir" mv rapsearch_viral_aa_130628_db_v2.12 "$reference_dir/$RAPSearch_dir" mv rapsearch_viral_aa_130628_db_v2.12.info "$reference_dir/$RAPSearch_dir" From a521bfc785581b2eaae2c71a249d31ebc9de12c0 Mon Sep 17 00:00:00 2001 From: Simon Ye Date: Thu, 21 May 2015 21:42:15 -0400 Subject: [PATCH 14/18] Taxonomy lookup use warnings not shebang. --- taxonomy_lookup.pl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/taxonomy_lookup.pl b/taxonomy_lookup.pl index 79c0f6e..35d7b85 100755 --- a/taxonomy_lookup.pl +++ b/taxonomy_lookup.pl @@ -1,4 +1,4 @@ -#!/usr/bin/env perl -w +#!/usr/bin/env perl # # taxonomy_lookup.pl # @@ -12,6 +12,7 @@ # Please see license file for details. 
use strict; +use warnings; #use diagnostics; use Time::HiRes qw[gettimeofday tv_interval]; use DBI; From 225d4c1bbf954e52b4f4b3e3dfbfdfb43f6a2069 Mon Sep 17 00:00:00 2001 From: Simon Ye Date: Thu, 21 May 2015 21:42:59 -0400 Subject: [PATCH 15/18] Specify fastq format in ribo snap. --- ribo_snap_bac_euk.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ribo_snap_bac_euk.sh b/ribo_snap_bac_euk.sh index 97aea22..5a8a080 100755 --- a/ribo_snap_bac_euk.sh +++ b/ribo_snap_bac_euk.sh @@ -44,12 +44,12 @@ fasta_to_fastq $inputfile.fasta > $inputfile.fakq # if Bacteria.annotated file h crop_reads.csh $inputfile.fakq 10 75 > $inputfile.fakq.crop # snap against large ribosomal subunit -snap single $SNAP_index_Large $inputfile.fakq.crop -o $inputfile.noLargeS.unmatched.sam -t $cores -x -f -h 250 -d 18 -n 200 -F u +snap single $SNAP_index_Large -fastq $inputfile.fakq.crop -o $inputfile.noLargeS.unmatched.sam -t $cores -x -f -h 250 -d 18 -n 200 -F u egrep -v "^@" $inputfile.noLargeS.unmatched.sam | awk '{if($3 == "*") print "@"$1"\n"$10"\n""+"$1"\n"$11}' > $(echo "$inputfile".noLargeS.unmatched.sam | sed 's/\(.*\)\..*/\1/').fastq echo -e "$(date)\t$scriptname\tDone: first snap alignment" # snap against small ribosomal subunit -snap single $SNAP_index_Small $inputfile.noLargeS.unmatched.fastq -o $inputfile.noSmallS_LargeS.sam -t $cores -h 250 -d 18 -n 200 -F u +snap single $SNAP_index_Small -fastq $inputfile.noLargeS.unmatched.fastq -o $inputfile.noSmallS_LargeS.sam -t $cores -h 250 -d 18 -n 200 -F u echo -e "$(date)\t$scriptname\tDone: second snap alignment" # convert snap unmatched to ribo output to header format From 911a905fcd601fb4a194761eed3c60ee3465f5ab Mon Sep 17 00:00:00 2001 From: Simon Ye Date: Thu, 21 May 2015 21:43:45 -0400 Subject: [PATCH 16/18] No dropcache in snap_nt_combine. 
--- snap_nt_combine.sh | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/snap_nt_combine.sh b/snap_nt_combine.sh index 08b71d7..cb700aa 100755 --- a/snap_nt_combine.sh +++ b/snap_nt_combine.sh @@ -19,7 +19,7 @@ # Copyright (C) 2014 Charles Chiu - All Rights Reserved # Permission to copy and modify is granted under the BSD license -expected_args=6 +expected_args=5 scriptname=${0##*/} if [ $# -lt $expected_args ] @@ -32,9 +32,8 @@ fi inputfile=$1 SNAP_NT_index_directory=$2 cores=$3 -free_cache_cutoff=$4 -SNAP_d_cutoff=$5 -simultaneous_SNAPs=$6 +SNAP_d_cutoff=$4 +simultaneous_SNAPs=$5 ### echo -e "$(date)\t$scriptname\tStarting SNAP to NT" @@ -55,14 +54,6 @@ for snap_index in $SNAP_NT_index_directory/*; do nopathsnap_index=${snap_index##*/} # remove the path to file echo -e "$(date)\t$scriptname\tStarting SNAP on $nopathsnap_index" - freemem=$(free -g | awk '{print $4}' | head -n 2 | tail -1) - echo -e "$(date)\t$scriptname\tThere is $freemem GB available free memory...[cutoff=$free_cache_cutoff GB]" - if [ $freemem -lt $free_cache_cutoff ] - then - echo -e "$(date)\t$scriptname\tClearing cache..." - dropcache - fi - START_SNAP=$(date +%s) /usr/bin/time -o $basef.$nopathsnap_index.snap.log snap single $snap_index $basef.fastq -o $basef.$nopathsnap_index.sam -t $cores -x -f -h 250 -d $SNAP_d_cutoff -n 25 > $basef.$nopathsnap_index.time.log SNAP_DONE=$(date +%s) From b998dc5f93337628751415b9d4ea7da23fedf952 Mon Sep 17 00:00:00 2001 From: Simon Ye Date: Thu, 21 May 2015 21:46:21 -0400 Subject: [PATCH 17/18] Cleanup and no dropcache. 
--- abyss_minimus.sh | 53 +++++++++++++----------- extractHeaderFromFastq_ncores.sh | 2 +- preprocess.sh | 28 +++++++------ preprocess_ncores.sh | 61 ++++++++++++--------------- snap_nt.sh | 71 ++++++++++++++++---------------- 5 files changed, 108 insertions(+), 107 deletions(-) diff --git a/abyss_minimus.sh b/abyss_minimus.sh index cc924f5..99d62d7 100755 --- a/abyss_minimus.sh +++ b/abyss_minimus.sh @@ -15,8 +15,9 @@ # Copyright (C) 2014 Samia N Naccache - All Rights Reserved # SURPI has been released under a modified BSD license. # Please see license file for details. - scriptname=${0##*/} +source debug.sh +source logging.sh if [ $# -lt 5 ] then @@ -34,7 +35,7 @@ ignore_barcodes=$6 split_FASTA_size=100000 #### create list of barcodes in the fasta file in the following format: #barcode -echo -e "$(date)\t$scriptname\tDemultiplexing barcodes..." +log "Demultiplexing barcodes..." START_DEMULTIPLEX=$(date +%s) #The sed '/^#$/d' removes a blank #. This is needed in cases where barcodes are present along with reads that have no barcodes @@ -54,10 +55,10 @@ else #$inputfile.barcodes doesn't exist, or size=0, then we only have 1 barcode, echo "#" > $inputfile.barcodes fi -echo -e "$(date)\t$scriptname\tCompleted demultiplexing barcodes." +log "Completed demultiplexing barcodes." END_DEMULTIPLEX=$(date +%s) diff_DEMULTIPLEX=$(( END_DEMULTIPLEX - START_DEMULTIPLEX )) -echo -e "$(date)\t$scriptname\tBarcode demultiplex took $diff_DEMULTIPLEX s." +log "Barcode demultiplex took $diff_DEMULTIPLEX s." ### generate fasta file for every separate barcode (demultiplex) for f in `cat $inputfile.barcodes` ; do @@ -68,30 +69,33 @@ for f in `cat $inputfile.barcodes` ; do grep -E "$f(/|$)" $inputfile -A 1 --no-group-separator > bar$f.$inputfile fi # split demultiplexed fasta file into 100,000 read sub-fastas - echo -e "$(date)\t$scriptname\tSplitting FASTA into chunks of size: $split_FASTA_size." + log "Splitting FASTA into chunks of size: $split_FASTA_size." 
START_SPLIT=$(date +%s) cp bar$f.$inputfile bar$f.${inputfile}_n # So that the unsplit demultiplexed file is also denovo assembled # split_fasta.pl -i bar$f.$inputfile -o bar$f.$inputfile -n $split_FASTA_size - echo -e "$(date)\t$scriptname\tCompleted splitting FASTA file." + log "Completed splitting FASTA file." END_SPLIT=$(date +%s) diff_SPLIT=$(( $END_SPLIT - $START_SPLIT )) - echo -e "$(date)\t$scriptname\tSplitting FASTA took $diff_SPLIT s." + log "Splitting FASTA took $diff_SPLIT s." ### run abyss (deBruijn assembler) on each 100,000 read demultiplexed fasta file, including the unsplit demultiplexed file - echo -e "$(date)\t$scriptname\tRunning abyss on each $split_FASTA_size chunk..." + log "Running abyss on each $split_FASTA_size chunk..." START_ABYSS=$(date +%s) for d in bar$f.${inputfile}_* ; do - abyss-pe k=$kmer name=$d.f se=$d np=$cores >& $d.abyss.log + echo $d + log "Command: abyss-pe k=$kmer name=$d.f se=$d np=$cores >& $d.abyss.log" + #abyss-pe k=$kmer name=$d.f se=$d np=$cores >& $d.abyss.log + abyss-pe k=$kmer name=$d.f se=$d >& $d.abyss.log done - echo -e "$(date)\t$scriptname\tCompleted running abyss on each $split_FASTA_size chunk." + log "Completed running abyss on each $split_FASTA_size chunk." END_ABYSS=$(date +%s) diff_ABYSS=$(( END_ABYSS - START_ABYSS )) - echo -e "$(date)\t$scriptname\tAbyss took $diff_ABYSS s." + log "Abyss took $diff_ABYSS s." 
### ### contigs from split files concatenated, after which reads smaller than the cutoff value are eliminated - echo -e "$(date)\t$scriptname\tStarting concatenating and cutoff of abyss output" + log "Starting concatenating and cutoff of abyss output" START_CATCONTIGS=$(date +%s) #concatenating contig files from different fasta splits, and adding kmer infromation and barcode information to contig headers @@ -99,21 +103,22 @@ for f in `cat $inputfile.barcodes` ; do # only contigs larger than $cutoff_post_Abyss are retained cat all.bar$f.$inputfile.unitigs.fa | awk 'NR%2==1 {x = $0} NR%2==0 { if (length($0) >= '$2') printf("%s\n%s\n",x,$0)}' > all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss.fa - echo -e "$(date)\t$scriptname\tDone concatenating and cutoff of abyss output" + log "Done concatenating and cutoff of abyss output" END_CATCONTIGS=$(date +%s) diff_CATCONTIGS=$(( END_CATCONTIGS - START_CATCONTIGS )) - echo -e "$(date)\t$scriptname\tConcatenating and cutoff of abyss output took $diff_CATCONTIGS s." + log "Concatenating and cutoff of abyss output took $diff_CATCONTIGS s." ### ### run Minimo (OLC assembler) - echo -e "$(date)\t$scriptname\tStarting Minimo..." + log "Starting Minimo..." START_MINIMO=$(date +%s) + log "Command: Minimo all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss.fa -D FASTA_EXP=1" Minimo all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss.fa -D FASTA_EXP=1 - echo -e "$(date)\t$scriptname\tCompleted Minimo." + log "Completed Minimo." END_MINIMO=$(date +%s) diff_MINIMO=$(( END_MINIMO - START_MINIMO )) - echo -e "$(date)\t$scriptname\tMinimo took $diff_MINIMO s." + log "Minimo took $diff_MINIMO s." 
########### - echo -e "$(date)\t$scriptname\tStarting cat barcode addition and cutoff of minimo output" + log "Starting cat barcode addition and cutoff of minimo output" START_MIN_PROCESS=$(date +%s) # Minimo output gives more than one line per sequence, so here we linearize sequences (linearization protocol from here http://seqanswers.com/forums/showthread.php?t=27567 ) # then we add the relevant barcode to the end of the header contig. Contigs that survive untouched from abyss already have a barcode at the end of them, so that extra barcode is taken away @@ -123,23 +128,23 @@ for f in `cat $inputfile.barcodes` ; do # only contigs larger than $cutoff_post_Minimo are retained cat all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss-minim.fa | awk 'NR%2==1 {x = $0} NR%2==0 { if (length($0) >= '$3') printf("%s\n%s\n",x,$0)}' > all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss.${cutoff_post_Minimo}-mini.fa - echo -e "$(date)\t$scriptname\tDone cat barcode addition and cutoff of minimo output" + log "Done cat barcode addition and cutoff of minimo output" END_MIN_PROCESS=$(date +%s) diff_MIN_PROCESS=$(( END_MIN_PROCESS - START_MIN_PROCESS )) - echo -e "$(date)\t$scriptname\tcat barcode addition and cutoff of minimo output took $diff_MIN_PROCESS s" + log "cat barcode addition and cutoff of minimo output took $diff_MIN_PROCESS s" done ### ### concatenate deBruijn -> OLC contigs from all barcodes together -echo -e "$(date)\t$scriptname\tConcatenating all barcodes together..." +log "Concatenating all barcodes together..." START_CAT=$(date +%s) cat all.bar*.$inputfile.unitigs.cut$cutoff_post_Abyss.${cutoff_post_Minimo}-mini.fa > all.$inputfile.unitigs.cut$cutoff_post_Abyss.${cutoff_post_Minimo}-mini.fa -echo -e "$(date)\t$scriptname\tCompleted concatenating all barcodes." +log "Completed concatenating all barcodes." END_CAT=$(date +%s) diff_CAT=$(( END_CAT - START_CAT )) -echo -e "$(date)\t$scriptname\tBarcode concatenation took $diff_CAT s." 
+log "Barcode concatenation took $diff_CAT s." # cleaning up files by organizing directories, moving files into directories, and removing temporary files -mkdir $inputfile.dir +mkdir --parents $inputfile.dir mv bar*$inputfile*.fa $inputfile.dir if [ -e all.$inputfile.contigs.abyssmini.cut$cutoff_post_Abyss.$cutoff_post_Minimo.e1.NR.RAPSearch.m8 ] then diff --git a/extractHeaderFromFastq_ncores.sh b/extractHeaderFromFastq_ncores.sh index c0e63eb..f95e4ee 100755 --- a/extractHeaderFromFastq_ncores.sh +++ b/extractHeaderFromFastq_ncores.sh @@ -50,7 +50,7 @@ split -l $LinesPerCore -a 3 $parentfile $parentfile.SplitXS & awk '{print$1}' $queryfile > $queryfile.header & echo -e "$(date)\t$scriptname\textracting header from $queryfile" -for job in `jobs -p` +for job in $(jobs -p) do wait $job done diff --git a/preprocess.sh b/preprocess.sh index 1791961..032ba42 100755 --- a/preprocess.sh +++ b/preprocess.sh @@ -17,6 +17,8 @@ # Please see license file for details. scriptname=${0##*/} +source debug.sh +source logging.sh if [ $# != 10 ]; then echo "Usage: $scriptname " @@ -44,15 +46,15 @@ fi if [ $quality = "S" ] then - echo -e "$(date)\t$scriptname\tselected Sanger quality" + log "selected Sanger quality" else - echo -e "$(date)\t$scriptname\tselected Illumina quality" + log "selected Illumina quality" fi # fix header if space is present s=`head -1 $inputfile | awk '{if ($0 ~ / /) {print "SPACE"} else {print "NOSPACE"}}'` -echo -e "$(date)\t$scriptname\t$s in header" +log "$s in header" nopathf=${1##*/} basef=${nopathf%.fastq} @@ -60,7 +62,7 @@ basef=${nopathf%.fastq} #################### START OF PREPROCESSING, READ1 ######################### # run cutadapt, Read1 -echo -e "$(date)\t$scriptname\t********** running cutadapt, Read1 **********" +log "********** running cutadapt, Read1 **********" if [ $s == "SPACE" ] then sed "s/\([@HWI|@M00135|@SRR][^ ]*\) \(.\):.:0:\(.*\)/\1#\3\/\2/g" $inputfile > $basef.modheader.fastq @@ -76,12 +78,12 @@ fi END1=$(date +%s) diff=$(( $END1 
- $START1 )) -echo -e "$(date)\t$scriptname\tDone cutadapt: CUTADAPT took $diff seconds" +log "Done cutadapt: CUTADAPT took $diff seconds" # run uniq, Read1 if [ $run_uniq == "Y" ] then - echo -e "$(date)\t$scriptname\t********** running uniq, Read1 **********" + log "********** running uniq, Read1 **********" START1=$(date +%s) if [ $quality = "S" ] @@ -93,28 +95,28 @@ then END1=$(date +%s) diff=$(( $END1 - $START1 )) - echo -e "$(date)\t$scriptname\tDone uniq: UNIQ took $diff seconds" + log "Done uniq: UNIQ took $diff seconds" fi # run crop, Read 1 -echo -e "$(date)\t$scriptname\t********** running crop, Read1 **********" +log "********** running crop, Read1 **********" START1=$(date +%s) if [ $run_uniq == "Y" ] then - echo -e "$(date)\t$scriptname\tWe will be using $crop_length as the length of the cropped read" + log "We will be using $crop_length as the length of the cropped read" crop_reads.csh $basef.cutadapt.uniq.fastq $start_nt $crop_length > $basef.cutadapt.uniq.cropped.fastq else - echo -e "$(date)\t$scriptname\tWe will be using $crop_length as the length of the cropped read" + log "We will be using $crop_length as the length of the cropped read" crop_reads.csh $basef.cutadapt.fastq $start_nt $crop_length > $basef.cutadapt.cropped.fastq fi END1=$(date +%s) diff=$(( $END1 - $START1 )) -echo -e "$(date)\t$scriptname\tDone crop: CROP took $diff seconds" +log "Done crop: CROP took $diff seconds" # run dust, Read1 -echo -e "$(date)\t$scriptname\t********** running dust, Read1 **********" +log "********** running dust, Read1 **********" START1=$(date +%s) if [ $run_uniq == "Y" ] @@ -128,4 +130,4 @@ fi END1=$(date +%s) diff=$(( $END1 - $START1 )) -echo -e "$(date)\t$scriptname\tDone dust: DUST took $diff seconds" +log "Done dust: DUST took $diff seconds" diff --git a/preprocess_ncores.sh b/preprocess_ncores.sh index 89760a5..1b8ba03 100755 --- a/preprocess_ncores.sh +++ b/preprocess_ncores.sh @@ -13,8 +13,9 @@ # Please see license file for details. 
scriptname=${0##*/} +source logging.sh -if [ $# != 12 ]; then +if [ $# != 11 ]; then echo "Usage: $scriptname <# of cores> " exit fi @@ -25,13 +26,12 @@ quality=$2 run_uniq=$3 length_cutoff=$4 cores=$5 -cache_reset=$6 -keep_short_reads=$7 -adapter_set=$8 -start_nt=$9 -crop_length=${10} -temporary_files_directory=${11} -quality_cutoff=${12} +keep_short_reads=$6 +adapter_set=$7 +start_nt=$8 +crop_length=${9} +temporary_files_directory=${10} +quality_cutoff=${11} ### if [ ! -f $inputfile ] @@ -40,40 +40,33 @@ then exit fi -freemem=$(free -g | awk '{print $4}' | head -n 2 | tail -1) -echo -e "$(date)\t$scriptname\tThere is $freemem GB available free memory...[cutoff=$free_cache_cutoff GB]" -if [ $freemem -lt $free_cache_cutoff ] -then - echo -e "$(date)\t$scriptname\tClearing cache..." - dropcache -fi - START=$(date +%s) -echo -e "$(date)\t$scriptname\tSplitting $inputfile..." +log "Splitting $inputfile..." let "numlines = `wc -l $inputfile | awk '{print $1}'`" let "FASTQentries = numlines / 4" -echo -e "$(date)\t$scriptname\tThere are $FASTQentries FASTQ entries in $inputfile" +log "There are $FASTQentries FASTQ entries in $inputfile" let "LinesPerCore = numlines / $cores" -let "FASTQperCore = LinesPerCore / 4" +# Prevent leaving a small remainder file after split by adding 1. +let "FASTQperCore = LinesPerCore / 4 + 1" let "SplitPerCore = FASTQperCore * 4" -echo -e "$(date)\t$scriptname\twill use $cores cores with $FASTQperCore entries per core" +log "will use $cores cores with $FASTQperCore entries per core" split -l $SplitPerCore $inputfile END_SPLIT=$(date +%s) diff_SPLIT=$(( END_SPLIT - START )) -echo -e "$(date)\t$scriptname\tDone splitting: " -echo -e "$(date)\t$scriptname\tSPLITTING took $diff_SPLIT seconds" +log "Done splitting: " +log "SPLITTING took $diff_SPLIT seconds" -echo -e "$(date)\t$scriptname\tRunning preprocess script for each chunk..." +log "Running preprocess script for each chunk..." for f in x?? 
do mv $f $f.fastq - echo -e "$(date)\t$scriptname\tpreprocess.sh $f.fastq $quality N $length_cutoff $keep_short_reads $adapter_set $start_nt $crop_length $temporary_files_directory >& $f.preprocess.log &" + log "preprocess.sh $f.fastq $quality N $length_cutoff $keep_short_reads $adapter_set $start_nt $crop_length $temporary_files_directory >& $f.preprocess.log &" preprocess.sh $f.fastq $quality N $length_cutoff $keep_short_reads $adapter_set $start_nt $crop_length $temporary_files_directory $quality_cutoff >& $f.preprocess.log & done @@ -82,7 +75,7 @@ do wait $job done -echo -e "$(date)\t$scriptname\tDone preprocessing for each chunk..." +log "Done preprocessing for each chunk..." nopathf2=${1##*/} basef2=${nopathf2%.fastq} @@ -117,11 +110,11 @@ do rm -f $basef.cutadapt.cropped.fastq done -echo -e "$(date)\t$scriptname\tDone concatenating output..." +log "Done concatenating output..." if [ $run_uniq == "Y" ]; # selecting unique reads then - echo -e "$(date)\t$scriptname\tSelecting unique reads" + log "Selecting unique reads" START_UNIQ=$(date +%s) # selecting unique reads sed "n;n;n;d" $basef2.preprocessed.fastq | sed "n;n;d" | sed "s/^@/>/g" > $basef2.preprocessed.fasta @@ -130,28 +123,28 @@ then cp -f $basef2.uniq.fastq $basef2.preprocessed.fastq END_UNIQ=$(date +%s) diff_UNIQ=$(( END_UNIQ - START_UNIQ )) - echo -e "$(date)\t$scriptname\tUNIQ took $diff_UNIQ seconds" + log "UNIQ took $diff_UNIQ seconds" else - echo -e "$(date)\t$scriptname\tIncluding duplicates (did not run UNIQ)" + log "Including duplicates (did not run UNIQ)" fi END=$(date +%s) diff_TOTAL=$(( END - START )) let "avgtime1=`grep CUTADAPT $basef2.preprocess.log | awk '{print $12}' | sort -n | awk '{ a[i++]=$1} END {print a[int(i/2)];}'`" -echo -e "$(date)\t$scriptname\tmedian CUTADAPT time per core: $avgtime1 seconds" +log "median CUTADAPT time per core: $avgtime1 seconds" if [ $run_uniq = "Y" ]; then let "avgtime2 = $diff_UNIQ" - echo -e "$(date)\t$scriptname\tUNIQ time: $diff_UNIQ seconds" + log 
"UNIQ time: $diff_UNIQ seconds" else let "avgtime2=0" fi let "avgtime3=`grep DUST $basef2.preprocess.log | awk '{print $12}' | sort -n | awk '{ a[i++]=$1} END {print a[int(i/2)];}'`" -echo -e "$(date)\t$scriptname\tmedian DUST time per core: $avgtime3 seconds" +log "median DUST time per core: $avgtime3 seconds" let "totaltime = diff_SPLIT + avgtime1 + avgtime2 + avgtime3" -echo -e "$(date)\t$scriptname\tTOTAL TIME: $totaltime seconds" +log "TOTAL TIME: $totaltime seconds" -echo -e "$(date)\t$scriptname\tTOTAL CLOCK TIME (INCLUDING OVERHEAD): $diff_TOTAL seconds" +log "TOTAL CLOCK TIME (INCLUDING OVERHEAD): $diff_TOTAL seconds" diff --git a/snap_nt.sh b/snap_nt.sh index f0fe92c..c010cff 100755 --- a/snap_nt.sh +++ b/snap_nt.sh @@ -10,13 +10,14 @@ # Copyright (C) 2014 Charles Chiu - All Rights Reserved # SURPI has been released under a modified BSD license. # Please see license file for details. - -expected_args=6 scriptname=${0##*/} +source debug.sh +source logging.sh +expected_args=5 if [ $# -lt $expected_args ] then - echo "Usage: $scriptname " + echo "Usage: $scriptname " exit 65 fi @@ -24,49 +25,48 @@ fi inputfile=$1 SNAP_NT_index_directory=$2 cores=$3 -free_cache_cutoff=$4 -SNAP_d_cutoff=$5 -snap=$6 +SNAP_d_cutoff=$4 +snap=$5 ### -echo -e "$(date)\t$scriptname\tStarting SNAP to NT" +log "Starting SNAP to NT" START1=$(date +%s) -echo -e "$(date)\t$scriptname\tInput file: $inputfile" +log "Input file: $inputfile" nopathf=${inputfile##*/} # remove the path to file -echo -e "$(date)\t$scriptname\tAfter removing path: $nopathf" +log "After removing path: $nopathf" basef=${nopathf%.fastq} # remove FASTQextension -echo -e "$(date)\t$scriptname\tAfter removing FASTQ extension: $basef" - -echo -e "$(date)\t$scriptname\tMapping $basef to NT..." - -rm -f $basef.prev.sam -rm -f $basef.tmp.sam -rm -f $basef.tmp2.sam -rm -f $basef.NT.sam - +log "After removing FASTQ extension: $basef" + +log "Mapping $basef to NT..." 
+ +cleanup() { + rm -f $basef.prev.sam || true + rm -f $basef.tmp.sam || true + rm -f $basef.tmp2.sam || true + rm -f $basef.NT.sam || true + rm $basef.snapNT.log || true + rm $basef.timeNT.log || true +} +cleanup counter=0 -for snap_index in $SNAP_NT_index_directory/* ; do - freemem=$(free -g | awk '{print $4}' | head -n 2 | tail -1) - echo -e "$(date)\t$scriptname\tThere is $freemem GB available free memory...[cutoff=$free_cache_cutoff GB]" - if [ $freemem -lt $free_cache_cutoff ] - then - echo -e "$(date)\t$scriptname\tClearing cache..." - dropcache - fi - nopathsnap_index=${snap_index##*/} # remove the path to file - echo -e "$(date)\t$scriptname\tFound $snap_index ... processing ..." +for snap_index_basename in $(ls -1v "$SNAP_NT_index_directory") ; do + nopathsnap_index=$snap_index_basename # remove the path to file + snap_index="$SNAP_NT_index_directory/$snap_index_basename" + log "Found $snap_index_basename ... processing ..." START2=$(date +%s) if [[ $counter -eq 0 ]] then #running first SNAP chunk - /usr/bin/time -o $basef.snap.log $snap single $snap_index $basef.fastq -o $basef.tmp.sam -t $cores -x -f -h 250 -d $SNAP_d_cutoff -n 25 > $basef.time.log + /usr/bin/time -o $basef.time.log $snap single $snap_index $basef.fastq -o $basef.$nopathsnap_index.sam -t $cores -x -f -h 250 -d $SNAP_d_cutoff -n 25 > $basef.snap.log + ln --symbolic --force $basef.$nopathsnap_index.sam $basef.tmp.sam # cp $basef.tmp.sam temp.sam else #running 2nd SNAP chunk through last SNAP chunk - /usr/bin/time -o $basef.snap.log $snap single $snap_index $basef.tmp.fastq -o $basef.tmp.sam -t $cores -x -f -h 250 -d $SNAP_d_cutoff -n 25 > $basef.time.log + /usr/bin/time -o $basef.time.log $snap single $snap_index $basef.tmp.fastq -o $basef.$nopathsnap_index.sam -t $cores -x -f -h 250 -d $SNAP_d_cutoff -n 25 > $basef.snap.log + ln --symbolic --force $basef.$nopathsnap_index.sam $basef.tmp.sam fi cat $basef.snap.log >> $basef.snapNT.log @@ -80,9 +80,9 @@ for snap_index in 
$SNAP_NT_index_directory/* ; do END2=$(date +%s) - echo -e "$(date)\t$scriptname\tDone with $snap_index." + log "Done with $snap_index." diff=$(( $END2 - $START2 )) - echo -e "$(date)\t$scriptname\tMapping of $snap_index took $diff seconds." + log "Mapping of $snap_index took $diff seconds." done # need to restore the hits @@ -91,9 +91,10 @@ update_sam.py $basef.prev.sam $basef.NT.sam rm -f $basef.tmp.sam rm -f $basef.tmp.fastq rm -f $basef.prev.sam +rm -f $basef.$nopathsnap_index.* END1=$(date +%s) -echo -e "$(date)\t$scriptname\tDone with SNAP_NT" +log "Done with SNAP_NT" diff=$(( $END1 - $START1 )) -echo -e "$(date)\t$scriptname\toutput written to $basef.NT.sam" -echo -e "$(date)\t$scriptname\tSNAP_NT took $diff seconds" +log "output written to $basef.NT.sam" +log "SNAP_NT took $diff seconds" From f60acf68a3403cb22c517ed68cd0bb190467111f Mon Sep 17 00:00:00 2001 From: Simon Ye Date: Thu, 21 May 2015 21:46:58 -0400 Subject: [PATCH 18/18] Cleanup SURPI No snap-dev, just snap No req on ABYSS-P No dropcache Add specific checkpointing file --- SURPI.sh | 519 ++++++++++++++++++++++++++----------------------------- 1 file changed, 245 insertions(+), 274 deletions(-) diff --git a/SURPI.sh b/SURPI.sh index 1193cbe..fe9a19c 100755 --- a/SURPI.sh +++ b/SURPI.sh @@ -14,14 +14,16 @@ SURPI_version="1.0.22" optspec=":f:d:hvz:" -bold=$(tput bold) -normal=$(tput sgr0) +bold=$(tput bold 2> /dev/null || true) +normal=$(tput sgr0 2> /dev/null || true) green='\e[0;32m' red='\e[0;31m' endColor='\e[0m' host=$(hostname) scriptname=${0##*/} +source debug.sh +source logging.sh while getopts "$optspec" option; do case "${option}" in @@ -29,10 +31,10 @@ while getopts "$optspec" option; do d) ref_path=${OPTARG:-/reference};; # Path to reference db directory. h) HELP=1;; v) VERIFICATION=1;; - z) create_config_file=${OPTARG} + z) create_config_file=${OPTARG} configprefix=${create_config_file%.fastq} ;; - :) echo "Option -$OPTARG requires an argument." 
>&2 + :) echo "Option -$OPTARG requires an argument." >&2 exit 1 ;; esac @@ -40,58 +42,59 @@ done if [[ $HELP -eq 1 || $# -lt 1 ]] then - cat < go_$configprefix + echo "PATH=$PATH:/usr/local/bin/surpi:/usr/local/bin:/usr/bin/:/bin" > go_$configprefix echo "nohup $scriptname -f $configprefix.config > SURPI.$configprefix.log 2> SURPI.$configprefix.err" >> go_$configprefix chmod +x go_$configprefix @@ -153,7 +156,8 @@ run_mode="Comprehensive" # $basef.cutadapt.fastq # $basef.preprocessed.fastq #preprocess="skip" - +#human_mapping="skip" +#alignment="skip" ########################## # Preprocessing @@ -178,7 +182,7 @@ quality_cutoff=18 ########################## #SNAP executable -snap="/usr/local/bin/snap-dev" +snap="snap" #SNAP edit distance for Computational Subtraction of host genome [Highly recommended default: d_human=12] #see Section 3.1.2 MaxDist description: http://snap.cs.berkeley.edu/downloads/snap-1.0beta-manual.pdf @@ -247,7 +251,7 @@ eBLASTn="1e-15" # SURPI will subtract all SNAP databases found in this directory from the input sequence # Useful if you want to subtract multiple genomes (without combining SNAP databases) # or, if you need to split a db if it is larger than available RAM. 
-SNAP_subtraction_db="$ref_path/hg19" +SNAP_subtraction_folder="$ref_path/Subtraction_SNAP" # directory for SNAP-indexed databases of NCBI NT (for mapping phase in comprehensive mode) # directory must ONLY contain snap indexed databases @@ -266,11 +270,11 @@ taxonomy_db_directory="$ref_path/taxonomy" #RAPSearch viral database name: indexed protein dataset (all of Viruses) #make sure that directory also includes the .info file -RAPSearch_VIRUS_db="$ref_path/RAPSearch/rapsearch_viral_db" +RAPSearch_VIRUS_db="$ref_path/RAPSearch/rapsearch_viral_aa_130628_db_v2.12" #RAPSearch nr database name: indexed protein dataset (all of NR) #make sure that directory also includes the .info file -RAPSearch_NR_db="$ref_path/RAPSearch/rapsearch_nr_db" +RAPSearch_NR_db="$ref_path/RAPSearch/rapsearch_nr_db_v2.12" ribo_snap_bac_euk_directory="$ref_path/RiboClean_SNAP" @@ -287,11 +291,6 @@ ribo_snap_bac_euk_directory="$ref_path/RiboClean_SNAP" #This folder will not be created by SURPI, so be sure it already exists with proper permissions. temporary_files_directory="/tmp/" -#This parameter controls whether dropcache is used throughout the pipeline. If free RAM is less than cache_reset, -# then dropcache. If cache_reset=0, then dropcache will never be used. -cache_reset="0" - - ########################## # AWS related values ########################## @@ -403,25 +402,6 @@ then exit 65 fi -#set cache_reset. if none specified: -# >500GB -> 200GB -# >200GB -> 150GB -# otherwise -> 50GB -# note: this may cause problems on a machine with <50GB RAM -if [ ! $cache_reset ] -then - total_ram=$(grep MemTotal /proc/meminfo | awk '{print $2}') - if [ "$total_ram" -gt "500000000" ] #500GiB - then - cache_reset=200 # This is 200GB - elif [ "$total_ram" -gt "200000000" ] #200 GiB - then - cache_reset=150 - else - cache_reset=50 - fi -fi - #these 2 parameters are for cropping prior to snap in the preprocessing stage if [ ! 
$start_nt ] then @@ -512,10 +492,8 @@ seqtk_version=$(seqtk 2>&1 | head -3 | tail -1 | awk '{print $2}') cutadapt_version=$(cutadapt --version) prinseqlite_version=$(prinseq-lite.pl --version 2>&1 | awk '{print $2}') snap_version=$(snap 2>&1 | grep version | awk '{print $5}') -snap_dev_version=$(snap-dev 2>&1 | grep version | awk '{print $5}') rapsearch_version=$(rapsearch 2>&1 | head -2 | tail -1 | awk '{print $2}') abyss_pe_version=$(abyss-pe version | head -2 | tail -1 | awk '{print $3}') -ABYSS_P_version=$(ABYSS-P --version | head -1 | awk '{print $3}') Minimo_version=$(Minimo -h | tail -2 | awk '{print $2}') echo -e "SURPI version: $SURPI_version" echo -e "config file version: $config_file_version" @@ -524,10 +502,8 @@ echo -e "seqtk: $seqtk_version" echo -e "cutadapt: $cutadapt_version" echo -e "prinseq-lite: $prinseqlite_version${endColor}" echo -e "snap: $snap_version${endColor}" -echo -e "snap-dev: $snap_dev_version${endColor}" echo -e "RAPSearch: $rapsearch_version" echo -e "abyss-pe: $abyss_pe_version" -echo -e "ABYSS-P: $ABYSS_P_version" echo -e "Minimo: $Minimo_version" echo "-----------------------------------------------------------------------------------------" @@ -547,9 +523,10 @@ do done echo -e "SNAP Comprehensive Mode database" -for f in $SNAP_COMPREHENSIVE_db_dir/* +for f_basename in $(ls -1v "$SNAP_COMPREHENSIVE_db_dir") do - if [ -f $f/Genome ] + f="$SNAP_COMPREHENSIVE_db_dir/$f_basename" + if [ -f $f/Genome ] then echo -e "\t$f: ${green}OK${endColor}" else @@ -578,7 +555,8 @@ done #verify taxonomy is functioning properly result=$( taxonomy_lookup_embedded.pl -d nucl -q $taxonomy_db_directory 149408158 ) -if [ $result = "149408158" ] +echo "taxonomy result: ${result}" +if [ $result == "149408158" ] then echo -e "taxonomy: ${green}OK${endColor}" else @@ -679,7 +657,6 @@ echo "contigcutoff for abyss assembly unitigs: $contigcutoff" echo "abysskmer length: $abysskmer" echo "Ignore barcodes for assembly? 
$ignore_barcodes_for_de_novo" -echo "cache_reset (if 0, then dropcache will never be used): $cache_reset" echo "start_nt: $start_nt" echo "crop_length: $crop_length" @@ -736,10 +713,10 @@ then exit fi ########################################################### -echo -e "$(date)\t$scriptname\t########## STARTING SURPI PIPELINE ##########" +log "########## STARTING SURPI PIPELINE ##########" START_PIPELINE=$(date +%s) -echo -e "$(date)\t$scriptname\tFound file $FASTQ_file" -echo -e "$(date)\t$scriptname\tAfter removing path: $nopathf" +log "Found file $FASTQ_file" +log "After removing path: $nopathf" ############ Start up AWS slave machines ################## file_with_slave_ips="slave_list.txt" @@ -747,49 +724,51 @@ file_with_slave_ips="slave_list.txt" if [ "$snap_nt_procedure" = "AWS_master_slave" ] then # start the slaves as a background process. They should be ready to run at the SNAP to NT step in the pipeline. - start_slaves.sh $ami_id $actual_slave_instances $instance_type $keypair $security_group $availability_zone $file_with_slave_ips $placement_group & # > $basef.AWS.log 2>&1 + start_slaves.sh $ami_id $actual_slave_instances $instance_type $keypair $security_group $availability_zone $file_with_slave_ips $placement_group & # > $basef.AWS.log "2>&1 fi ############ PREPROCESSING ################## -if [ "$preprocess" != "skip" ] +if grep -qs preprocess_complete "$basef.checkpoint"; then + preprocess_complete=1 +fi +if [[ "$preprocess" != "skip" ]] && [[ "$preprocess_complete" != 1 ]] then - echo -e "$(date)\t$scriptname\t############### PREPROCESSING ###############" - echo -e "$(date)\t$scriptname\tStarting: preprocessing using $cores cores " + if [[ -f $basef.cutadapt.fastq.done ]]; then + cp $basef.cutadapt.fastq + fi + log "############### PREPROCESSING ###############" + log "Starting: preprocessing using $cores cores " START_PREPROC=$(date +%s) - echo -e "$(date)\t$scriptname\tParameters: preprocess_ncores.sh $basef.fastq $quality N $length_cutoff $cores 
Y N $adapter_set $start_nt $crop_length $temporary_files_directory >& $basef.preprocess.log" - preprocess_ncores.sh $basef.fastq $quality N $length_cutoff $cores $cache_reset N $adapter_set $start_nt $crop_length $temporary_files_directory $quality_cutoff >& $basef.preprocess.log - echo -e "$(date)\t$scriptname\tDone: preprocessing " + log "Parameters: preprocess_ncores.sh $basef.fastq $quality N $length_cutoff $cores Y N $adapter_set $start_nt $crop_length $temporary_files_directory >& $basef.preprocess.log" + preprocess_ncores.sh $basef.fastq $quality N $length_cutoff $cores N $adapter_set $start_nt $crop_length $temporary_files_directory $quality_cutoff >& $basef.preprocess.log + log "Done: preprocessing " END_PREPROC=$(date +%s) diff_PREPROC=$(( END_PREPROC - START_PREPROC )) - echo -e "$(date)\t$scriptname\tPreprocessing took $diff_PREPROC seconds" | tee timing.$basef.log -fi -# verify preprocessing step -if [ ! -s $basef.cutadapt.fastq ] || [ ! -s $basef.preprocessed.fastq ] -then - echo -e "$(date)\t$scriptname\t${red}Preprocessing appears to have failed. One of the following files does not exist, or is of 0 size:${endColor}" - echo "$basef.cutadapt.fastq" - echo "$basef.preprocessed.fastq" - exit + log "Preprocessing took $diff_PREPROC seconds" | tee timing.$basef.log + + # verify preprocessing step + if [ ! -s $basef.cutadapt.fastq ] || [ ! -s $basef.preprocessed.fastq ] + then + log "${red}Preprocessing appears to have failed. 
One of the following files does not exist, or is of 0 size:${endColor}" + echo "$basef.cutadapt.fastq" + echo "$basef.preprocessed.fastq" + exit + else + echo "preprocess_complete" >> "$basef.checkpoint" + fi fi ############# BEGIN SNAP PIPELINE ################# -freemem=$(free -g | awk '{print $4}' | head -n 2 | tail -1) -echo -e "$(date)\t$scriptname\tThere is $freemem GB available free memory...[cutoff=$cache_reset GB]" -if [[ $dropcache == "Y" ]] -then - if [ "$freemem" -lt "$cache_reset" ] - then - echo -e "$(date)\t$scriptname\tClearing cache..." - dropcache - fi -fi +basef_h=${nopathf%.fastq}.preprocessed.s20.h250n25d${d_human}xfu # remove fastq extension ############# HUMAN MAPPING ################# -if [ "$human_mapping" != "skip" ] +if grep -qs human_mapping_complete "$basef.checkpoint"; then + human_mapping_complete=1 +fi +if [ "$human_mapping" != "skip" ] && [[ "$human_mapping_complete" != 1 ]] then - echo -e "$(date)\t$scriptname\t############### SNAP TO HUMAN ###############" - basef_h=${nopathf%.fastq}.preprocessed.s20.h250n25d${d_human}xfu # remove fastq extension - echo -e "$(date)\t$scriptname\tBase file: $basef_h" - echo -e "$(date)\t$scriptname\tStarting: $basef_h human mapping" + log "############### SNAP TO HUMAN ###############" + log "Base file: $basef_h" + log "Starting: $basef_h human mapping" file_to_subtract="$basef.preprocessed.fastq" subtracted_output_file="$basef_h.human.snap.unmatched.sam" @@ -799,50 +778,43 @@ then for SNAP_subtraction_db in $SNAP_subtraction_folder/*; do SUBTRACTION_COUNTER=$[$SUBTRACTION_COUNTER +1] # check if SNAP db is cached in RAM, use optimal parameters depending on result - SNAP_db_cached=$(vmtouch -m500G -f "$SNAP_subtraction_db" | grep 'Resident Pages' | awk '{print $5}') - if [[ "$SNAP_db_cached" == "100%" ]] - then - echo -e "$(date)\t$scriptname\tSNAP database is cached ($SNAP_db_cached)." 
- SNAP_cache_option=" -map " - else - echo -e "$(date)\t$scriptname\tSNAP database is not cached ($SNAP_db_cached)." - SNAP_cache_option=" -pre -map " - fi - echo -e "$(date)\t$scriptname\tParameters: snap-dev single $SNAP_subtraction_db $file_to_subtract -o -sam $subtracted_output_file.$SUBTRACTION_COUNTER.sam -t $cores -x -f -h 250 -d ${d_human} -n 25 -F u $SNAP_cache_option" + #SNAP_db_cached=$(vmtouch -m500G -f "$SNAP_subtraction_db" | grep 'Resident Pages' | awk '{print $5}') + #if [[ "$SNAP_db_cached" == "100%" ]] + #then + #log "SNAP database is cached ($SNAP_db_cached)." + #SNAP_cache_option=" -map " + #else + #log "SNAP database is not cached ($SNAP_db_cached)." + #SNAP_cache_option=" -pre -map " + #fi + log "Parameters: snap single $SNAP_subtraction_db $file_to_subtract -o -sam $subtracted_output_file.$SUBTRACTION_COUNTER.sam -t $cores -x -f -h 250 -d ${d_human} -n 25 -F u $SNAP_cache_option" START_SUBTRACTION_STEP=$(date +%s) - snap-dev single "$SNAP_subtraction_db" "$file_to_subtract" -o -sam "$subtracted_output_file.$SUBTRACTION_COUNTER.sam" -t $cores -x -f -h 250 -d ${d_human} -n 25 -F u $SNAP_cache_option + snap single "$SNAP_subtraction_db" "$file_to_subtract" -o -sam "$subtracted_output_file.$SUBTRACTION_COUNTER.sam" -t $cores -x -f -h 250 -d ${d_human} -n 25 -F u $SNAP_cache_option END_SUBTRACTION_STEP=$(date +%s) - echo -e "$(date)\t$scriptname\tDone: SNAP to human" + log "Done: SNAP to human" diff_SUBTRACTION_STEP=$(( END_SUBTRACTION_STEP - START_SUBTRACTION_STEP )) - echo -e "$(date)\t$scriptname\tSubtraction step: $SUBTRACTION_COUNTER took $diff_SUBTRACTION_STEP seconds" + log "Subtraction step: $SUBTRACTION_COUNTER took $diff_SUBTRACTION_STEP seconds" file_to_subtract="$subtracted_output_file.$SUBTRACTION_COUNTER.sam" done egrep -v "^@" "$subtracted_output_file.$SUBTRACTION_COUNTER.sam" | awk '{if($3 == "*") print "@"$1"\n"$10"\n""+"$1"\n"$11}' > $(echo "$basef_h".human.snap.unmatched.sam | sed 's/\(.*\)\..*/\1/').fastq END_SUBTRACTION=$(date 
+%s) diff_SUBTRACTION=$(( END_SUBTRACTION - START_SUBTRACTION )) - rm $subtracted_output_file.*.sam - echo -e "$(date)\t$scriptname\tSubtraction took $diff_SUBTRACTION seconds" | tee -a timing.$basef.log -fi -######dropcache?############# -freemem=$(free -g | awk '{print $4}' | head -n 2 | tail -1) -echo -e "$(date)\t$scriptname\tThere is $freemem GB available free memory...[cutoff=$cache_reset GB]" -if [[ $dropcache == "Y" ]] -then - if [ "$freemem" -lt "$cache_reset" ] - then - echo -e "$(date)\t$scriptname\tClearing cache..." - dropcache - fi + # rm $subtracted_output_file.*.sam + log "Subtraction took $diff_SUBTRACTION seconds" | tee -a timing.$basef.log + echo "human_mapping_complete" >> "$basef.checkpoint" fi + ############################# SNAP TO NT ############################## if [ "$alignment" != "skip" ] then - if [ ! -f $basef.NT.snap.sam ] + if [ -f $basef.NT.snap.sam ] # Cache SNAP NT hits. then - echo -e "$(date)\t$scriptname\t####### SNAP UNMATCHED SEQUENCES TO NT ######" - echo -e -n "$(date)\t$scriptname\tCalculating number of sequences to analyze using SNAP to NT: " - echo $(awk 'NR%4==1' "$basef_h".human.snap.unmatched.fastq | wc -l) - echo -e "$(date)\t$scriptname\tStarting: Mapping by SNAP to NT from $basef_h.human.snap.unmatched.fastq" + log "Using cached $basef.NT.snap.sam" + else + log "####### SNAP UNMATCHED SEQUENCES TO NT ######" + num_seq_snap_nt=$(awk 'NR%4==1' "$basef_h".human.snap.unmatched.fastq | wc -l) + log "Calculating number of sequences to analyze using SNAP to NT: $num_seq_snap_nt" + log "Starting: Mapping by SNAP to NT from $basef_h.human.snap.unmatched.fastq" START_SNAPNT=$(date +%s) # SNAP to NT for unmatched reads (d value threshold cutoff = 12) @@ -850,8 +822,8 @@ then then if [ $snap_integrator = "inline" ] then - echo -e "$(date)\t$scriptname\tParameters: snap_nt.sh $basef_h.human.snap.unmatched.fastq ${SNAP_COMPREHENSIVE_db_dir} $cores $cache_reset $d_NT_alignment $snap" - snap_nt.sh 
"$basef_h.human.snap.unmatched.fastq" "${SNAP_COMPREHENSIVE_db_dir}" "$cores" "$cache_reset" "$d_NT_alignment" "$snap" + log "Parameters: snap_nt.sh $basef_h.human.snap.unmatched.fastq ${SNAP_COMPREHENSIVE_db_dir} $cores $d_NT_alignment $snap" + snap_nt.sh "$basef_h.human.snap.unmatched.fastq" "${SNAP_COMPREHENSIVE_db_dir}" "$cores" "$d_NT_alignment" "$snap" elif [ $snap_integrator = "end" ] then if [ "$snap_nt_procedure" = "AWS_master_slave" ] @@ -866,38 +838,40 @@ then sleep 2 done echo - echo -e "$(date)\t$scriptname\tParameters: snap_on_slave.sh $basef_h.human.snap.unmatched.fastq $pemkey $file_with_slave_ips $incoming_dir ${basef}.NT.snap.sam $d_NT_alignment" + log "Parameters: snap_on_slave.sh $basef_h.human.snap.unmatched.fastq $pemkey $file_with_slave_ips $incoming_dir ${basef}.NT.snap.sam $d_NT_alignment" snap_on_slave.sh "$basef_h.human.snap.unmatched.fastq" "$pemkey" "$file_with_slave_ips" "$incoming_dir" "${basef}.NT.snap.sam" "$d_human"> "$basef.AWS.log" 2>&1 elif [ "$snap_nt_procedure" = "solo" ] then - echo -e "$(date)\t$scriptname\tParameters: snap_nt_combine.sh $basef_h.human.snap.unmatched.fastq ${SNAP_COMPREHENSIVE_db_dir} $cores $cache_reset $d_NT_alignment $num_simultaneous_SNAP_runs" - snap_nt_combine.sh "$basef_h.human.snap.unmatched.fastq" "${SNAP_COMPREHENSIVE_db_dir}" "$cores" "$cache_reset" "$d_NT_alignment" "$num_simultaneous_SNAP_runs" + log "Parameters: snap_nt_combine.sh $basef_h.human.snap.unmatched.fastq ${SNAP_COMPREHENSIVE_db_dir} $cores $d_NT_alignment $num_simultaneous_SNAP_runs" + snap_nt_combine.sh "$basef_h.human.snap.unmatched.fastq" "${SNAP_COMPREHENSIVE_db_dir}" "$cores" "$d_NT_alignment" "$num_simultaneous_SNAP_runs" fi fi elif [ $run_mode = "Fast" ] then - echo -e "$(date)\t$scriptname\tParameters: snap_nt.sh $basef_h.human.snap.unmatched.fastq ${SNAP_FAST_db_dir} $cores $cache_reset $d_NT_alignment $snap" - snap_nt.sh "$basef_h.human.snap.unmatched.fastq" "${SNAP_FAST_db_dir}" "$cores" "$cache_reset" "$d_NT_alignment" 
"$snap" + log "Parameters: snap_nt.sh $basef_h.human.snap.unmatched.fastq ${SNAP_FAST_db_dir} $cores $d_NT_alignment $snap" + snap_nt.sh "$basef_h.human.snap.unmatched.fastq" "${SNAP_FAST_db_dir}" "$cores" "$d_NT_alignment" "$snap" fi - echo -e "$(date)\t$scriptname\tDone: SNAP to NT" + log "Done: SNAP to NT" END_SNAPNT=$(date +%s) diff_SNAPNT=$(( END_SNAPNT - START_SNAPNT )) - echo -e "$(date)\t$scriptname\tSNAP to NT took $diff_SNAPNT seconds." | tee -a timing.$basef.log + log "SNAP to NT took $diff_SNAPNT seconds." | tee -a timing.$basef.log mv -f "$basef_h.human.snap.unmatched.NT.sam" "$basef.NT.snap.sam" fi - echo -e "$(date)\t$scriptname\tStarting: parsing $basef.NT.snap.sam" - echo -e "$(date)\t$scriptname\textract matched/unmatched $basef.NT.snap.sam" - egrep -v "^@" $basef.NT.snap.sam | awk '{if($3 != "*") print }' > $basef.NT.snap.matched.sam - egrep -v "^@" $basef.NT.snap.sam | awk '{if($3 == "*") print }' > $basef.NT.snap.unmatched.sam - echo -e "$(date)\t$scriptname\tconvert sam to fastq from $basef.NT.snap.sam" - echo -e "$(date)\t$scriptname\tDone: parsing $basef.NT.snap.unmatched.sam" - if [ ! -f "$basef.NT.snap.matched.all.annotated" ] + + if [ ! 
-f "$basef.NT.snap.matched.fulllength.all.annotated.sorted" ] then + log "Starting: parsing $basef.NT.snap.sam" + log "extract matched/unmatched $basef.NT.snap.sam" + egrep -v "^@" $basef.NT.snap.sam | awk '{if($3 != "*") print }' > $basef.NT.snap.matched.sam + egrep -v "^@" $basef.NT.snap.sam | awk '{if($3 == "*") print }' > $basef.NT.snap.unmatched.sam + + log "convert sam to fastq from $basef.NT.snap.sam" + log "Done: parsing $basef.NT.snap.unmatched.sam" ## convert to FASTQ and retrieve full-length sequences - echo -e "$(date)\t$scriptname\tconvert to FASTQ and retrieve full-length sequences for SNAP NT matched hits" - echo -e "$(date)\t$scriptname\tParameters: extractHeaderFromFastq_ncores.sh $cores $basef.cutadapt.fastq $basef.NT.snap.matched.sam $basef.NT.snap.matched.fulllength.fastq $basef.NT.snap.unmatched.sam $basef.NT.snap.unmatched.fulllength.fastq" + log "convert to FASTQ and retrieve full-length sequences for SNAP NT matched hits" + log "Parameters: extractHeaderFromFastq_ncores.sh $cores $basef.cutadapt.fastq $basef.NT.snap.matched.sam $basef.NT.snap.matched.fulllength.fastq $basef.NT.snap.unmatched.sam $basef.NT.snap.unmatched.fulllength.fastq" extractHeaderFromFastq_ncores.sh "$cores" "$basef.cutadapt.fastq" "$basef.NT.snap.matched.sam" "$basef.NT.snap.matched.fulllength.fastq" "$basef.NT.snap.unmatched.sam" "$basef.NT.snap.unmatched.fulllength.fastq" #SNN140507 sort -k1,1 "$basef.NT.snap.matched.sam" > "$basef.NT.snap.matched.sorted.sam" cut -f1-9 "$basef.NT.snap.matched.sorted.sam" > "$basef.NT.snap.matched.sorted.sam.tmp1" @@ -905,72 +879,77 @@ then awk '(NR%4==1) {printf("%s\t",$0)} (NR%4==2) {printf("%s\t", $0)} (NR%4==0) {printf("%s\n",$0)}' "$basef.NT.snap.matched.fulllength.fastq" | sort -k1,1 | awk '{print $2 "\t" $3}' > "$basef.NT.snap.matched.fulllength.sequence.txt" #SNN140507 change this to bring in quality lines as well paste "$basef.NT.snap.matched.sorted.sam.tmp1" "$basef.NT.snap.matched.fulllength.sequence.txt" 
"$basef.NT.snap.matched.sorted.sam.tmp2" > "$basef.NT.snap.matched.fulllength.sam" ###retrieve taxonomy matched to NT ### - echo -e "$(date)\t$scriptname\ttaxonomy retrieval for $basef.NT.snap.matched.fulllength.sam" - echo -e "$(date)\t$scriptname\tParameters: taxonomy_lookup.pl $basef.NT.snap.matched.fulllength.sam sam nucl $cores $taxonomy_db_directory" + log "taxonomy retrieval for $basef.NT.snap.matched.fulllength.sam" + log "Parameters: taxonomy_lookup.pl $basef.NT.snap.matched.fulllength.sam sam nucl $cores $taxonomy_db_directory" taxonomy_lookup.pl "$basef.NT.snap.matched.fulllength.sam" sam nucl $cores $taxonomy_db_directory sort -k 13.7n "$basef.NT.snap.matched.fulllength.all.annotated" > "$basef.NT.snap.matched.fulllength.all.annotated.sorted" # sam format is no longer disturbed rm -f "$basef.NT.snap.matched.fulllength.gi" "$basef.NT.snap.matched.fullength.gi.taxonomy" fi -# adjust filenames for FAST mode - grep "Viruses;" "$basef.NT.snap.matched.fulllength.all.annotated.sorted" > "$basef.NT.snap.matched.fl.Viruses.annotated" - grep "Bacteria;" "$basef.NT.snap.matched.fulllength.all.annotated.sorted" > "$basef.NT.snap.matched.fl.Bacteria.annotated" - - ##SNN140507 cleanup bacterial reads - echo -e "$(date)\t$scriptname\tParameters: ribo_snap_bac_euk.sh $basef.NT.snap.matched.fl.Bacteria.annotated BAC $cores $ribo_snap_bac_euk_directory" - ribo_snap_bac_euk.sh $basef.NT.snap.matched.fl.Bacteria.annotated BAC $cores $ribo_snap_bac_euk_directory #SNN140507 - if [ $run_mode = "Comprehensive" ] - then - grep "Primates;" "$basef.NT.snap.matched.fulllength.all.annotated.sorted" > "$basef.NT.snap.matched.fl.Primates.annotated" - grep -v "Primates" "$basef.NT.snap.matched.fulllength.all.annotated.sorted" | grep "Mammalia" > "$basef.NT.snap.matched.fl.nonPrimMammal.annotated" - grep -v "Mammalia" "$basef.NT.snap.matched.fulllength.all.annotated.sorted" | grep "Chordata" > "$basef.NT.snap.matched.fl.nonMammalChordat.annotated" - grep -v "Chordata" 
"$basef.NT.snap.matched.fulllength.all.annotated.sorted" | grep "Eukaryota" > "$basef.NT.snap.matched.fl.nonChordatEuk.annotated" - ribo_snap_bac_euk.sh $basef.NT.snap.matched.fl.nonChordatEuk.annotated EUK $cores $ribo_snap_bac_euk_directory - fi - echo -e "$(date)\t$scriptname\tDone taxonomy retrieval" - echo -e "$(date)\t$scriptname\tParameters: table_generator.sh $basef.NT.snap.matched.fl.Viruses.annotated SNAP Y Y Y Y>& $basef.table_generator_snap.matched.fl.log" - table_generator.sh "$basef.NT.snap.matched.fl.Viruses.annotated" SNAP Y Y Y Y>& "$basef.table_generator_snap.matched.fl.log" - if [ $run_mode = "Comprehensive" ] - then - ### convert to FASTQ and retrieve full-length sequences to add to unmatched SNAP for viral RAPSearch### - egrep -v "^@" "$basef.NT.snap.matched.fl.Viruses.annotated" | awk '{if($3 != "*") print "@"$1"\n"$10"\n""+"$1"\n"$11}' > $(echo "$basef.NT.snap.matched.fl.Viruses.annotated" | sed 's/\(.*\)\..*/\1/').fastq - echo -e "$(date)\t$scriptname\tDone: convert to FASTQ and retrieve full-length sequences for SNAP NT hits " - fi - echo -e "$(date)\t$scriptname\t############# SORTING unmatched to NT BY LENGTH AND UNIQ AND LOOKUP ORIGINAL SEQUENCES #################" - if [ $run_mode = "Comprehensive" ] - then - #SNN 140507 extractHeaderFromFastq.csh "$basef.NT.snap.unmatched.fastq" FASTQ "$basef.cutadapt.fastq" "$basef.NT.snap.unmatched.fulllength.fastq" - sed "n;n;n;d" "$basef.NT.snap.unmatched.fulllength.fastq" | sed "n;n;d" | sed "s/^@/>/g" > "$basef.NT.snap.unmatched.fulllength.fasta" - fi - cat "$basef.NT.snap.unmatched.fulllength.fasta" | perl -e 'while (<>) {$h=$_; $s=<>; $seqs{$h}=$s;} foreach $header (reverse sort {length($seqs{$a}) <=> length($seqs{$b})} keys %seqs) {print $header.$seqs{$header}}' > $basef.NT.snap.unmatched.fulllength.sorted.fasta - if [ $run_mode = "Comprehensive" ] - then - echo -e "$(date)\t$scriptname\twe will be using 50 as the length of the cropped read for removing unique and low-complexity reads" - echo 
-e "$(date)\t$scriptname\tParameters: crop_reads.csh $basef.NT.snap.unmatched.fulllength.sorted.fasta 25 50 > $basef.NT.snap.unmatched.fulllength.sorted.cropped.fasta" - crop_reads.csh "$basef.NT.snap.unmatched.fulllength.sorted.fasta" 25 50 > "$basef.NT.snap.unmatched.fulllength.sorted.cropped.fasta" - echo -e "$(date)\t$scriptname\t*** reads cropped ***" - echo -e "$(date)\t$scriptname\tParameters: gt sequniq -seqit -force -o $basef.NT.snap.unmatched.fulllength.sorted.cropped.uniq.fasta $basef.NT.snap.unmatched.fulllength.sorted.cropped.fasta" - gt sequniq -seqit -force -o "$basef.NT.snap.unmatched.fulllength.sorted.cropped.uniq.fasta" "$basef.NT.snap.unmatched.fulllength.sorted.cropped.fasta" - echo -e "$(date)\t$scriptname\tParameters: extractAlltoFast.sh $basef.NT.snap.unmatched.fulllength.sorted.cropped.uniq.fasta FASTA $basef.NT.snap.unmatched.fulllength.fasta FASTA $basef.NT.snap.unmatched.uniq.fl.fasta FASTA" - extractAlltoFast.sh "$basef.NT.snap.unmatched.fulllength.sorted.cropped.uniq.fasta" FASTA "$basef.NT.snap.unmatched.fulllength.fasta" FASTA "$basef.NT.snap.unmatched.uniq.fl.fasta" FASTA #SNN140507 - fi - echo -e "$(date)\t$scriptname\tDone uniquing full length sequences of unmatched to NT" + + if [ ! 
-f "$basef.NT.snap.unmatched.uniq.fl.fasta" ] + then + # adjust filenames for FAST mode + grep "Viruses;" "$basef.NT.snap.matched.fulllength.all.annotated.sorted" > "$basef.NT.snap.matched.fl.Viruses.annotated" + grep "Bacteria;" "$basef.NT.snap.matched.fulllength.all.annotated.sorted" > "$basef.NT.snap.matched.fl.Bacteria.annotated" + + ##SNN140507 cleanup bacterial reads + log "Parameters: ribo_snap_bac_euk.sh $basef.NT.snap.matched.fl.Bacteria.annotated BAC $cores $ribo_snap_bac_euk_directory" + ribo_snap_bac_euk.sh $basef.NT.snap.matched.fl.Bacteria.annotated BAC $cores $ribo_snap_bac_euk_directory #SNN140507 + if [ $run_mode = "Comprehensive" ] + then + grep "Primates;" "$basef.NT.snap.matched.fulllength.all.annotated.sorted" > "$basef.NT.snap.matched.fl.Primates.annotated" + grep -v "Primates" "$basef.NT.snap.matched.fulllength.all.annotated.sorted" | grep "Mammalia" > "$basef.NT.snap.matched.fl.nonPrimMammal.annotated" + grep -v "Mammalia" "$basef.NT.snap.matched.fulllength.all.annotated.sorted" | grep "Chordata" > "$basef.NT.snap.matched.fl.nonMammalChordat.annotated" + grep -v "Chordata" "$basef.NT.snap.matched.fulllength.all.annotated.sorted" | grep "Eukaryota" > "$basef.NT.snap.matched.fl.nonChordatEuk.annotated" + ribo_snap_bac_euk.sh $basef.NT.snap.matched.fl.nonChordatEuk.annotated EUK $cores $ribo_snap_bac_euk_directory + fi + log "Done taxonomy retrieval" + log "Parameters: table_generator.sh $basef.NT.snap.matched.fl.Viruses.annotated SNAP Y Y Y Y>& $basef.table_generator_snap.matched.fl.log" + table_generator.sh "$basef.NT.snap.matched.fl.Viruses.annotated" SNAP Y Y Y Y>& "$basef.table_generator_snap.matched.fl.log" + if [ $run_mode = "Comprehensive" ] + then + ### convert to FASTQ and retrieve full-length sequences to add to unmatched SNAP for viral RAPSearch### + egrep -v "^@" "$basef.NT.snap.matched.fl.Viruses.annotated" | awk '{if($3 != "*") print "@"$1"\n"$10"\n""+"$1"\n"$11}' > $(echo "$basef.NT.snap.matched.fl.Viruses.annotated" | sed 
's/\(.*\)\..*/\1/').fastq + log "Done: convert to FASTQ and retrieve full-length sequences for SNAP NT hits " + fi + log "############# SORTING unmatched to NT BY LENGTH AND UNIQ AND LOOKUP ORIGINAL SEQUENCES #################" + if [ $run_mode = "Comprehensive" ] + then + #SNN 140507 extractHeaderFromFastq.csh "$basef.NT.snap.unmatched.fastq" FASTQ "$basef.cutadapt.fastq" "$basef.NT.snap.unmatched.fulllength.fastq" + sed "n;n;n;d" "$basef.NT.snap.unmatched.fulllength.fastq" | sed "n;n;d" | sed "s/^@/>/g" > "$basef.NT.snap.unmatched.fulllength.fasta" + fi + cat "$basef.NT.snap.unmatched.fulllength.fasta" | perl -e 'while (<>) {$h=$_; $s=<>; $seqs{$h}=$s;} foreach $header (reverse sort {length($seqs{$a}) <=> length($seqs{$b})} keys %seqs) {print $header.$seqs{$header}}' > $basef.NT.snap.unmatched.fulllength.sorted.fasta + if [ $run_mode = "Comprehensive" ] + then + log "we will be using 50 as the length of the cropped read for removing unique and low-complexity reads" + log "Parameters: crop_reads.csh $basef.NT.snap.unmatched.fulllength.sorted.fasta 25 50 > $basef.NT.snap.unmatched.fulllength.sorted.cropped.fasta" + crop_reads.csh "$basef.NT.snap.unmatched.fulllength.sorted.fasta" 25 50 > "$basef.NT.snap.unmatched.fulllength.sorted.cropped.fasta" + log "*** reads cropped ***" + log "Parameters: gt sequniq -seqit -force -o $basef.NT.snap.unmatched.fulllength.sorted.cropped.uniq.fasta $basef.NT.snap.unmatched.fulllength.sorted.cropped.fasta" + gt sequniq -seqit -force -o "$basef.NT.snap.unmatched.fulllength.sorted.cropped.uniq.fasta" "$basef.NT.snap.unmatched.fulllength.sorted.cropped.fasta" + log "Parameters: extractAlltoFast.sh $basef.NT.snap.unmatched.fulllength.sorted.cropped.uniq.fasta FASTA $basef.NT.snap.unmatched.fulllength.fasta FASTA $basef.NT.snap.unmatched.uniq.fl.fasta FASTA" + extractAlltoFast.sh "$basef.NT.snap.unmatched.fulllength.sorted.cropped.uniq.fasta" FASTA "$basef.NT.snap.unmatched.fulllength.fasta" FASTA "$basef.NT.snap.unmatched.uniq.fl.fasta" 
FASTA #SNN140507 + fi + log "Done uniquing full length sequences of unmatched to NT" + fi fi + ####################### DENOVO CONTIG ASSEMBLY ##### if [ $run_mode = "Comprehensive" ] then - echo -e "$(date)\t$scriptname\t######### Running ABYSS and Minimus #########" + log "######### Running ABYSS and Minimus #########" START_deNovo=$(date +%s) - echo -e "$(date)\t$scriptname\tAdding matched viruses to NT unmatched" + log "Adding matched viruses to NT unmatched" sed "n;n;n;d" "$basef.NT.snap.matched.fl.Viruses.fastq" | sed "n;n;d" | sed "s/^@/>/g" | sed 's/>/>Vir/g' > "$basef.NT.snap.matched.fl.Viruses.fasta" gt sequniq -seqit -force -o "$basef.NT.snap.matched.fl.Viruses.uniq.fasta" "$basef.NT.snap.matched.fl.Viruses.fasta" cat "$basef.NT.snap.unmatched.uniq.fl.fasta" "$basef.NT.snap.matched.fl.Viruses.uniq.fasta" > "$basef.NT.snap.unmatched_addVir_uniq.fasta" - echo -e "$(date)\t$scriptname\tStarting deNovo assembly" - echo -e "$(date)\t$scriptname\tParameters: abyss_minimus.sh $basef.NT.snap.unmatched_addVir_uniq.fasta $length $contigcutoff $cores $abysskmer $ignore_barcodes_for_de_novo" + log "Starting deNovo assembly" + log "Parameters: abyss_minimus.sh $basef.NT.snap.unmatched_addVir_uniq.fasta $length $contigcutoff $cores $abysskmer $ignore_barcodes_for_de_novo" abyss_minimus.sh "$basef.NT.snap.unmatched_addVir_uniq.fasta" "$length" "$contigcutoff" "$cores" "$abysskmer" "$ignore_barcodes_for_de_novo" - echo -e "$(date)\t$scriptname\tCompleted deNovo assembly: generated all.$basef.NT.snap.unmatched_addVir_uniq.fasta.unitigs.cut${length}.${contigcutoff}-mini.fa" + log "Completed deNovo assembly: generated all.$basef.NT.snap.unmatched_addVir_uniq.fasta.unitigs.cut${length}.${contigcutoff}-mini.fa" END_deNovo=$(date +%s) diff_deNovo=$(( END_deNovo - START_deNovo )) - echo -e "$(date)\t$scriptname\tdeNovo Assembly took $diff_deNovo seconds." | tee -a timing.$basef.log + log "deNovo Assembly took $diff_deNovo seconds." 
| tee -a timing.$basef.log fi #######RAPSearch##### #################### RAPSearch to Vir ########### @@ -980,59 +959,55 @@ then then if [ -f "$basef.NT.snap.unmatched.uniq.fl.fasta" ] then - echo -e "$(date)\t$scriptname\t############# RAPSearch to ${RAPSearch_VIRUS_db} ON NT-UNMATCHED SEQUENCES #################" - if [[ $dropcache == "Y" ]] - then - dropcache - fi - echo -e "$(date)\t$scriptname\tStarting: RAPSearch $basef.NT.snap.unmatched.uniq.fl.fasta " + log "############# RAPSearch to ${RAPSearch_VIRUS_db} ON NT-UNMATCHED SEQUENCES #################" + log "Starting: RAPSearch $basef.NT.snap.unmatched.uniq.fl.fasta " START14=$(date +%s) - echo -e "$(date)\t$scriptname\tParameters: rapsearch -q $basef.NT.snap.unmatched.uniq.fl.fasta -d $RAPSearch_VIRUS_db -o $basef.$rapsearch_database.RAPSearch.e1 -z $cores -e $ecutoff_Vir -v 1 -b 1 -t N >& $basef.$rapsearch_database.RAPSearch.log" + log "Parameters: rapsearch -q $basef.NT.snap.unmatched.uniq.fl.fasta -d $RAPSearch_VIRUS_db -o $basef.$rapsearch_database.RAPSearch.e1 -z $cores -e $ecutoff_Vir -v 1 -b 1 -t N >& $basef.$rapsearch_database.RAPSearch.log" rapsearch -q "$basef.NT.snap.unmatched.uniq.fl.fasta" -d $RAPSearch_VIRUS_db -o $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir} -z "$cores" -e "$ecutoff_Vir" -v 1 -b 1 -t N >& $basef.$rapsearch_database.RAPSearch.log - echo -e "$(date)\t$scriptname\tDone RAPSearch" + log "Done RAPSearch" END14=$(date +%s) diff=$(( END14 - START14 )) - echo -e "$(date)\t$scriptname\tRAPSearch to Vir Took $diff seconds" - echo -e "$(date)\t$scriptname\tStarting: add FASTA sequences to RAPSearch m8 output file " + log "RAPSearch to Vir Took $diff seconds" + log "Starting: add FASTA sequences to RAPSearch m8 output file " START15=$(date +%s) sed -i '/^#/d' $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8 seqtk subseq $basef.NT.snap.unmatched.uniq.fl.fasta $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8 > 
$basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta sed '/>/d' $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta > $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta.seq paste $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8 $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta.seq > $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.addseq.m8 - echo -e "$(date)\t$scriptname\tParameters: taxonomy_lookup.pl $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.addseq.m8 blast prot $cores $taxonomy_db_directory" + log "Parameters: taxonomy_lookup.pl $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.addseq.m8 blast prot $cores $taxonomy_db_directory" taxonomy_lookup.pl $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.addseq.m8 blast prot $cores $taxonomy_db_directory mv $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.addseq.all.annotated $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.annotated - echo -e "$(date)\t$scriptname\tParameters: table_generator.sh $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.annotated RAP N Y N N" + log "Parameters: table_generator.sh $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.annotated RAP N Y N N" table_generator.sh $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.annotated RAP N Y N N - echo -e "$(date)\t$scriptname\tDone: converting RAPSearch Vir output to fasta" + log "Done: converting RAPSearch Vir output to fasta" END15=$(date +%s) diff=$(( END15 - START15 )) - echo -e "$(date)\t$scriptname\tConverting RAPSearch Vir output to fasta sequences took $diff seconds." | tee -a timing.$basef.log + log "Converting RAPSearch Vir output to fasta sequences took $diff seconds." 
| tee -a timing.$basef.log cat "$basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta" "all.$basef.NT.snap.unmatched_addVir_uniq.fasta.unitigs.cut${length}.${contigcutoff}-mini.fa" > "$basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta" else - echo -e "$(date)\t$scriptname\tCannot run viral RAPSearch - necessary input file ($basef.$rapsearch_database.RAPSearch.e$ecutoff_Vir.m8) does not exist" - echo -e "$(date)\t$scriptname\tconcatenating RAPSearchvirus output and Contigs" + log "Cannot run viral RAPSearch - necessary input file ($basef.$rapsearch_database.RAPSearch.e$ecutoff_Vir.m8) does not exist" + log "concatenating RAPSearchvirus output and Contigs" fi - echo -e "$(date)\t$scriptname\t############# Cleanup RAPSearch Vir by RAPSearch to NR #################" + log "############# Cleanup RAPSearch Vir by RAPSearch to NR #################" if [ -f $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta ] then - echo -e "$(date)\t$scriptname\tStarting: RAPSearch to $RAPSearch_NR_db of $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta :" + log "Starting: RAPSearch to $RAPSearch_NR_db of $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta :" START16=$(date +%s) - echo -e "$(date)\t$scriptname\tParameters: rapsearch -q $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta -d $RAPSearch_NR_db -o $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR} -z $cores -e $ecutoff_NR -v 1 -b 1 -t N -a T" + log "Parameters: rapsearch -q $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta -d $RAPSearch_NR_db -o $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR} -z $cores -e $ecutoff_NR -v 1 -b 1 -t N -a T" rapsearch -q 
$basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta -d $RAPSearch_NR_db -o $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR} -z $cores -e $ecutoff_NR -v 1 -b 1 -t N -a T - echo -e "$(date)\t$scriptname\trapsearch to nr done" + log "rapsearch to nr done" sed -i '/^#/d' $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.m8 - echo -e "$(date)\t$scriptname\tremoved extra #" + log "removed extra #" END16=$(date +%s) diff=$(( END16 - START16 )) - echo -e "$(date)\t$scriptname\tRAPSearch to NR took $diff seconds." | tee -a timing.$basef.log - echo -e "$(date)\t$scriptname\tStarting: Seq retrieval and Taxonomy $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}" + log "RAPSearch to NR took $diff seconds." | tee -a timing.$basef.log + log "Starting: Seq retrieval and Taxonomy $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}" START17=$(date +%s) seqtk subseq $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta \ $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.m8 > \ $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.m8.fasta - echo -e "$(date)\t$scriptname\tretrieved sequences" + log "retrieved sequences" cat $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.m8.fasta | \ awk '{if (substr($0,1,1)==">"){if (p){print "\n";} print $0} else printf("%s",$0);p++;}END{print "\n"}' | \ sed '/^$/d' | \ @@ -1040,25 +1015,25 @@ then paste $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e$ecutoff_Vir.NR.e${ecutoff_NR}.m8 \ $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.m8.fasta.seq > \ 
$basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.addseq.m8 - echo -e "$(date)\t$scriptname\tmade addseq file" - echo -e "$(date)\t$scriptname\t############# RAPSearch Taxonomy" - echo -e "$(date)\t$scriptname\tParameters: taxonomy_lookup.pl $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.addseq.m8 blast prot $cores $taxonomy_db_directory" + log "made addseq file" + log "############# RAPSearch Taxonomy" + log "Parameters: taxonomy_lookup.pl $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.addseq.m8 blast prot $cores $taxonomy_db_directory" taxonomy_lookup.pl $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.addseq.m8 blast prot $cores $taxonomy_db_directory cp $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.addseq.all.annotated $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.annotated - echo -e "$(date)\t$scriptname\tretrieved taxonomy" + log "retrieved taxonomy" grep "Viruses" $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.annotated > $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.Viruses.annotated egrep "^contig" $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.annotated > $basef.Contigs.NR.RAPSearch.e${ecutoff_NR}.annotated - echo -e "$(date)\t$scriptname\textracted RAPSearch taxonomy" - echo -e "$(date)\t$scriptname\tStarting Readcount table" - echo -e "$(date)\t$scriptname\tParameters: table_generator.sh $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.Viruses.annotated RAP Y Y Y Y" + log "extracted RAPSearch taxonomy" + log "Starting Readcount table" + log "Parameters: table_generator.sh 
$basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.Viruses.annotated RAP Y Y Y Y" table_generator.sh $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.Viruses.annotated RAP Y Y Y Y - echo -e "$(date)\t$scriptname\tParameters: table_generator.sh $basef.Contigs.NR.RAPSearch.e${ecutoff_NR}.annotated RAP Y Y Y Y" + log "Parameters: table_generator.sh $basef.Contigs.NR.RAPSearch.e${ecutoff_NR}.annotated RAP Y Y Y Y" table_generator.sh $basef.Contigs.NR.RAPSearch.e${ecutoff_NR}.annotated RAP Y Y Y Y #allow contigs to be incorporated into coverage maps by making contig barcodes the same as non-contig barcodes (removing the @) sed 's/@//g' $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.Viruses.annotated > $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.Viruses.annotated.bar.inc - echo -e "$(date)\t$scriptname\tmaking coverage maps" + log "making coverage maps" # coverage_generator_bp.sh (divides each fasta file into $cores cores then runs BLASTn using one core each. 
- echo -e "$(date)\t$scriptname\tParameters: coverage_generator_bp.sh $basef.NT.snap.matched.fl.Viruses.annotated $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.Viruses.annotated.bar.inc $eBLASTn $cores 10 1 $basef" + log "Parameters: coverage_generator_bp.sh $basef.NT.snap.matched.fl.Viruses.annotated $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.Viruses.annotated.bar.inc $eBLASTn $cores 10 1 $basef" coverage_generator_bp.sh $basef.NT.snap.matched.fl.Viruses.annotated $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.Viruses.annotated.bar.inc $eBLASTn $cores 10 1 $basef awk '{print$1}' $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.annotated > $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.annotated.header @@ -1078,38 +1053,38 @@ then cat $basef.not.in.NR.[a-z][a-z][a-z][a-z][a-z][a-z].annotated > $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.annotated.not.in.NR.annotated rm -r $basef.not.in.NR.[a-z][a-z][a-z][a-z][a-z][a-z] rm -r $basef.not.in.NR.[a-z][a-z][a-z][a-z][a-z][a-z].annotated - echo -e "$(date)\t$scriptname\tParameters: table_generator.sh $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.annotated.not.in.NR.annotated RAP N Y N N" + log "Parameters: table_generator.sh $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.annotated.not.in.NR.annotated RAP N Y N N" table_generator.sh $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.annotated.not.in.NR.annotated RAP N Y N N END17=$(date +%s) diff=$(( END17 - START17 )) - echo -e "$(date)\t$scriptname\tRAPSearch seq retrieval, taxonomy and readcount took $diff seconds" | tee -a timing.$basef.log + log "RAPSearch seq retrieval, taxonomy and readcount took $diff seconds" | tee -a timing.$basef.log else - echo -e "$(date)\t$scriptname\tCannot run RAPSearch to NR - necessary input file 
($basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta) does not exist" + log "Cannot run RAPSearch to NR - necessary input file ($basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta) does not exist" fi fi ##################RAPSearch to NR ####### if [ "$rapsearch_database" == "NR" ] then - echo -e "$(date)\t$scriptname\t#################### RAPSearch to NR ###########" + log "#################### RAPSearch to NR ###########" cat "$basef.NT.snap.unmatched.uniq.fl.fasta" "all.$basef.NT.snap.unmatched_addVir_uniq.fasta.unitigs.cut${length}.${contigcutoff}-mini.fa" > "$basef.Contigs.NT.snap.unmatched.uniq.fl.fasta" if [ -f $basef.Contigs.NT.snap.unmatched.uniq.fl.fasta ] then - echo -e "$(date)\t$scriptname\t############# RAPSearch to NR #################" - echo -e "$(date)\t$scriptname\tStarting: RAPSearch to NR $basef.Contigs.NT.snap.unmatched.uniq.fl.fasta" + log "############# RAPSearch to NR #################" + log "Starting: RAPSearch to NR $basef.Contigs.NT.snap.unmatched.uniq.fl.fasta" START16=$(date +%s) - echo -e "$(date)\t$scriptname\tParameters:rapsearch -q $basef.Contigs.NT.snap.unmatched.uniq.fl.fasta -d $RAPSearch_NR_db -o $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR} -z $cores -e $ecutoff_NR -v 1 -b 1 -t N -a $RAPSearch_NR_fast_mode" + log "Parameters:rapsearch -q $basef.Contigs.NT.snap.unmatched.uniq.fl.fasta -d $RAPSearch_NR_db -o $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR} -z $cores -e $ecutoff_NR -v 1 -b 1 -t N -a $RAPSearch_NR_fast_mode" rapsearch -q "$basef.Contigs.NT.snap.unmatched.uniq.fl.fasta" -d "$RAPSearch_NR_db" -o "$basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}" -z $cores -e $ecutoff_NR -v 1 -b 1 -t N -a $RAPSearch_NR_fast_mode - echo -e "$(date)\t$scriptname\tRAPSearch to NR done" + log "RAPSearch to NR done" sed -i '/^#/d' 
$basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.m8 - echo -e "$(date)\t$scriptname\tremoved extra #" + log "removed extra #" END16=$(date +%s) diff=$(( END16 - START16 )) - echo -e "$(date)\t$scriptname\tRAPSearch to NR took $diff seconds" | tee -a timing.$basef.log - echo -e "$(date)\t$scriptname\tStarting: Seq retrieval and Taxonomy $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}" + log "RAPSearch to NR took $diff seconds" | tee -a timing.$basef.log + log "Starting: Seq retrieval and Taxonomy $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}" START17=$(date +%s) seqtk subseq $basef.Contigs.NT.snap.unmatched.uniq.fl.fasta $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.m8 > $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.m8.fasta - echo -e "$(date)\t$scriptname\tretrieved sequences" + log "retrieved sequences" cat $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.m8.fasta | \ awk '{if (substr($0,1,1)==">"){if (p){print "\n";} print $0} else printf("%s",$0);p++;}END{print "\n"}' | \ sed '/^$/d' | \ @@ -1119,40 +1094,36 @@ then paste $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.m8 \ $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.m8.fasta.seq > \ $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.addseq.m8 - echo -e "$(date)\t$scriptname\tmade addseq file" - echo -e "$(date)\t$scriptname\t############# RAPSearch Taxonomy" - echo -e "$(date)\t$scriptname\tParameters: taxonomy_lookup.pl $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.addseq.m8 blast prot $cores $taxonomy_db_directory" + log "made addseq file" + log "############# RAPSearch Taxonomy" + log "Parameters: taxonomy_lookup.pl $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.addseq.m8 blast prot $cores 
$taxonomy_db_directory" taxonomy_lookup.pl $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.addseq.m8 blast prot $cores $taxonomy_db_directory cp $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.addseq.all.annotated $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.annotated - echo -e "$(date)\t$scriptname\tretrieved taxonomy" + log "retrieved taxonomy" grep "Viruses" $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.annotated > $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.Viruses.annotated egrep "^contig" $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.annotated > $basef.Contigs.$rapsearch_database.RAPSearch.e${ecutoff_NR}.annotated - echo -e "$(date)\t$scriptname\textracted RAPSearch taxonomy" - echo -e "$(date)\t$scriptname\tStarting Readcount table" - echo -e "$(date)\t$scriptname\tParameters: table_generator.sh $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.Viruses.annotated RAP Y Y Y Y" + log "extracted RAPSearch taxonomy" + log "Starting Readcount table" + log "Parameters: table_generator.sh $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.Viruses.annotated RAP Y Y Y Y" table_generator.sh $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.Viruses.annotated RAP Y Y Y Y grep -v Viruses $basef.Contigs.$rapsearch_database.RAPSearch.e${ecutoff_NR}.annotated > $basef.Contigs.$rapsearch_database.RAPSearch.e${ecutoff_NR}.noVir.annotated - echo -e "$(date)\t$scriptname\tParameters: table_generator.sh $basef.Contigs.$rapsearch_database.RAPSearch.e${ecutoff_NR}.noVir.annotated RAP N Y N N" + log "Parameters: table_generator.sh $basef.Contigs.$rapsearch_database.RAPSearch.e${ecutoff_NR}.noVir.annotated RAP N Y N N" table_generator.sh $basef.Contigs.$rapsearch_database.RAPSearch.e${ecutoff_NR}.noVir.annotated RAP N Y N N sed 
's/@//g' $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.Viruses.annotated > $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.Viruses.annotated.bar.inc - echo -e "$(date)\t$scriptname\tmaking coverage maps" - echo -e "$(date)\t$scriptname\tParameters: coverage_generator_bp.sh $basef.NT.snap.matched.fl.Viruses.annotated $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.Viruses.annotated.bar.inc $eBLASTn 10 10 1 $basef" + log "making coverage maps" + log "Parameters: coverage_generator_bp.sh $basef.NT.snap.matched.fl.Viruses.annotated $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.Viruses.annotated.bar.inc $eBLASTn 10 10 1 $basef" coverage_generator_bp.sh $basef.NT.snap.matched.fl.Viruses.annotated $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.Viruses.annotated.bar.inc $eBLASTn 10 10 1 $basef END17=$(date +%s) diff=$(( END17 - START17 )) - echo -e "$(date)\t$scriptname\tRAPSearch seq retrieval, taxonomy and table readcount and coverage took $diff seconds." | tee -a timing.$basef.log + log "RAPSearch seq retrieval, taxonomy and table readcount and coverage took $diff seconds." 
| tee -a timing.$basef.log else - echo -e "$(date)\t$scriptname\tCannot run RAPSearch to NR - necessary input file ($basef.Contigs.NT.snap.unmatched.uniq.fl.fasta) does not exist" + log "Cannot run RAPSearch to NR - necessary input file ($basef.Contigs.NT.snap.unmatched.uniq.fl.fasta) does not exist" fi fi - if [[ $dropcache == "Y" ]] - then - dropcache - fi fi ############################# OUTPUT FINAL COUNTS ############################# -echo -e "$(date)\t$scriptname\tStarting: generating readcounts.$basef.log report" +log "Starting: generating readcounts.$basef.log report" START_READCOUNT=$(date +%s) headerid_top=$(head -1 $basef.fastq | cut -c1-4) @@ -1163,8 +1134,8 @@ if [ "$headerid_top" == "$headerid_bottom" ] # We should adjust this code to check that all headers are unique, rather than just the first and last then headerid=$(head -1 $basef.fastq | cut -c1-4 | sed 's/@//g') - echo -e "$(date)\t$scriptname\theaderid_top $headerid_top = headerid_bottom $headerid_bottom and headerid = $headerid" - echo -e "$(date)\t$scriptname\tParameters: readcount.sh $basef $headerid Y $basef.fastq $basef.preprocessed.fastq $basef.preprocessed.s20.h250n25d12xfu.human.snap.unmatched.fastq $basef.NT.snap.matched.fulllength.all.annotated.sorted $basef.NT.snap.matched.fl.Viruses.annotated $basef.NT.snap.matched.fl.Bacteria.annotated $basef.NT.snap.matched.fl.nonChordatEuk.annotated $basef.NT.snap.unmatched.sam $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.Viruses.annotated" + log "headerid_top $headerid_top = headerid_bottom $headerid_bottom and headerid = $headerid" + log "Parameters: readcount.sh $basef $headerid Y $basef.fastq $basef.preprocessed.fastq $basef.preprocessed.s20.h250n25d12xfu.human.snap.unmatched.fastq $basef.NT.snap.matched.fulllength.all.annotated.sorted $basef.NT.snap.matched.fl.Viruses.annotated $basef.NT.snap.matched.fl.Bacteria.annotated $basef.NT.snap.matched.fl.nonChordatEuk.annotated 
$basef.NT.snap.unmatched.sam $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.Viruses.annotated" readcount.sh $basef $headerid Y $basef.fastq \ $basef.preprocessed.fastq \ $basef.preprocessed.s20.h250n25d12xfu.human.snap.unmatched.fastq \ @@ -1174,18 +1145,18 @@ then $basef.NT.snap.matched.fl.nonChordatEuk.annotated \ $basef.NT.snap.unmatched.sam \ $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.Viruses.annotated - echo -e "$(date)\t$scriptname\tDone: generating readcounts.$basef.log report" + log "Done: generating readcounts.$basef.log report" END_READCOUNT=$(date +%s) diff_READCOUNT=$(( END_READCOUNT - START_READCOUNT )) - echo -e "$(date)\t$scriptname\tGenerating read count report Took $diff_READCOUNT seconds" | tee -a timing.$basef.log + log "Generating read count report Took $diff_READCOUNT seconds" | tee -a timing.$basef.log else - echo -e "$(date)\t$scriptname\treadcount.sh aborted due to non-unique header id" + log "readcount.sh aborted due to non-unique header id" fi -echo -e "$(date)\t$scriptname\t#################### SURPI PIPELINE COMPLETE ##################" +log "#################### SURPI PIPELINE COMPLETE ##################" END_PIPELINE=$(date +%s) diff_PIPELINE=$(( END_PIPELINE - START_PIPELINE )) -echo -e "$(date)\t$scriptname\tTotal run time of pipeline took $diff_PIPELINE seconds" | tee -a timing.$basef.log +log "Total run time of pipeline took $diff_PIPELINE seconds" | tee -a timing.$basef.log echo "Script and Parameters = $0 $@ " > $basef.pipeline_parameters.log echo "Raw Read quality = $quality" >> $basef.pipeline_parameters.log