chiulab · yesimon · Apr 16, 2015 · Apr 16, 2015 · Apr 16, 2015 · Apr 16, 2015
diff --git a/License.txt b/License.txt
diff --git a/Readme.txt b/Readme.txt
diff --git a/SURPI.sh b/SURPI.sh
diff --git a/abyss_minimus.sh b/abyss_minimus.sh
@@ -15,8 +15,9 @@
 # Copyright (C) 2014 Samia N Naccache - All Rights Reserved
 # SURPI has been released under a modified BSD license.
 # Please see license file for details.
-
 scriptname=${0##*/}
+source debug.sh
+source logging.sh
 
 if [ $# -lt 5 ]
 then
@@ -34,7 +35,7 @@ ignore_barcodes=$6
 split_FASTA_size=100000
 
 #### create list of barcodes in the fasta file in the following format: #barcode
-echo -e "$(date)\t$scriptname\tDemultiplexing barcodes..."
+log "Demultiplexing barcodes..."
 START_DEMULTIPLEX=$(date +%s)
 
 #The sed '/^#$/d' removes a blank #. This is needed in cases where barcodes are present along with reads that have no barcodes
@@ -54,10 +55,10 @@ else #$inputfile.barcodes doesn't exist, or size=0, then we only have 1 barcode,
 	echo "#" > $inputfile.barcodes
 fi
 
-echo -e "$(date)\t$scriptname\tCompleted demultiplexing barcodes."
+log "Completed demultiplexing barcodes."
 END_DEMULTIPLEX=$(date +%s)
 diff_DEMULTIPLEX=$(( END_DEMULTIPLEX - START_DEMULTIPLEX ))
-echo -e "$(date)\t$scriptname\tBarcode demultiplex took $diff_DEMULTIPLEX s."
+log "Barcode demultiplex took $diff_DEMULTIPLEX s."
 ### generate fasta file for every separate barcode (demultiplex)
 
 for f in `cat $inputfile.barcodes` ; do
@@ -68,78 +69,82 @@ for f in `cat $inputfile.barcodes` ; do
 		grep -E "$f(/|$)" $inputfile -A 1 --no-group-separator > bar$f.$inputfile
 	fi
 	# split demultiplexed fasta file into 100,000 read sub-fastas
-	echo -e "$(date)\t$scriptname\tSplitting FASTA into chunks of size: $split_FASTA_size."
+	log "Splitting FASTA into chunks of size: $split_FASTA_size."
 	START_SPLIT=$(date +%s)
 
 	cp bar$f.$inputfile bar$f.${inputfile}_n # So that the unsplit demultiplexed file is also denovo assembled #
 	split_fasta.pl -i bar$f.$inputfile -o bar$f.$inputfile -n $split_FASTA_size
-	echo -e "$(date)\t$scriptname\tCompleted splitting FASTA file."
+	log "Completed splitting FASTA file."
 	END_SPLIT=$(date +%s)
 	diff_SPLIT=$(( $END_SPLIT - $START_SPLIT ))
-	echo -e "$(date)\t$scriptname\tSplitting FASTA took $diff_SPLIT s."
-	### run abyss (deBruijn assembler) on each 100,000 read demultiplexed fasta file, including the unsplit demultiplexed file 
-	echo -e "$(date)\t$scriptname\tRunning abyss on each $split_FASTA_size chunk..."
+	log "Splitting FASTA took $diff_SPLIT s."
+	### run abyss (deBruijn assembler) on each 100,000 read demultiplexed fasta file, including the unsplit demultiplexed file
+	log "Running abyss on each $split_FASTA_size chunk..."
 	START_ABYSS=$(date +%s)
 
-	for d in bar$f.${inputfile}_* ; do 
-		abyss-pe k=$kmer name=$d.f se=$d np=$cores >& $d.abyss.log
+	for d in bar$f.${inputfile}_* ; do
+    echo $d
+	  log "Command: abyss-pe k=$kmer name=$d.f se=$d np=$cores >& $d.abyss.log"
+		#abyss-pe k=$kmer name=$d.f se=$d np=$cores >& $d.abyss.log
+		abyss-pe k=$kmer name=$d.f se=$d >& $d.abyss.log
 	done
 
-	echo -e "$(date)\t$scriptname\tCompleted running abyss on each $split_FASTA_size chunk."
+	log "Completed running abyss on each $split_FASTA_size chunk."
 	END_ABYSS=$(date +%s)
 	diff_ABYSS=$(( END_ABYSS - START_ABYSS ))
-	echo -e "$(date)\t$scriptname\tAbyss took $diff_ABYSS s."
+	log "Abyss took $diff_ABYSS s."
 ###
 ### contigs from split files concatenated, after which reads smaller than the cutoff value are eliminated
-	echo -e "$(date)\t$scriptname\tStarting concatenating and cutoff of abyss output"
+	log "Starting concatenating and cutoff of abyss output"
 	START_CATCONTIGS=$(date +%s)
 
 	#concatenating contig files from different fasta splits, and adding kmer infromation and barcode information to contig headers
 	cat bar$f.${inputfile}_*.f-unitigs.fa | sed 's/ /_/g' | sed "s/$/"_kmer"$kmer""/g;n" | sed "s/$/"$f"/g;n" > all.bar$f.$inputfile.unitigs.fa
 	# only contigs larger than $cutoff_post_Abyss are retained
  	cat all.bar$f.$inputfile.unitigs.fa | awk 'NR%2==1 {x = $0} NR%2==0 { if (length($0) >= '$2') printf("%s\n%s\n",x,$0)}' > all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss.fa
 
-	echo -e "$(date)\t$scriptname\tDone concatenating and cutoff of abyss output"
+	log "Done concatenating and cutoff of abyss output"
 	END_CATCONTIGS=$(date +%s)
 	diff_CATCONTIGS=$(( END_CATCONTIGS - START_CATCONTIGS ))
-	echo -e "$(date)\t$scriptname\tConcatenating and cutoff of abyss output took $diff_CATCONTIGS s."
+	log "Concatenating and cutoff of abyss output took $diff_CATCONTIGS s."
 ###
 ### run Minimo (OLC assembler)
-	echo -e "$(date)\t$scriptname\tStarting Minimo..."
+	log "Starting Minimo..."
 	START_MINIMO=$(date +%s)
+  log "Command: Minimo all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss.fa -D FASTA_EXP=1"
 	Minimo all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss.fa -D FASTA_EXP=1
-	echo -e "$(date)\t$scriptname\tCompleted Minimo."
+	log "Completed Minimo."
 	END_MINIMO=$(date +%s)
 	diff_MINIMO=$(( END_MINIMO - START_MINIMO ))
-	echo -e "$(date)\t$scriptname\tMinimo took $diff_MINIMO s."
+	log "Minimo took $diff_MINIMO s."
 ###########
-	echo -e "$(date)\t$scriptname\tStarting cat barcode addition and cutoff of minimo output"
+	log "Starting cat barcode addition and cutoff of minimo output"
 	START_MIN_PROCESS=$(date +%s)
-	# Minimo output gives more than one line per sequence, so here we linearize sequences (linearization protocol from here http://seqanswers.com/forums/showthread.php?t=27567 ) 
+	# Minimo output gives more than one line per sequence, so here we linearize sequences (linearization protocol from here http://seqanswers.com/forums/showthread.php?t=27567 )
 	# then we add the relevant barcode to the end of the header contig. Contigs that survive untouched from abyss already have a barcode at the end of them, so that extra barcode is taken away
-	cat all-contigs.fa | awk '{if (substr($0,1,1)==">"){if (p){print "\n";} print $0} else printf("%s",$0);p++;}END{print "\n"}' | sed '/^$/d' | sed 's/ /_/g' | sed "s/$/"$f"/g;n" | sed "s/"$f""$f"/"$f"/g" | sed 's/#/#@/g' | sed 's/^>/>contig_/g' > all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss-minim.fa	
-	# change generic name all-contigs.fa 
+	cat all-contigs.fa | awk '{if (substr($0,1,1)==">"){if (p){print "\n";} print $0} else printf("%s",$0);p++;}END{print "\n"}' | sed '/^$/d' | sed 's/ /_/g' | sed "s/$/"$f"/g;n" | sed "s/"$f""$f"/"$f"/g" | sed 's/#/#@/g' | sed 's/^>/>contig_/g' > all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss-minim.fa
+	# change generic name all-contigs.fa
 	mv all-contigs.fa all-contigs.fa.$f
 	# only contigs larger than $cutoff_post_Minimo are retained
 	cat all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss-minim.fa | awk 'NR%2==1 {x = $0} NR%2==0 { if (length($0) >= '$3') printf("%s\n%s\n",x,$0)}' > all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss.${cutoff_post_Minimo}-mini.fa
 
-	echo -e "$(date)\t$scriptname\tDone cat barcode addition and cutoff of minimo output"
+	log "Done cat barcode addition and cutoff of minimo output"
 	END_MIN_PROCESS=$(date +%s)
 	diff_MIN_PROCESS=$(( END_MIN_PROCESS - START_MIN_PROCESS ))
-	echo -e "$(date)\t$scriptname\tcat barcode addition and cutoff of minimo output took $diff_MIN_PROCESS s"
+	log "cat barcode addition and cutoff of minimo output took $diff_MIN_PROCESS s"
 done
 ###
 ### concatenate deBruijn -> OLC contigs from all barcodes together
-echo -e "$(date)\t$scriptname\tConcatenating all barcodes together..."
+log "Concatenating all barcodes together..."
 START_CAT=$(date +%s)
 cat all.bar*.$inputfile.unitigs.cut$cutoff_post_Abyss.${cutoff_post_Minimo}-mini.fa > all.$inputfile.unitigs.cut$cutoff_post_Abyss.${cutoff_post_Minimo}-mini.fa
-echo -e "$(date)\t$scriptname\tCompleted concatenating all barcodes."
+log "Completed concatenating all barcodes."
 END_CAT=$(date +%s)
 diff_CAT=$(( END_CAT - START_CAT ))
-echo -e "$(date)\t$scriptname\tBarcode concatenation took $diff_CAT s."
+log "Barcode concatenation took $diff_CAT s."
 
 # cleaning up files by organizing directories, moving files into directories, and removing temporary files
-mkdir $inputfile.dir
+mkdir --parents $inputfile.dir
 mv bar*$inputfile*.fa $inputfile.dir
 if [ -e all.$inputfile.contigs.abyssmini.cut$cutoff_post_Abyss.$cutoff_post_Minimo.e1.NR.RAPSearch.m8 ]
 then
@@ -148,7 +153,7 @@ fi
 rm -f all.$inputfile.contigs.abyssmini.cut$cutoff_post_Abyss.$cutoff_post_Minimo.e1.NR.RAPSearch.aln
 rm -f all.$inputfile.contigs.abyssmini.e1.NR.RAPSearch.m8.noheader
 rm -f all.$inputfile.contigs.abyssmini.e1.NR.RAPSearch.m8.noheader.seq
-rm -f all.$inputfile.contigs.abyssmini.e1.NR.RAPSearch.m8.noheader 
+rm -f all.$inputfile.contigs.abyssmini.e1.NR.RAPSearch.m8.noheader
 rm -f all.$inputfile.contigs.abyssmini.e1.NR.RAPSearch.m8.noheader.ex.fa
 rm -f all.$inputfile.unitigs.cut$cutoff_post_Abyss-contigs.sortlen.seq
 rm -f all-contigs*

diff --git a/compare_multiple_sam.py b/compare_multiple_sam.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 #	compare_multiple_sam.py
 #
@@ -9,7 +9,7 @@
 #
 # Copyright (C) 2014 Charles Y Chiu - All Rights Reserved
 # Permission to copy and modify is granted under the BSD license
-# Last revised 3/21/2014  
+# Last revised 3/21/2014
 
 import sys
 
@@ -61,7 +61,7 @@
 	lineList = []
 	for file in fileObjList:
 		lineList.append(file.readline())
-		
+
 for file in fileObjList:
 	file.close()
 outputFile.close()
diff --git a/compare_sam.py b/compare_sam.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 #	compare_sam.py
 #
@@ -14,13 +14,13 @@
 import os
 
 def logHeader():
-	import os.path, sys, time
-	return "%s\t%s\t" % (time.strftime("%a %b %d %H:%M:%S %Z %Y"), os.path.basename(sys.argv[0]))
+    import os.path, sys, time
+    return "%s\t%s\t" % (time.strftime("%a %b %d %H:%M:%S %Z %Y"), os.path.basename(sys.argv[0]))
 
 usage = "compare_sam.py <annotated SAM file> <outputFile>"
 if len(sys.argv) != 3:
-	print usage
-	sys.exit(0)
+    print usage
+    sys.exit(0)
 
 SAMfile1 = sys.argv[1]
 outputFile1 = sys.argv[2]
@@ -41,63 +41,66 @@ def logHeader():
 linenum += 1
 
 while line1 != '':
-	data1 = line1.split()
-
-	# check to see whether line is in the SAM header or alignment section
-	if (data1[0][0]!="@"):
-		#We are in the SAM alignment section
-
-		edit_distance=int(data1[12].split(":")[2])
-
-		#using old snap (<=v0.15.4), could use length of line to determine if it was a hit or miss.
-		#new snap (>=v1.0) instead uses an edit distance of -1 to signify a miss.
-
-		if (edit_distance >= 0):	
-			#this is a hit
-			new_hits += 1
-			sum_edistance_new += int(data1[12].split(":")[2])
-			firstentry = data1[0].split("|")
-			if (len(firstentry)>=2):
-				# this has been previously flagged as a hit, and is also now a hit. Compare edit distances to find best hit.
-				edit_distance_previous=firstentry[1]
-				edit_distance_current = data1[12].split(":")[2]
-
-				existing_hits += 1
-				sum_edistance_existing += int(edit_distance_previous)
-
-				if (int(edit_distance_current) <= int(edit_distance_previous)):
-					replacements += 1
-					sum_data1 += int(edit_distance_current)
-					# need to pick the new gi, not the old gi, bug fix 1/27/13
-					gi=data1[2].split("|")[1]
-					replacement_line = line1.replace("|" + firstentry[1] + "|" + firstentry[2],"|" + str(edit_distance_current) + "|" + str(gi),1)
-					outputFile.write(replacement_line)
-				else:
-					sum_data1 += int(edit_distance_previous)
-					outputFile.write(line1)
-			else:
-				edit_distance_current = data1[12].split(":")[2]
-				existing_hits += 1
-				sum_data1 += int(edit_distance_current)
-				gi=data1[2].split("|")[1]
-				replacement_line = line1.replace(data1[0],data1[0] + "|" + str(edit_distance_current) + "|" + str(gi),1)
-				outputFile.write(replacement_line)
-				replacements+=1
-		else:
-			#this is a miss
-			outputFile.write(line1)
-			firstentry = data1[0].split("|")
-			if (len(firstentry)>=2):
-				edit_distance_previous=firstentry[1]
-				existing_hits += 1
-				sum_data1 += int(edit_distance_previous)
-				sum_edistance_existing += int(edit_distance_previous)
-	else:
-		#We are in the SAM header section
-		outputFile.write(line1)
-
-	line1 = file1.readline()
-	linenum += 1
+    data1 = line1.split()
+
+    # check to see whether line is in the SAM header or alignment section
+    if (data1[0][0]!="@"):
+        #We are in the SAM alignment section
+
+        edit_distance=int(data1[12].split(":")[2])
+        mapped = not bool(int(data1[1]) & 0x4)
+
+        #using old snap (<=v0.15.4), could use length of line to determine if it was a hit or miss.
+        #new snap (>=v1.0) instead uses an edit distance of -1 to signify a miss.
+
+        # Not sure what above is saying.
+        # Use 0x4 mask in FLAG field to check for unmapped reads.
+        if (mapped):
+            #this is a hit
+            new_hits += 1
+            sum_edistance_new += edit_distance
+            firstentry = data1[0].split("|")
+            if (len(firstentry)>=2):
+                # this has been previously flagged as a hit, and is also now a hit. Compare edit distances to find best hit.
+                edit_distance_previous=firstentry[1]
+                edit_distance_current = data1[12].split(":")[2]
+
+                existing_hits += 1
+                sum_edistance_existing += int(edit_distance_previous)
+
+                if (int(edit_distance_current) <= int(edit_distance_previous)):
+                    replacements += 1
+                    sum_data1 += int(edit_distance_current)
+                    # need to pick the new gi, not the old gi, bug fix 1/27/13
+                    gi=data1[2].split("|")[1]
+                    replacement_line = line1.replace("|" + firstentry[1] + "|" + firstentry[2],"|" + str(edit_distance_current) + "|" + str(gi),1)
+                    outputFile.write(replacement_line)
+                else:
+                    sum_data1 += int(edit_distance_previous)
+                    outputFile.write(line1)
+            else:
+                edit_distance_current = data1[12].split(":")[2]
+                existing_hits += 1
+                sum_data1 += int(edit_distance_current)
+                gi=data1[2].split("|")[1]
+                replacement_line = line1.replace(data1[0],data1[0] + "|" + str(edit_distance_current) + "|" + str(gi),1)
+                outputFile.write(replacement_line)
+                replacements+=1
+        else:
+            #this is a miss
+            outputFile.write(line1)
+            firstentry = data1[0].split("|")
+            if (len(firstentry)>=2):
+                edit_distance_previous=firstentry[1]
+                existing_hits += 1
+                sum_data1 += int(edit_distance_previous)
+                sum_edistance_existing += int(edit_distance_previous)
+    else:
+        #We are in the SAM header section
+        outputFile.write(line1)
+
+    line1 = file1.readline()
+    linenum += 1
 file1.close()
 outputFile.close()
 

diff --git a/coveragePlot.py b/coveragePlot.py
@@ -1,8 +1,8 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 #	coveragePlot.py
 #
-#	This program generates genomic coverage plots 
+#	This program generates genomic coverage plots
 #	Chiu Laboratory
 #	University of California, San Francisco
 #	January, 2014
@@ -47,7 +47,7 @@ def smart_truncate1(text, max_length=100, suffix='...'):
 
 # print "Installed version is: %s." % mpl_version
 
-#load function is deprecated as of matplotlib v1.3.1, replaced with 
+#load function is deprecated as of matplotlib v1.3.1, replaced with
 if (LooseVersion(mpl_version) >= LooseVersion('1.3.1') ):
 	data = np.loadtxt(dataFile)
 else:
@@ -78,7 +78,7 @@ def smart_truncate1(text, max_length=100, suffix='...'):
     plot(data[:,0],data[:,1],color)
     xlabel("base position",fontsize=8)
     ylabel("fold coverage",fontsize=8)
-    title_text = sys.argv[2] 
+    title_text = sys.argv[2]
     suptitle(title_text,fontsize=9)
     xMin, xMax, yMin, yMax = min(data[:,0]),max(data[:,0]),min(data[:,1]),max(data[:,1])
     # add a 10% buffer to yMax
@@ -105,7 +105,7 @@ def smart_truncate1(text, max_length=100, suffix='...'):
     plot(data[:,0],data[:,1],color)
     xlabel("base position",fontsize=8)
     ylabel("fold coverage",fontsize=8)
-    title_text = sys.argv[2] 
+    title_text = sys.argv[2]
     suptitle(title_text,fontsize=9)
     xMin, xMax, yMin, yMax = min(data[:,0]),max(data[:,0]),min(data[:,1]),max(data[:,1])
     yMax *= 1.1
@@ -122,7 +122,7 @@ def smart_truncate1(text, max_length=100, suffix='...'):
     plot(data[:,0],data[:,1],color)
     xlabel("base position",fontsize=8)
     ylabel("fold coverage",fontsize=8)
-    title_text = sys.argv[2] 
+    title_text = sys.argv[2]
     suptitle(title_text,fontsize=9)
     xMin, xMax, yMin, yMax = min(data[:,0]),max(data[:,0]),min(data[:,1]),max(data[:,1])
     yMax *= 1.1

diff --git a/coverage_generator_bp.sh b/coverage_generator_bp.sh
@@ -93,7 +93,7 @@ do
 		# create list of curated GIs $SNAP_file.$genus.gi.list.curatedgenome
 		# First retain only gis that are complete genomes if no GIs with "complete genomes" are returned, we'll go to GIs  "complete sequences", if no complete sequences, then we just go with the GIs we have
 
-		get_genbankfasta.pl -i bar.$bar.$SNAP_file.$genus.gi.list  > bar.$bar.$SNAP_file.$genus.gi.headers 
+		get_genbankfasta.pl -i bar.$bar.$SNAP_file.$genus.gi.list  > bar.$bar.$SNAP_file.$genus.gi.headers
 
 		egrep ">gi.*[Cc]omplete [Gg]enome" bar.$bar.$SNAP_file.$genus.gi.headers  | awk -F "|" '{print$2}'  | head -n $top_gis > bar.$bar.$SNAP_file.$genus.gi
 		if [ -s bar.$bar.$SNAP_file.$genus.gi ]