Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file modified License.txt
100755 → 100644
Empty file.
Empty file modified Readme.txt
100755 → 100644
Empty file.
546 changes: 259 additions & 287 deletions SURPI.sh

Large diffs are not rendered by default.

65 changes: 35 additions & 30 deletions abyss_minimus.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,9 @@
# Copyright (C) 2014 Samia N Naccache - All Rights Reserved
# SURPI has been released under a modified BSD license.
# Please see license file for details.

scriptname=${0##*/}
source debug.sh
source logging.sh

if [ $# -lt 5 ]
then
Expand All @@ -34,7 +35,7 @@ ignore_barcodes=$6
split_FASTA_size=100000

#### create list of barcodes in the fasta file in the following format: #barcode
echo -e "$(date)\t$scriptname\tDemultiplexing barcodes..."
log "Demultiplexing barcodes..."
START_DEMULTIPLEX=$(date +%s)

#The sed '/^#$/d' removes a blank #. This is needed in cases where barcodes are present along with reads that have no barcodes
Expand All @@ -54,10 +55,10 @@ else #$inputfile.barcodes doesn't exist, or size=0, then we only have 1 barcode,
echo "#" > $inputfile.barcodes
fi

echo -e "$(date)\t$scriptname\tCompleted demultiplexing barcodes."
log "Completed demultiplexing barcodes."
END_DEMULTIPLEX=$(date +%s)
diff_DEMULTIPLEX=$(( END_DEMULTIPLEX - START_DEMULTIPLEX ))
echo -e "$(date)\t$scriptname\tBarcode demultiplex took $diff_DEMULTIPLEX s."
log "Barcode demultiplex took $diff_DEMULTIPLEX s."
### generate fasta file for every separate barcode (demultiplex)

for f in `cat $inputfile.barcodes` ; do
Expand All @@ -68,78 +69,82 @@ for f in `cat $inputfile.barcodes` ; do
grep -E "$f(/|$)" $inputfile -A 1 --no-group-separator > bar$f.$inputfile
fi
# split demultiplexed fasta file into 100,000 read sub-fastas
echo -e "$(date)\t$scriptname\tSplitting FASTA into chunks of size: $split_FASTA_size."
log "Splitting FASTA into chunks of size: $split_FASTA_size."
START_SPLIT=$(date +%s)

cp bar$f.$inputfile bar$f.${inputfile}_n # So that the unsplit demultiplexed file is also denovo assembled #
split_fasta.pl -i bar$f.$inputfile -o bar$f.$inputfile -n $split_FASTA_size
echo -e "$(date)\t$scriptname\tCompleted splitting FASTA file."
log "Completed splitting FASTA file."
END_SPLIT=$(date +%s)
diff_SPLIT=$(( $END_SPLIT - $START_SPLIT ))
echo -e "$(date)\t$scriptname\tSplitting FASTA took $diff_SPLIT s."
### run abyss (deBruijn assembler) on each 100,000 read demultiplexed fasta file, including the unsplit demultiplexed file
echo -e "$(date)\t$scriptname\tRunning abyss on each $split_FASTA_size chunk..."
log "Splitting FASTA took $diff_SPLIT s."
### run abyss (deBruijn assembler) on each 100,000 read demultiplexed fasta file, including the unsplit demultiplexed file
log "Running abyss on each $split_FASTA_size chunk..."
START_ABYSS=$(date +%s)

for d in bar$f.${inputfile}_* ; do
abyss-pe k=$kmer name=$d.f se=$d np=$cores >& $d.abyss.log
for d in bar$f.${inputfile}_* ; do
echo $d
log "Command: abyss-pe k=$kmer name=$d.f se=$d np=$cores >& $d.abyss.log"
#abyss-pe k=$kmer name=$d.f se=$d np=$cores >& $d.abyss.log
abyss-pe k=$kmer name=$d.f se=$d >& $d.abyss.log
done

echo -e "$(date)\t$scriptname\tCompleted running abyss on each $split_FASTA_size chunk."
log "Completed running abyss on each $split_FASTA_size chunk."
END_ABYSS=$(date +%s)
diff_ABYSS=$(( END_ABYSS - START_ABYSS ))
echo -e "$(date)\t$scriptname\tAbyss took $diff_ABYSS s."
log "Abyss took $diff_ABYSS s."
###
### contigs from split files concatenated, after which reads smaller than the cutoff value are eliminated
echo -e "$(date)\t$scriptname\tStarting concatenating and cutoff of abyss output"
log "Starting concatenating and cutoff of abyss output"
START_CATCONTIGS=$(date +%s)

#concatenating contig files from different fasta splits, and adding kmer infromation and barcode information to contig headers
cat bar$f.${inputfile}_*.f-unitigs.fa | sed 's/ /_/g' | sed "s/$/"_kmer"$kmer""/g;n" | sed "s/$/"$f"/g;n" > all.bar$f.$inputfile.unitigs.fa
# only contigs larger than $cutoff_post_Abyss are retained
cat all.bar$f.$inputfile.unitigs.fa | awk 'NR%2==1 {x = $0} NR%2==0 { if (length($0) >= '$2') printf("%s\n%s\n",x,$0)}' > all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss.fa

echo -e "$(date)\t$scriptname\tDone concatenating and cutoff of abyss output"
log "Done concatenating and cutoff of abyss output"
END_CATCONTIGS=$(date +%s)
diff_CATCONTIGS=$(( END_CATCONTIGS - START_CATCONTIGS ))
echo -e "$(date)\t$scriptname\tConcatenating and cutoff of abyss output took $diff_CATCONTIGS s."
log "Concatenating and cutoff of abyss output took $diff_CATCONTIGS s."
###
### run Minimo (OLC assembler)
echo -e "$(date)\t$scriptname\tStarting Minimo..."
log "Starting Minimo..."
START_MINIMO=$(date +%s)
log "Command: Minimo all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss.fa -D FASTA_EXP=1"
Minimo all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss.fa -D FASTA_EXP=1
echo -e "$(date)\t$scriptname\tCompleted Minimo."
log "Completed Minimo."
END_MINIMO=$(date +%s)
diff_MINIMO=$(( END_MINIMO - START_MINIMO ))
echo -e "$(date)\t$scriptname\tMinimo took $diff_MINIMO s."
log "Minimo took $diff_MINIMO s."
###########
echo -e "$(date)\t$scriptname\tStarting cat barcode addition and cutoff of minimo output"
log "Starting cat barcode addition and cutoff of minimo output"
START_MIN_PROCESS=$(date +%s)
# Minimo output gives more than one line per sequence, so here we linearize sequences (linearization protocol from here http://seqanswers.com/forums/showthread.php?t=27567 )
# Minimo output gives more than one line per sequence, so here we linearize sequences (linearization protocol from here http://seqanswers.com/forums/showthread.php?t=27567 )
# then we add the relevant barcode to the end of the header contig. Contigs that survive untouched from abyss already have a barcode at the end of them, so that extra barcode is taken away
cat all-contigs.fa | awk '{if (substr($0,1,1)==">"){if (p){print "\n";} print $0} else printf("%s",$0);p++;}END{print "\n"}' | sed '/^$/d' | sed 's/ /_/g' | sed "s/$/"$f"/g;n" | sed "s/"$f""$f"/"$f"/g" | sed 's/#/#@/g' | sed 's/^>/>contig_/g' > all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss-minim.fa
# change generic name all-contigs.fa
cat all-contigs.fa | awk '{if (substr($0,1,1)==">"){if (p){print "\n";} print $0} else printf("%s",$0);p++;}END{print "\n"}' | sed '/^$/d' | sed 's/ /_/g' | sed "s/$/"$f"/g;n" | sed "s/"$f""$f"/"$f"/g" | sed 's/#/#@/g' | sed 's/^>/>contig_/g' > all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss-minim.fa
# change generic name all-contigs.fa
mv all-contigs.fa all-contigs.fa.$f
# only contigs larger than $cutoff_post_Minimo are retained
cat all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss-minim.fa | awk 'NR%2==1 {x = $0} NR%2==0 { if (length($0) >= '$3') printf("%s\n%s\n",x,$0)}' > all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss.${cutoff_post_Minimo}-mini.fa

echo -e "$(date)\t$scriptname\tDone cat barcode addition and cutoff of minimo output"
log "Done cat barcode addition and cutoff of minimo output"
END_MIN_PROCESS=$(date +%s)
diff_MIN_PROCESS=$(( END_MIN_PROCESS - START_MIN_PROCESS ))
echo -e "$(date)\t$scriptname\tcat barcode addition and cutoff of minimo output took $diff_MIN_PROCESS s"
log "cat barcode addition and cutoff of minimo output took $diff_MIN_PROCESS s"
done
###
### concatenate deBruijn -> OLC contigs from all barcodes together
echo -e "$(date)\t$scriptname\tConcatenating all barcodes together..."
log "Concatenating all barcodes together..."
START_CAT=$(date +%s)
cat all.bar*.$inputfile.unitigs.cut$cutoff_post_Abyss.${cutoff_post_Minimo}-mini.fa > all.$inputfile.unitigs.cut$cutoff_post_Abyss.${cutoff_post_Minimo}-mini.fa
echo -e "$(date)\t$scriptname\tCompleted concatenating all barcodes."
log "Completed concatenating all barcodes."
END_CAT=$(date +%s)
diff_CAT=$(( END_CAT - START_CAT ))
echo -e "$(date)\t$scriptname\tBarcode concatenation took $diff_CAT s."
log "Barcode concatenation took $diff_CAT s."

# cleaning up files by organizing directories, moving files into directories, and removing temporary files
mkdir $inputfile.dir
mkdir --parents $inputfile.dir
mv bar*$inputfile*.fa $inputfile.dir
if [ -e all.$inputfile.contigs.abyssmini.cut$cutoff_post_Abyss.$cutoff_post_Minimo.e1.NR.RAPSearch.m8 ]
then
Expand All @@ -148,7 +153,7 @@ fi
rm -f all.$inputfile.contigs.abyssmini.cut$cutoff_post_Abyss.$cutoff_post_Minimo.e1.NR.RAPSearch.aln
rm -f all.$inputfile.contigs.abyssmini.e1.NR.RAPSearch.m8.noheader
rm -f all.$inputfile.contigs.abyssmini.e1.NR.RAPSearch.m8.noheader.seq
rm -f all.$inputfile.contigs.abyssmini.e1.NR.RAPSearch.m8.noheader
rm -f all.$inputfile.contigs.abyssmini.e1.NR.RAPSearch.m8.noheader
rm -f all.$inputfile.contigs.abyssmini.e1.NR.RAPSearch.m8.noheader.ex.fa
rm -f all.$inputfile.unitigs.cut$cutoff_post_Abyss-contigs.sortlen.seq
rm -f all-contigs*
Expand Down
6 changes: 3 additions & 3 deletions compare_multiple_sam.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/python
#!/usr/bin/env python
#
# compare_multiple_sam.py
#
Expand All @@ -9,7 +9,7 @@
#
# Copyright (C) 2014 Charles Y Chiu - All Rights Reserved
# Permission to copy and modify is granted under the BSD license
# Last revised 3/21/2014
# Last revised 3/21/2014

import sys

Expand Down Expand Up @@ -61,7 +61,7 @@
lineList = []
for file in fileObjList:
lineList.append(file.readline())

for file in fileObjList:
file.close()
outputFile.close()
127 changes: 65 additions & 62 deletions compare_sam.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/python
#!/usr/bin/env python
#
# compare_sam.py
#
Expand All @@ -14,13 +14,13 @@
import os

def logHeader():
import os.path, sys, time
return "%s\t%s\t" % (time.strftime("%a %b %d %H:%M:%S %Z %Y"), os.path.basename(sys.argv[0]))
import os.path, sys, time
return "%s\t%s\t" % (time.strftime("%a %b %d %H:%M:%S %Z %Y"), os.path.basename(sys.argv[0]))

usage = "compare_sam.py <annotated SAM file> <outputFile>"
if len(sys.argv) != 3:
print usage
sys.exit(0)
print usage
sys.exit(0)

SAMfile1 = sys.argv[1]
outputFile1 = sys.argv[2]
Expand All @@ -41,63 +41,66 @@ def logHeader():
linenum += 1

while line1 != '':
data1 = line1.split()

# check to see whether line is in the SAM header or alignment section
if (data1[0][0]!="@"):
#We are in the SAM alignment section

edit_distance=int(data1[12].split(":")[2])

#using old snap (<=v0.15.4), could use length of line to determine if it was a hit or miss.
#new snap (>=v1.0) instead uses an edit distance of -1 to signify a miss.

if (edit_distance >= 0):
#this is a hit
new_hits += 1
sum_edistance_new += int(data1[12].split(":")[2])
firstentry = data1[0].split("|")
if (len(firstentry)>=2):
# this has been previously flagged as a hit, and is also now a hit. Compare edit distances to find best hit.
edit_distance_previous=firstentry[1]
edit_distance_current = data1[12].split(":")[2]

existing_hits += 1
sum_edistance_existing += int(edit_distance_previous)

if (int(edit_distance_current) <= int(edit_distance_previous)):
replacements += 1
sum_data1 += int(edit_distance_current)
# need to pick the new gi, not the old gi, bug fix 1/27/13
gi=data1[2].split("|")[1]
replacement_line = line1.replace("|" + firstentry[1] + "|" + firstentry[2],"|" + str(edit_distance_current) + "|" + str(gi),1)
outputFile.write(replacement_line)
else:
sum_data1 += int(edit_distance_previous)
outputFile.write(line1)
else:
edit_distance_current = data1[12].split(":")[2]
existing_hits += 1
sum_data1 += int(edit_distance_current)
gi=data1[2].split("|")[1]
replacement_line = line1.replace(data1[0],data1[0] + "|" + str(edit_distance_current) + "|" + str(gi),1)
outputFile.write(replacement_line)
replacements+=1
else:
#this is a miss
outputFile.write(line1)
firstentry = data1[0].split("|")
if (len(firstentry)>=2):
edit_distance_previous=firstentry[1]
existing_hits += 1
sum_data1 += int(edit_distance_previous)
sum_edistance_existing += int(edit_distance_previous)
else:
#We are in the SAM header section
outputFile.write(line1)

line1 = file1.readline()
linenum += 1
data1 = line1.split()

# check to see whether line is in the SAM header or alignment section
if (data1[0][0]!="@"):
#We are in the SAM alignment section

edit_distance=int(data1[12].split(":")[2])
mapped = not bool(int(data1[1]) & 0x4)

#using old snap (<=v0.15.4), could use length of line to determine if it was a hit or miss.
#new snap (>=v1.0) instead uses an edit distance of -1 to signify a miss.

# Not sure what above is saying.
# Use 0x4 mask in FLAG field to check for unmapped reads.
if (mapped):
#this is a hit
new_hits += 1
sum_edistance_new += edit_distance
firstentry = data1[0].split("|")
if (len(firstentry)>=2):
# this has been previously flagged as a hit, and is also now a hit. Compare edit distances to find best hit.
edit_distance_previous=firstentry[1]
edit_distance_current = data1[12].split(":")[2]

existing_hits += 1
sum_edistance_existing += int(edit_distance_previous)

if (int(edit_distance_current) <= int(edit_distance_previous)):
replacements += 1
sum_data1 += int(edit_distance_current)
# need to pick the new gi, not the old gi, bug fix 1/27/13
gi=data1[2].split("|")[1]
replacement_line = line1.replace("|" + firstentry[1] + "|" + firstentry[2],"|" + str(edit_distance_current) + "|" + str(gi),1)
outputFile.write(replacement_line)
else:
sum_data1 += int(edit_distance_previous)
outputFile.write(line1)
else:
edit_distance_current = data1[12].split(":")[2]
existing_hits += 1
sum_data1 += int(edit_distance_current)
gi=data1[2].split("|")[1]
replacement_line = line1.replace(data1[0],data1[0] + "|" + str(edit_distance_current) + "|" + str(gi),1)
outputFile.write(replacement_line)
replacements+=1
else:
#this is a miss
outputFile.write(line1)
firstentry = data1[0].split("|")
if (len(firstentry)>=2):
edit_distance_previous=firstentry[1]
existing_hits += 1
sum_data1 += int(edit_distance_previous)
sum_edistance_existing += int(edit_distance_previous)
else:
#We are in the SAM header section
outputFile.write(line1)

line1 = file1.readline()
linenum += 1
file1.close()
outputFile.close()

Expand Down
12 changes: 6 additions & 6 deletions coveragePlot.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#!/usr/bin/python
#!/usr/bin/env python
#
# coveragePlot.py
#
# This program generates genomic coverage plots
# This program generates genomic coverage plots
# Chiu Laboratory
# University of California, San Francisco
# January, 2014
Expand Down Expand Up @@ -47,7 +47,7 @@ def smart_truncate1(text, max_length=100, suffix='...'):

# print "Installed version is: %s." % mpl_version

#load function is deprecated as of matplotlib v1.3.1, replaced with
#load function is deprecated as of matplotlib v1.3.1, replaced with
if (LooseVersion(mpl_version) >= LooseVersion('1.3.1') ):
data = np.loadtxt(dataFile)
else:
Expand Down Expand Up @@ -78,7 +78,7 @@ def smart_truncate1(text, max_length=100, suffix='...'):
plot(data[:,0],data[:,1],color)
xlabel("base position",fontsize=8)
ylabel("fold coverage",fontsize=8)
title_text = sys.argv[2]
title_text = sys.argv[2]
suptitle(title_text,fontsize=9)
xMin, xMax, yMin, yMax = min(data[:,0]),max(data[:,0]),min(data[:,1]),max(data[:,1])
# add a 10% buffer to yMax
Expand All @@ -105,7 +105,7 @@ def smart_truncate1(text, max_length=100, suffix='...'):
plot(data[:,0],data[:,1],color)
xlabel("base position",fontsize=8)
ylabel("fold coverage",fontsize=8)
title_text = sys.argv[2]
title_text = sys.argv[2]
suptitle(title_text,fontsize=9)
xMin, xMax, yMin, yMax = min(data[:,0]),max(data[:,0]),min(data[:,1]),max(data[:,1])
yMax *= 1.1
Expand All @@ -122,7 +122,7 @@ def smart_truncate1(text, max_length=100, suffix='...'):
plot(data[:,0],data[:,1],color)
xlabel("base position",fontsize=8)
ylabel("fold coverage",fontsize=8)
title_text = sys.argv[2]
title_text = sys.argv[2]
suptitle(title_text,fontsize=9)
xMin, xMax, yMin, yMax = min(data[:,0]),max(data[:,0]),min(data[:,1]),max(data[:,1])
yMax *= 1.1
Expand Down
2 changes: 1 addition & 1 deletion coverage_generator_bp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ do
# create list of curated GIs $SNAP_file.$genus.gi.list.curatedgenome
# First retain only gis that are complete genomes if no GIs with "complete genomes" are returned, we'll go to GIs "complete sequences", if no complete sequences, then we just go with the GIs we have

get_genbankfasta.pl -i bar.$bar.$SNAP_file.$genus.gi.list > bar.$bar.$SNAP_file.$genus.gi.headers
get_genbankfasta.pl -i bar.$bar.$SNAP_file.$genus.gi.list > bar.$bar.$SNAP_file.$genus.gi.headers

egrep ">gi.*[Cc]omplete [Gg]enome" bar.$bar.$SNAP_file.$genus.gi.headers | awk -F "|" '{print$2}' | head -n $top_gis > bar.$bar.$SNAP_file.$genus.gi
if [ -s bar.$bar.$SNAP_file.$genus.gi ]
Expand Down
Loading