diff --git a/License.txt b/License.txt old mode 100755 new mode 100644 diff --git a/Readme.txt b/Readme.txt old mode 100755 new mode 100644 diff --git a/SURPI.sh b/SURPI.sh index 293e7f5..fe9a19c 100755 --- a/SURPI.sh +++ b/SURPI.sh @@ -13,25 +13,28 @@ # SURPI_version="1.0.22" -optspec=":f:hvz:" -bold=$(tput bold) -normal=$(tput sgr0) +optspec=":f:d:hvz:" +bold=$(tput bold 2> /dev/null || true) +normal=$(tput sgr0 2> /dev/null || true) green='\e[0;32m' red='\e[0;31m' endColor='\e[0m' host=$(hostname) scriptname=${0##*/} +source debug.sh +source logging.sh while getopts "$optspec" option; do case "${option}" in f) config_file=${OPTARG};; # get parameters from config file if specified + d) ref_path=${OPTARG:-/reference};; # Path to reference db directory. h) HELP=1;; v) VERIFICATION=1;; - z) create_config_file=${OPTARG} + z) create_config_file=${OPTARG} configprefix=${create_config_file%.fastq} ;; - :) echo "Option -$OPTARG requires an argument." >&2 + :) echo "Option -$OPTARG requires an argument." 
>&2 exit 1 ;; esac @@ -39,58 +42,59 @@ done if [[ $HELP -eq 1 || $# -lt 1 ]] then - cat < go_$configprefix + echo "PATH=$PATH:/usr/local/bin/surpi:/usr/local/bin:/usr/bin/:/bin" > go_$configprefix echo "nohup $scriptname -f $configprefix.config > SURPI.$configprefix.log 2> SURPI.$configprefix.err" >> go_$configprefix chmod +x go_$configprefix @@ -152,7 +156,8 @@ run_mode="Comprehensive" # $basef.cutadapt.fastq # $basef.preprocessed.fastq #preprocess="skip" - +#human_mapping="skip" +#alignment="skip" ########################## # Preprocessing @@ -177,7 +182,7 @@ quality_cutoff=18 ########################## #SNAP executable -snap="/usr/local/bin/snap-dev" +snap="snap" #SNAP edit distance for Computational Subtraction of host genome [Highly recommended default: d_human=12] #see Section 3.1.2 MaxDist description: http://snap.cs.berkeley.edu/downloads/snap-1.0beta-manual.pdf @@ -246,32 +251,32 @@ eBLASTn="1e-15" # SURPI will subtract all SNAP databases found in this directory from the input sequence # Useful if you want to subtract multiple genomes (without combining SNAP databases) # or, if you need to split a db if it is larger than available RAM. 
-SNAP_subtraction_folder="/reference/hg19" +SNAP_subtraction_folder="$ref_path/Subtraction_SNAP" # directory for SNAP-indexed databases of NCBI NT (for mapping phase in comprehensive mode) # directory must ONLY contain snap indexed databases -SNAP_COMPREHENSIVE_db_dir="/reference/COMP_SNAP" +SNAP_COMPREHENSIVE_db_dir="$ref_path/COMP_SNAP" # directory for SNAP-indexed databases for mapping phase in FAST mode # directory must ONLY contain snap indexed databases -SNAP_FAST_db_dir="/reference/FAST_SNAP" +SNAP_FAST_db_dir="$ref_path/FAST_SNAP" #Taxonomy Reference data directory #This folder should contain the 3 SQLite files created by the script "create_taxonomy_db.sh" #gi_taxid_nucl.db - nucleotide db of gi/taxonid #gi_taxid_prot.db - protein db of gi/taxonid #names_nodes_scientific.db - db of taxonid/taxonomy -taxonomy_db_directory="/reference/taxonomy" +taxonomy_db_directory="$ref_path/taxonomy" #RAPSearch viral database name: indexed protein dataset (all of Viruses) -#make sure that directory also includes the .info file -RAPSearch_VIRUS_db="/reference/RAPSearch/rapsearch_viral_db" +#make sure that directory also includes the .info file +RAPSearch_VIRUS_db="$ref_path/RAPSearch/rapsearch_viral_aa_130628_db_v2.12" #RAPSearch nr database name: indexed protein dataset (all of NR) -#make sure that directory also includes the .info file -RAPSearch_NR_db="/reference/RAPSearch/rapsearch_nr_db" +#make sure that directory also includes the .info file +RAPSearch_NR_db="$ref_path/RAPSearch/rapsearch_nr_db_v2.12" -ribo_snap_bac_euk_directory="/reference/RiboClean_SNAP" +ribo_snap_bac_euk_directory="$ref_path/RiboClean_SNAP" ########################## # Server related values @@ -286,17 +291,12 @@ ribo_snap_bac_euk_directory="/reference/RiboClean_SNAP" #This folder will not be created by SURPI, so be sure it already exists with proper permissions. temporary_files_directory="/tmp/" -#This parameter controls whether dropcache is used throughout the pipeline. 
If free RAM is less than cache_reset, -# then dropcache. If cache_reset=0, then dropcache will never be used. -cache_reset="0" - - ########################## # AWS related values ########################## # AWS_master_slave will start up a slave instance on AWS for each division of the nt database -# It will be more costly, but should run significantly faster than the solo method, which +# It will be more costly, but should run significantly faster than the solo method, which # runs each NT division through SNAP serially on a single machine. # If using the "AWS_master_slave" option, be sure that all parameters in the AWS section below are # set properly. @@ -306,7 +306,7 @@ cache_reset="0" #Which method to use for SNAP to nt [AWS_master_slave/solo] # AWS_master_slave will start up a slave instance on AWS for each division of the nt database -# It will be more costly, but should run significantly faster than the solo method, which +# It will be more costly, but should run significantly faster than the solo method, which # runs each NT division through SNAP serially on a single machine. # If using the "AWS_master_slave" option, be sure that all parameters in the AWS section below are # set properly. @@ -318,7 +318,7 @@ snap_nt_procedure="solo" ami_id="ami-5ef61936" #Number of slave instances will not exceed this value. Useful for testing, in order to restrict instance count. -#Otherwise, number of instances should be equal to number of SNAP-NT database divisions. This value is +#Otherwise, number of instances should be equal to number of SNAP-NT database divisions. This value is #automatically calculated by SURPI. max_slave_instances=29 @@ -402,25 +402,6 @@ then exit 65 fi -#set cache_reset. if none specified: -# >500GB -> 200GB -# >200GB -> 150GB -# otherwise -> 50GB -# note: this may cause problems on a machine with <50GB RAM -if [ ! 
$cache_reset ] -then - total_ram=$(grep MemTotal /proc/meminfo | awk '{print $2}') - if [ "$total_ram" -gt "500000000" ] #500GiB - then - cache_reset=200 # This is 200GB - elif [ "$total_ram" -gt "200000000" ] #200 GiB - then - cache_reset=150 - else - cache_reset=50 - fi -fi - #these 2 parameters are for cropping prior to snap in the preprocessing stage if [ ! $start_nt ] then @@ -486,7 +467,7 @@ nopathf=${FASTQ_file##*/} # remove the path to file basef=${nopathf%.fastq} #verify that all software dependencies are properly installed -declare -a dependency_list=("gt" "seqtk" "fastq" "fqextract" "cutadapt" "prinseq-lite.pl" "dropcache" "$snap" "rapsearch" "fastQValidator" "abyss-pe" "ABYSS-P" "Minimo") +declare -a dependency_list=("gt" "seqtk" "fastq" "fqextract" "cutadapt" "prinseq-lite.pl" "dropcache" "$snap" "rapsearch" "fastQValidator" "abyss-pe" "Minimo") echo "-----------------------------------------------------------------------------------------" echo "DEPENDENCY VERIFICATION" echo "-----------------------------------------------------------------------------------------" @@ -511,10 +492,8 @@ seqtk_version=$(seqtk 2>&1 | head -3 | tail -1 | awk '{print $2}') cutadapt_version=$(cutadapt --version) prinseqlite_version=$(prinseq-lite.pl --version 2>&1 | awk '{print $2}') snap_version=$(snap 2>&1 | grep version | awk '{print $5}') -snap_dev_version=$(snap-dev 2>&1 | grep version | awk '{print $5}') rapsearch_version=$(rapsearch 2>&1 | head -2 | tail -1 | awk '{print $2}') abyss_pe_version=$(abyss-pe version | head -2 | tail -1 | awk '{print $3}') -ABYSS_P_version=$(ABYSS-P --version | head -1 | awk '{print $3}') Minimo_version=$(Minimo -h | tail -2 | awk '{print $2}') echo -e "SURPI version: $SURPI_version" echo -e "config file version: $config_file_version" @@ -523,10 +502,8 @@ echo -e "seqtk: $seqtk_version" echo -e "cutadapt: $cutadapt_version" echo -e "prinseq-lite: $prinseqlite_version${endColor}" echo -e "snap: $snap_version${endColor}" -echo -e "snap-dev: 
$snap_dev_version${endColor}" echo -e "RAPSearch: $rapsearch_version" echo -e "abyss-pe: $abyss_pe_version" -echo -e "ABYSS-P: $ABYSS_P_version" echo -e "Minimo: $Minimo_version" echo "-----------------------------------------------------------------------------------------" @@ -546,9 +523,10 @@ do done echo -e "SNAP Comprehensive Mode database" -for f in $SNAP_COMPREHENSIVE_db_dir/* +for f_basename in $(ls -1v "$SNAP_COMPREHENSIVE_db_dir") do - if [ -f $f/Genome ] + f = "$SNAP_COMPREHENSIVE_db_dir"/"$f_basename" + if [ -f $f/Genome ] then echo -e "\t$f: ${green}OK${endColor}" else @@ -577,7 +555,8 @@ done #verify taxonomy is functioning properly result=$( taxonomy_lookup_embedded.pl -d nucl -q $taxonomy_db_directory 149408158 ) -if [ $result = "149408158" ] +echo "taxonomy result: ${result}" +if [ $result == "149408158" ] then echo -e "taxonomy: ${green}OK${endColor}" else @@ -678,7 +657,6 @@ echo "contigcutoff for abyss assembly unitigs: $contigcutoff" echo "abysskmer length: $abysskmer" echo "Ignore barcodes for assembly? $ignore_barcodes_for_de_novo" -echo "cache_reset (if 0, then dropcache will never be used): $cache_reset" echo "start_nt: $start_nt" echo "crop_length: $crop_length" @@ -735,10 +713,10 @@ then exit fi ########################################################### -echo -e "$(date)\t$scriptname\t########## STARTING SURPI PIPELINE ##########" +log "########## STARTING SURPI PIPELINE ##########" START_PIPELINE=$(date +%s) -echo -e "$(date)\t$scriptname\tFound file $FASTQ_file" -echo -e "$(date)\t$scriptname\tAfter removing path: $nopathf" +log "Found file $FASTQ_file" +log "After removing path: $nopathf" ############ Start up AWS slave machines ################## file_with_slave_ips="slave_list.txt" @@ -746,49 +724,51 @@ file_with_slave_ips="slave_list.txt" if [ "$snap_nt_procedure" = "AWS_master_slave" ] then # start the slaves as a background process. They should be ready to run at the SNAP to NT step in the pipeline. 
- start_slaves.sh $ami_id $actual_slave_instances $instance_type $keypair $security_group $availability_zone $file_with_slave_ips $placement_group & # > $basef.AWS.log 2>&1 + start_slaves.sh $ami_id $actual_slave_instances $instance_type $keypair $security_group $availability_zone $file_with_slave_ips $placement_group & # > $basef.AWS.log "2>&1 fi ############ PREPROCESSING ################## -if [ "$preprocess" != "skip" ] +if grep -qs preprocess_complete "$basef.checkpoint"; then + preprocess_complete=1 +fi +if [[ "$preprocess" != "skip" ]] && [[ "$preprocess_complete" != 1 ]] then - echo -e "$(date)\t$scriptname\t############### PREPROCESSING ###############" - echo -e "$(date)\t$scriptname\tStarting: preprocessing using $cores cores " + if [[ -f $basef.cutadapt.fastq.done ]]; then + cp $basef.cutadapt.fastq + fi + log "############### PREPROCESSING ###############" + log "Starting: preprocessing using $cores cores " START_PREPROC=$(date +%s) - echo -e "$(date)\t$scriptname\tParameters: preprocess_ncores.sh $basef.fastq $quality N $length_cutoff $cores Y N $adapter_set $start_nt $crop_length $temporary_files_directory >& $basef.preprocess.log" - preprocess_ncores.sh $basef.fastq $quality N $length_cutoff $cores $cache_reset N $adapter_set $start_nt $crop_length $temporary_files_directory $quality_cutoff >& $basef.preprocess.log - echo -e "$(date)\t$scriptname\tDone: preprocessing " + log "Parameters: preprocess_ncores.sh $basef.fastq $quality N $length_cutoff $cores Y N $adapter_set $start_nt $crop_length $temporary_files_directory >& $basef.preprocess.log" + preprocess_ncores.sh $basef.fastq $quality N $length_cutoff $cores N $adapter_set $start_nt $crop_length $temporary_files_directory $quality_cutoff >& $basef.preprocess.log + log "Done: preprocessing " END_PREPROC=$(date +%s) diff_PREPROC=$(( END_PREPROC - START_PREPROC )) - echo -e "$(date)\t$scriptname\tPreprocessing took $diff_PREPROC seconds" | tee timing.$basef.log -fi -# verify preprocessing step -if 
[ ! -s $basef.cutadapt.fastq ] || [ ! -s $basef.preprocessed.fastq ] -then - echo -e "$(date)\t$scriptname\t${red}Preprocessing appears to have failed. One of the following files does not exist, or is of 0 size:${endColor}" - echo "$basef.cutadapt.fastq" - echo "$basef.preprocessed.fastq" - exit + log "Preprocessing took $diff_PREPROC seconds" | tee timing.$basef.log + + # verify preprocessing step + if [ ! -s $basef.cutadapt.fastq ] || [ ! -s $basef.preprocessed.fastq ] + then + log "${red}Preprocessing appears to have failed. One of the following files does not exist, or is of 0 size:${endColor}" + echo "$basef.cutadapt.fastq" + echo "$basef.preprocessed.fastq" + exit + else + echo "preprocess_complete" >> "$basef.checkpoint" + fi fi ############# BEGIN SNAP PIPELINE ################# -freemem=$(free -g | awk '{print $4}' | head -n 2 | tail -1) -echo -e "$(date)\t$scriptname\tThere is $freemem GB available free memory...[cutoff=$cache_reset GB]" -if [[ $dropcache == "Y" ]] -then - if [ "$freemem" -lt "$cache_reset" ] - then - echo -e "$(date)\t$scriptname\tClearing cache..." 
- dropcache - fi -fi +basef_h=${nopathf%.fastq}.preprocessed.s20.h250n25d${d_human}xfu # remove fastq extension ############# HUMAN MAPPING ################# -if [ "$human_mapping" != "skip" ] +if grep -qs human_mapping_complete "$basef.checkpoint"; then + human_mapping_complete=1 +fi +if [ "$human_mapping" != "skip" ] && [[ "$human_mapping_complete" != 1 ]] then - echo -e "$(date)\t$scriptname\t############### SNAP TO HUMAN ###############" - basef_h=${nopathf%.fastq}.preprocessed.s20.h250n25d${d_human}xfu # remove fastq extension - echo -e "$(date)\t$scriptname\tBase file: $basef_h" - echo -e "$(date)\t$scriptname\tStarting: $basef_h human mapping" + log "############### SNAP TO HUMAN ###############" + log "Base file: $basef_h" + log "Starting: $basef_h human mapping" file_to_subtract="$basef.preprocessed.fastq" subtracted_output_file="$basef_h.human.snap.unmatched.sam" @@ -798,50 +778,43 @@ then for SNAP_subtraction_db in $SNAP_subtraction_folder/*; do SUBTRACTION_COUNTER=$[$SUBTRACTION_COUNTER +1] # check if SNAP db is cached in RAM, use optimal parameters depending on result - SNAP_db_cached=$(vmtouch -m500G -f "$SNAP_subtraction_db" | grep 'Resident Pages' | awk '{print $5}') - if [[ "$SNAP_db_cached" == "100%" ]] - then - echo -e "$(date)\t$scriptname\tSNAP database is cached ($SNAP_db_cached)." - SNAP_cache_option=" -map " - else - echo -e "$(date)\t$scriptname\tSNAP database is not cached ($SNAP_db_cached)." - SNAP_cache_option=" -pre -map " - fi - echo -e "$(date)\t$scriptname\tParameters: snap-dev single $SNAP_subtraction_db $file_to_subtract -o -sam $subtracted_output_file.$SUBTRACTION_COUNTER.sam -t $cores -x -f -h 250 -d ${d_human} -n 25 -F u $SNAP_cache_option" + #SNAP_db_cached=$(vmtouch -m500G -f "$SNAP_subtraction_db" | grep 'Resident Pages' | awk '{print $5}') + #if [[ "$SNAP_db_cached" == "100%" ]] + #then + #log "SNAP database is cached ($SNAP_db_cached)." 
+ #SNAP_cache_option=" -map " + #else + #log "SNAP database is not cached ($SNAP_db_cached)." + #SNAP_cache_option=" -pre -map " + #fi + log "Parameters: snap single $SNAP_subtraction_db $file_to_subtract -o -sam $subtracted_output_file.$SUBTRACTION_COUNTER.sam -t $cores -x -f -h 250 -d ${d_human} -n 25 -F u $SNAP_cache_option" START_SUBTRACTION_STEP=$(date +%s) - snap-dev single "$SNAP_subtraction_db" "$file_to_subtract" -o -sam "$subtracted_output_file.$SUBTRACTION_COUNTER.sam" -t $cores -x -f -h 250 -d ${d_human} -n 25 -F u $SNAP_cache_option + snap single "$SNAP_subtraction_db" "$file_to_subtract" -o -sam "$subtracted_output_file.$SUBTRACTION_COUNTER.sam" -t $cores -x -f -h 250 -d ${d_human} -n 25 -F u $SNAP_cache_option END_SUBTRACTION_STEP=$(date +%s) - echo -e "$(date)\t$scriptname\tDone: SNAP to human" + log "Done: SNAP to human" diff_SUBTRACTION_STEP=$(( END_SUBTRACTION_STEP - START_SUBTRACTION_STEP )) - echo -e "$(date)\t$scriptname\tSubtraction step: $SUBTRACTION_COUNTER took $diff_SUBTRACTION_STEP seconds" + log "Subtraction step: $SUBTRACTION_COUNTER took $diff_SUBTRACTION_STEP seconds" file_to_subtract="$subtracted_output_file.$SUBTRACTION_COUNTER.sam" done egrep -v "^@" "$subtracted_output_file.$SUBTRACTION_COUNTER.sam" | awk '{if($3 == "*") print "@"$1"\n"$10"\n""+"$1"\n"$11}' > $(echo "$basef_h".human.snap.unmatched.sam | sed 's/\(.*\)\..*/\1/').fastq END_SUBTRACTION=$(date +%s) diff_SUBTRACTION=$(( END_SUBTRACTION - START_SUBTRACTION )) - rm $subtracted_output_file.*.sam - echo -e "$(date)\t$scriptname\tSubtraction took $diff_SUBTRACTION seconds" | tee -a timing.$basef.log -fi -######dropcache?############# -freemem=$(free -g | awk '{print $4}' | head -n 2 | tail -1) -echo -e "$(date)\t$scriptname\tThere is $freemem GB available free memory...[cutoff=$cache_reset GB]" -if [[ $dropcache == "Y" ]] -then - if [ "$freemem" -lt "$cache_reset" ] - then - echo -e "$(date)\t$scriptname\tClearing cache..." 
- dropcache - fi + # rm $subtracted_output_file.*.sam + log "Subtraction took $diff_SUBTRACTION seconds" | tee -a timing.$basef.log + echo "human_mapping_complete" >> "$basef.checkpoint" fi + ############################# SNAP TO NT ############################## if [ "$alignment" != "skip" ] then - if [ ! -f $basef.NT.snap.sam ] + if [ -f $basef.NT.snap.sam ] # Cache SNAP NT hits. then - echo -e "$(date)\t$scriptname\t####### SNAP UNMATCHED SEQUENCES TO NT ######" - echo -e -n "$(date)\t$scriptname\tCalculating number of sequences to analyze using SNAP to NT: " - echo $(awk 'NR%4==1' "$basef_h".human.snap.unmatched.fastq | wc -l) - echo -e "$(date)\t$scriptname\tStarting: Mapping by SNAP to NT from $basef_h.human.snap.unmatched.fastq" + log "Using cached $basef.NT.snap.sam" + else + log "####### SNAP UNMATCHED SEQUENCES TO NT ######" + num_seq_snap_nt = $(awk 'NR%4==1' "$basef_h".human.snap.unmatched.fastq | wc -l) + log "Calculating number of sequences to analyze using SNAP to NT: $num_seq_snap_nt" + log "Starting: Mapping by SNAP to NT from $basef_h.human.snap.unmatched.fastq" START_SNAPNT=$(date +%s) # SNAP to NT for unmatched reads (d value threshold cutoff = 12) @@ -849,8 +822,8 @@ then then if [ $snap_integrator = "inline" ] then - echo -e "$(date)\t$scriptname\tParameters: snap_nt.sh $basef_h.human.snap.unmatched.fastq ${SNAP_COMPREHENSIVE_db_dir} $cores $cache_reset $d_NT_alignment $snap" - snap_nt.sh "$basef_h.human.snap.unmatched.fastq" "${SNAP_COMPREHENSIVE_db_dir}" "$cores" "$cache_reset" "$d_NT_alignment" "$snap" + log "Parameters: snap_nt.sh $basef_h.human.snap.unmatched.fastq ${SNAP_COMPREHENSIVE_db_dir} $cores $d_NT_alignment $snap" + snap_nt.sh "$basef_h.human.snap.unmatched.fastq" "${SNAP_COMPREHENSIVE_db_dir}" "$cores" "$d_NT_alignment" "$snap" elif [ $snap_integrator = "end" ] then if [ "$snap_nt_procedure" = "AWS_master_slave" ] @@ -865,38 +838,40 @@ then sleep 2 done echo - echo -e "$(date)\t$scriptname\tParameters: snap_on_slave.sh 
$basef_h.human.snap.unmatched.fastq $pemkey $file_with_slave_ips $incoming_dir ${basef}.NT.snap.sam $d_NT_alignment" + log "Parameters: snap_on_slave.sh $basef_h.human.snap.unmatched.fastq $pemkey $file_with_slave_ips $incoming_dir ${basef}.NT.snap.sam $d_NT_alignment" snap_on_slave.sh "$basef_h.human.snap.unmatched.fastq" "$pemkey" "$file_with_slave_ips" "$incoming_dir" "${basef}.NT.snap.sam" "$d_human"> "$basef.AWS.log" 2>&1 elif [ "$snap_nt_procedure" = "solo" ] then - echo -e "$(date)\t$scriptname\tParameters: snap_nt_combine.sh $basef_h.human.snap.unmatched.fastq ${SNAP_COMPREHENSIVE_db_dir} $cores $cache_reset $d_NT_alignment $num_simultaneous_SNAP_runs" - snap_nt_combine.sh "$basef_h.human.snap.unmatched.fastq" "${SNAP_COMPREHENSIVE_db_dir}" "$cores" "$cache_reset" "$d_NT_alignment" "$num_simultaneous_SNAP_runs" + log "Parameters: snap_nt_combine.sh $basef_h.human.snap.unmatched.fastq ${SNAP_COMPREHENSIVE_db_dir} $cores $d_NT_alignment $num_simultaneous_SNAP_runs" + snap_nt_combine.sh "$basef_h.human.snap.unmatched.fastq" "${SNAP_COMPREHENSIVE_db_dir}" "$cores" "$d_NT_alignment" "$num_simultaneous_SNAP_runs" fi fi elif [ $run_mode = "Fast" ] then - echo -e "$(date)\t$scriptname\tParameters: snap_nt.sh $basef_h.human.snap.unmatched.fastq ${SNAP_FAST_db_dir} $cores $cache_reset $d_NT_alignment $snap" - snap_nt.sh "$basef_h.human.snap.unmatched.fastq" "${SNAP_FAST_db_dir}" "$cores" "$cache_reset" "$d_NT_alignment" "$snap" + log "Parameters: snap_nt.sh $basef_h.human.snap.unmatched.fastq ${SNAP_FAST_db_dir} $cores $d_NT_alignment $snap" + snap_nt.sh "$basef_h.human.snap.unmatched.fastq" "${SNAP_FAST_db_dir}" "$cores" "$d_NT_alignment" "$snap" fi - echo -e "$(date)\t$scriptname\tDone: SNAP to NT" + log "Done: SNAP to NT" END_SNAPNT=$(date +%s) diff_SNAPNT=$(( END_SNAPNT - START_SNAPNT )) - echo -e "$(date)\t$scriptname\tSNAP to NT took $diff_SNAPNT seconds." | tee -a timing.$basef.log + log "SNAP to NT took $diff_SNAPNT seconds." 
| tee -a timing.$basef.log mv -f "$basef_h.human.snap.unmatched.NT.sam" "$basef.NT.snap.sam" fi - echo -e "$(date)\t$scriptname\tStarting: parsing $basef.NT.snap.sam" - echo -e "$(date)\t$scriptname\textract matched/unmatched $basef.NT.snap.sam" - egrep -v "^@" $basef.NT.snap.sam | awk '{if($3 != "*") print }' > $basef.NT.snap.matched.sam - egrep -v "^@" $basef.NT.snap.sam | awk '{if($3 == "*") print }' > $basef.NT.snap.unmatched.sam - echo -e "$(date)\t$scriptname\tconvert sam to fastq from $basef.NT.snap.sam" - echo -e "$(date)\t$scriptname\tDone: parsing $basef.NT.snap.unmatched.sam" - if [ ! -f "$basef.NT.snap.matched.all.annotated" ] + + if [ ! -f "$basef.NT.snap.matched.fulllength.all.annotated.sorted" ] then + log "Starting: parsing $basef.NT.snap.sam" + log "extract matched/unmatched $basef.NT.snap.sam" + egrep -v "^@" $basef.NT.snap.sam | awk '{if($3 != "*") print }' > $basef.NT.snap.matched.sam + egrep -v "^@" $basef.NT.snap.sam | awk '{if($3 == "*") print }' > $basef.NT.snap.unmatched.sam + + log "convert sam to fastq from $basef.NT.snap.sam" + log "Done: parsing $basef.NT.snap.unmatched.sam" ## convert to FASTQ and retrieve full-length sequences - echo -e "$(date)\t$scriptname\tconvert to FASTQ and retrieve full-length sequences for SNAP NT matched hits" - echo -e "$(date)\t$scriptname\tParameters: extractHeaderFromFastq_ncores.sh $cores $basef.cutadapt.fastq $basef.NT.snap.matched.sam $basef.NT.snap.matched.fulllength.fastq $basef.NT.snap.unmatched.sam $basef.NT.snap.unmatched.fulllength.fastq" + log "convert to FASTQ and retrieve full-length sequences for SNAP NT matched hits" + log "Parameters: extractHeaderFromFastq_ncores.sh $cores $basef.cutadapt.fastq $basef.NT.snap.matched.sam $basef.NT.snap.matched.fulllength.fastq $basef.NT.snap.unmatched.sam $basef.NT.snap.unmatched.fulllength.fastq" extractHeaderFromFastq_ncores.sh "$cores" "$basef.cutadapt.fastq" "$basef.NT.snap.matched.sam" "$basef.NT.snap.matched.fulllength.fastq" 
"$basef.NT.snap.unmatched.sam" "$basef.NT.snap.unmatched.fulllength.fastq" #SNN140507 sort -k1,1 "$basef.NT.snap.matched.sam" > "$basef.NT.snap.matched.sorted.sam" cut -f1-9 "$basef.NT.snap.matched.sorted.sam" > "$basef.NT.snap.matched.sorted.sam.tmp1" @@ -904,72 +879,77 @@ then awk '(NR%4==1) {printf("%s\t",$0)} (NR%4==2) {printf("%s\t", $0)} (NR%4==0) {printf("%s\n",$0)}' "$basef.NT.snap.matched.fulllength.fastq" | sort -k1,1 | awk '{print $2 "\t" $3}' > "$basef.NT.snap.matched.fulllength.sequence.txt" #SNN140507 change this to bring in quality lines as well paste "$basef.NT.snap.matched.sorted.sam.tmp1" "$basef.NT.snap.matched.fulllength.sequence.txt" "$basef.NT.snap.matched.sorted.sam.tmp2" > "$basef.NT.snap.matched.fulllength.sam" ###retrieve taxonomy matched to NT ### - echo -e "$(date)\t$scriptname\ttaxonomy retrieval for $basef.NT.snap.matched.fulllength.sam" - echo -e "$(date)\t$scriptname\tParameters: taxonomy_lookup.pl $basef.NT.snap.matched.fulllength.sam sam nucl $cores $taxonomy_db_directory" + log "taxonomy retrieval for $basef.NT.snap.matched.fulllength.sam" + log "Parameters: taxonomy_lookup.pl $basef.NT.snap.matched.fulllength.sam sam nucl $cores $taxonomy_db_directory" taxonomy_lookup.pl "$basef.NT.snap.matched.fulllength.sam" sam nucl $cores $taxonomy_db_directory sort -k 13.7n "$basef.NT.snap.matched.fulllength.all.annotated" > "$basef.NT.snap.matched.fulllength.all.annotated.sorted" # sam format is no longer disturbed rm -f "$basef.NT.snap.matched.fulllength.gi" "$basef.NT.snap.matched.fullength.gi.taxonomy" fi -# adjust filenames for FAST mode - grep "Viruses;" "$basef.NT.snap.matched.fulllength.all.annotated.sorted" > "$basef.NT.snap.matched.fl.Viruses.annotated" - grep "Bacteria;" "$basef.NT.snap.matched.fulllength.all.annotated.sorted" > "$basef.NT.snap.matched.fl.Bacteria.annotated" - - ##SNN140507 cleanup bacterial reads - echo -e "$(date)\t$scriptname\tParameters: ribo_snap_bac_euk.sh $basef.NT.snap.matched.fl.Bacteria.annotated BAC 
$cores $ribo_snap_bac_euk_directory" - ribo_snap_bac_euk.sh $basef.NT.snap.matched.fl.Bacteria.annotated BAC $cores $ribo_snap_bac_euk_directory #SNN140507 - if [ $run_mode = "Comprehensive" ] - then - grep "Primates;" "$basef.NT.snap.matched.fulllength.all.annotated.sorted" > "$basef.NT.snap.matched.fl.Primates.annotated" - grep -v "Primates" "$basef.NT.snap.matched.fulllength.all.annotated.sorted" | grep "Mammalia" > "$basef.NT.snap.matched.fl.nonPrimMammal.annotated" - grep -v "Mammalia" "$basef.NT.snap.matched.fulllength.all.annotated.sorted" | grep "Chordata" > "$basef.NT.snap.matched.fl.nonMammalChordat.annotated" - grep -v "Chordata" "$basef.NT.snap.matched.fulllength.all.annotated.sorted" | grep "Eukaryota" > "$basef.NT.snap.matched.fl.nonChordatEuk.annotated" - ribo_snap_bac_euk.sh $basef.NT.snap.matched.fl.nonChordatEuk.annotated EUK $cores $ribo_snap_bac_euk_directory - fi - echo -e "$(date)\t$scriptname\tDone taxonomy retrieval" - echo -e "$(date)\t$scriptname\tParameters: table_generator.sh $basef.NT.snap.matched.fl.Viruses.annotated SNAP Y Y Y Y>& $basef.table_generator_snap.matched.fl.log" - table_generator.sh "$basef.NT.snap.matched.fl.Viruses.annotated" SNAP Y Y Y Y>& "$basef.table_generator_snap.matched.fl.log" - if [ $run_mode = "Comprehensive" ] - then - ### convert to FASTQ and retrieve full-length sequences to add to unmatched SNAP for viral RAPSearch### - egrep -v "^@" "$basef.NT.snap.matched.fl.Viruses.annotated" | awk '{if($3 != "*") print "@"$1"\n"$10"\n""+"$1"\n"$11}' > $(echo "$basef.NT.snap.matched.fl.Viruses.annotated" | sed 's/\(.*\)\..*/\1/').fastq - echo -e "$(date)\t$scriptname\tDone: convert to FASTQ and retrieve full-length sequences for SNAP NT hits " - fi - echo -e "$(date)\t$scriptname\t############# SORTING unmatched to NT BY LENGTH AND UNIQ AND LOOKUP ORIGINAL SEQUENCES #################" - if [ $run_mode = "Comprehensive" ] - then - #SNN 140507 extractHeaderFromFastq.csh "$basef.NT.snap.unmatched.fastq" FASTQ 
"$basef.cutadapt.fastq" "$basef.NT.snap.unmatched.fulllength.fastq" - sed "n;n;n;d" "$basef.NT.snap.unmatched.fulllength.fastq" | sed "n;n;d" | sed "s/^@/>/g" > "$basef.NT.snap.unmatched.fulllength.fasta" - fi - cat "$basef.NT.snap.unmatched.fulllength.fasta" | perl -e 'while (<>) {$h=$_; $s=<>; $seqs{$h}=$s;} foreach $header (reverse sort {length($seqs{$a}) <=> length($seqs{$b})} keys %seqs) {print $header.$seqs{$header}}' > $basef.NT.snap.unmatched.fulllength.sorted.fasta - if [ $run_mode = "Comprehensive" ] - then - echo -e "$(date)\t$scriptname\twe will be using 50 as the length of the cropped read for removing unique and low-complexity reads" - echo -e "$(date)\t$scriptname\tParameters: crop_reads.csh $basef.NT.snap.unmatched.fulllength.sorted.fasta 25 50 > $basef.NT.snap.unmatched.fulllength.sorted.cropped.fasta" - crop_reads.csh "$basef.NT.snap.unmatched.fulllength.sorted.fasta" 25 50 > "$basef.NT.snap.unmatched.fulllength.sorted.cropped.fasta" - echo -e "$(date)\t$scriptname\t*** reads cropped ***" - echo -e "$(date)\t$scriptname\tParameters: gt sequniq -seqit -force -o $basef.NT.snap.unmatched.fulllength.sorted.cropped.uniq.fasta $basef.NT.snap.unmatched.fulllength.sorted.cropped.fasta" - gt sequniq -seqit -force -o "$basef.NT.snap.unmatched.fulllength.sorted.cropped.uniq.fasta" "$basef.NT.snap.unmatched.fulllength.sorted.cropped.fasta" - echo -e "$(date)\t$scriptname\tParameters: extractAlltoFast.sh $basef.NT.snap.unmatched.fulllength.sorted.cropped.uniq.fasta FASTA $basef.NT.snap.unmatched.fulllength.fasta FASTA $basef.NT.snap.unmatched.uniq.fl.fasta FASTA" - extractAlltoFast.sh "$basef.NT.snap.unmatched.fulllength.sorted.cropped.uniq.fasta" FASTA "$basef.NT.snap.unmatched.fulllength.fasta" FASTA "$basef.NT.snap.unmatched.uniq.fl.fasta" FASTA #SNN140507 - fi - echo -e "$(date)\t$scriptname\tDone uniquing full length sequences of unmatched to NT" + + if [ ! 
-f "$basef.NT.snap.unmatched.uniq.fl.fasta" ] + then + # adjust filenames for FAST mode + grep "Viruses;" "$basef.NT.snap.matched.fulllength.all.annotated.sorted" > "$basef.NT.snap.matched.fl.Viruses.annotated" + grep "Bacteria;" "$basef.NT.snap.matched.fulllength.all.annotated.sorted" > "$basef.NT.snap.matched.fl.Bacteria.annotated" + + ##SNN140507 cleanup bacterial reads + log "Parameters: ribo_snap_bac_euk.sh $basef.NT.snap.matched.fl.Bacteria.annotated BAC $cores $ribo_snap_bac_euk_directory" + ribo_snap_bac_euk.sh $basef.NT.snap.matched.fl.Bacteria.annotated BAC $cores $ribo_snap_bac_euk_directory #SNN140507 + if [ $run_mode = "Comprehensive" ] + then + grep "Primates;" "$basef.NT.snap.matched.fulllength.all.annotated.sorted" > "$basef.NT.snap.matched.fl.Primates.annotated" + grep -v "Primates" "$basef.NT.snap.matched.fulllength.all.annotated.sorted" | grep "Mammalia" > "$basef.NT.snap.matched.fl.nonPrimMammal.annotated" + grep -v "Mammalia" "$basef.NT.snap.matched.fulllength.all.annotated.sorted" | grep "Chordata" > "$basef.NT.snap.matched.fl.nonMammalChordat.annotated" + grep -v "Chordata" "$basef.NT.snap.matched.fulllength.all.annotated.sorted" | grep "Eukaryota" > "$basef.NT.snap.matched.fl.nonChordatEuk.annotated" + ribo_snap_bac_euk.sh $basef.NT.snap.matched.fl.nonChordatEuk.annotated EUK $cores $ribo_snap_bac_euk_directory + fi + log "Done taxonomy retrieval" + log "Parameters: table_generator.sh $basef.NT.snap.matched.fl.Viruses.annotated SNAP Y Y Y Y>& $basef.table_generator_snap.matched.fl.log" + table_generator.sh "$basef.NT.snap.matched.fl.Viruses.annotated" SNAP Y Y Y Y>& "$basef.table_generator_snap.matched.fl.log" + if [ $run_mode = "Comprehensive" ] + then + ### convert to FASTQ and retrieve full-length sequences to add to unmatched SNAP for viral RAPSearch### + egrep -v "^@" "$basef.NT.snap.matched.fl.Viruses.annotated" | awk '{if($3 != "*") print "@"$1"\n"$10"\n""+"$1"\n"$11}' > $(echo "$basef.NT.snap.matched.fl.Viruses.annotated" | sed 
's/\(.*\)\..*/\1/').fastq + log "Done: convert to FASTQ and retrieve full-length sequences for SNAP NT hits " + fi + log "############# SORTING unmatched to NT BY LENGTH AND UNIQ AND LOOKUP ORIGINAL SEQUENCES #################" + if [ $run_mode = "Comprehensive" ] + then + #SNN 140507 extractHeaderFromFastq.csh "$basef.NT.snap.unmatched.fastq" FASTQ "$basef.cutadapt.fastq" "$basef.NT.snap.unmatched.fulllength.fastq" + sed "n;n;n;d" "$basef.NT.snap.unmatched.fulllength.fastq" | sed "n;n;d" | sed "s/^@/>/g" > "$basef.NT.snap.unmatched.fulllength.fasta" + fi + cat "$basef.NT.snap.unmatched.fulllength.fasta" | perl -e 'while (<>) {$h=$_; $s=<>; $seqs{$h}=$s;} foreach $header (reverse sort {length($seqs{$a}) <=> length($seqs{$b})} keys %seqs) {print $header.$seqs{$header}}' > $basef.NT.snap.unmatched.fulllength.sorted.fasta + if [ $run_mode = "Comprehensive" ] + then + log "we will be using 50 as the length of the cropped read for removing unique and low-complexity reads" + log "Parameters: crop_reads.csh $basef.NT.snap.unmatched.fulllength.sorted.fasta 25 50 > $basef.NT.snap.unmatched.fulllength.sorted.cropped.fasta" + crop_reads.csh "$basef.NT.snap.unmatched.fulllength.sorted.fasta" 25 50 > "$basef.NT.snap.unmatched.fulllength.sorted.cropped.fasta" + log "*** reads cropped ***" + log "Parameters: gt sequniq -seqit -force -o $basef.NT.snap.unmatched.fulllength.sorted.cropped.uniq.fasta $basef.NT.snap.unmatched.fulllength.sorted.cropped.fasta" + gt sequniq -seqit -force -o "$basef.NT.snap.unmatched.fulllength.sorted.cropped.uniq.fasta" "$basef.NT.snap.unmatched.fulllength.sorted.cropped.fasta" + log "Parameters: extractAlltoFast.sh $basef.NT.snap.unmatched.fulllength.sorted.cropped.uniq.fasta FASTA $basef.NT.snap.unmatched.fulllength.fasta FASTA $basef.NT.snap.unmatched.uniq.fl.fasta FASTA" + extractAlltoFast.sh "$basef.NT.snap.unmatched.fulllength.sorted.cropped.uniq.fasta" FASTA "$basef.NT.snap.unmatched.fulllength.fasta" FASTA "$basef.NT.snap.unmatched.uniq.fl.fasta" 
FASTA #SNN140507 + fi + log "Done uniquing full length sequences of unmatched to NT" + fi fi + ####################### DENOVO CONTIG ASSEMBLY ##### if [ $run_mode = "Comprehensive" ] then - echo -e "$(date)\t$scriptname\t######### Running ABYSS and Minimus #########" + log "######### Running ABYSS and Minimus #########" START_deNovo=$(date +%s) - echo -e "$(date)\t$scriptname\tAdding matched viruses to NT unmatched" + log "Adding matched viruses to NT unmatched" sed "n;n;n;d" "$basef.NT.snap.matched.fl.Viruses.fastq" | sed "n;n;d" | sed "s/^@/>/g" | sed 's/>/>Vir/g' > "$basef.NT.snap.matched.fl.Viruses.fasta" gt sequniq -seqit -force -o "$basef.NT.snap.matched.fl.Viruses.uniq.fasta" "$basef.NT.snap.matched.fl.Viruses.fasta" cat "$basef.NT.snap.unmatched.uniq.fl.fasta" "$basef.NT.snap.matched.fl.Viruses.uniq.fasta" > "$basef.NT.snap.unmatched_addVir_uniq.fasta" - echo -e "$(date)\t$scriptname\tStarting deNovo assembly" - echo -e "$(date)\t$scriptname\tParameters: abyss_minimus.sh $basef.NT.snap.unmatched_addVir_uniq.fasta $length $contigcutoff $cores $abysskmer $ignore_barcodes_for_de_novo" + log "Starting deNovo assembly" + log "Parameters: abyss_minimus.sh $basef.NT.snap.unmatched_addVir_uniq.fasta $length $contigcutoff $cores $abysskmer $ignore_barcodes_for_de_novo" abyss_minimus.sh "$basef.NT.snap.unmatched_addVir_uniq.fasta" "$length" "$contigcutoff" "$cores" "$abysskmer" "$ignore_barcodes_for_de_novo" - echo -e "$(date)\t$scriptname\tCompleted deNovo assembly: generated all.$basef.NT.snap.unmatched_addVir_uniq.fasta.unitigs.cut${length}.${contigcutoff}-mini.fa" + log "Completed deNovo assembly: generated all.$basef.NT.snap.unmatched_addVir_uniq.fasta.unitigs.cut${length}.${contigcutoff}-mini.fa" END_deNovo=$(date +%s) diff_deNovo=$(( END_deNovo - START_deNovo )) - echo -e "$(date)\t$scriptname\tdeNovo Assembly took $diff_deNovo seconds." | tee -a timing.$basef.log + log "deNovo Assembly took $diff_deNovo seconds." 
| tee -a timing.$basef.log fi #######RAPSearch##### #################### RAPSearch to Vir ########### @@ -979,59 +959,55 @@ then then if [ -f "$basef.NT.snap.unmatched.uniq.fl.fasta" ] then - echo -e "$(date)\t$scriptname\t############# RAPSearch to ${RAPSearch_VIRUS_db} ON NT-UNMATCHED SEQUENCES #################" - if [[ $dropcache == "Y" ]] - then - dropcache - fi - echo -e "$(date)\t$scriptname\tStarting: RAPSearch $basef.NT.snap.unmatched.uniq.fl.fasta " + log "############# RAPSearch to ${RAPSearch_VIRUS_db} ON NT-UNMATCHED SEQUENCES #################" + log "Starting: RAPSearch $basef.NT.snap.unmatched.uniq.fl.fasta " START14=$(date +%s) - echo -e "$(date)\t$scriptname\tParameters: rapsearch -q $basef.NT.snap.unmatched.uniq.fl.fasta -d $RAPSearch_VIRUS_db -o $basef.$rapsearch_database.RAPSearch.e1 -z $cores -e $ecutoff_Vir -v 1 -b 1 -t N >& $basef.$rapsearch_database.RAPSearch.log" + log "Parameters: rapsearch -q $basef.NT.snap.unmatched.uniq.fl.fasta -d $RAPSearch_VIRUS_db -o $basef.$rapsearch_database.RAPSearch.e1 -z $cores -e $ecutoff_Vir -v 1 -b 1 -t N >& $basef.$rapsearch_database.RAPSearch.log" rapsearch -q "$basef.NT.snap.unmatched.uniq.fl.fasta" -d $RAPSearch_VIRUS_db -o $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir} -z "$cores" -e "$ecutoff_Vir" -v 1 -b 1 -t N >& $basef.$rapsearch_database.RAPSearch.log - echo -e "$(date)\t$scriptname\tDone RAPSearch" + log "Done RAPSearch" END14=$(date +%s) diff=$(( END14 - START14 )) - echo -e "$(date)\t$scriptname\tRAPSearch to Vir Took $diff seconds" - echo -e "$(date)\t$scriptname\tStarting: add FASTA sequences to RAPSearch m8 output file " + log "RAPSearch to Vir Took $diff seconds" + log "Starting: add FASTA sequences to RAPSearch m8 output file " START15=$(date +%s) sed -i '/^#/d' $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8 seqtk subseq $basef.NT.snap.unmatched.uniq.fl.fasta $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8 > 
$basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta sed '/>/d' $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta > $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta.seq paste $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8 $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta.seq > $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.addseq.m8 - echo -e "$(date)\t$scriptname\tParameters: taxonomy_lookup.pl $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.addseq.m8 blast prot $cores $taxonomy_db_directory" + log "Parameters: taxonomy_lookup.pl $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.addseq.m8 blast prot $cores $taxonomy_db_directory" taxonomy_lookup.pl $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.addseq.m8 blast prot $cores $taxonomy_db_directory mv $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.addseq.all.annotated $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.annotated - echo -e "$(date)\t$scriptname\tParameters: table_generator.sh $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.annotated RAP N Y N N" + log "Parameters: table_generator.sh $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.annotated RAP N Y N N" table_generator.sh $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.annotated RAP N Y N N - echo -e "$(date)\t$scriptname\tDone: converting RAPSearch Vir output to fasta" + log "Done: converting RAPSearch Vir output to fasta" END15=$(date +%s) diff=$(( END15 - START15 )) - echo -e "$(date)\t$scriptname\tConverting RAPSearch Vir output to fasta sequences took $diff seconds." | tee -a timing.$basef.log + log "Converting RAPSearch Vir output to fasta sequences took $diff seconds." 
| tee -a timing.$basef.log cat "$basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta" "all.$basef.NT.snap.unmatched_addVir_uniq.fasta.unitigs.cut${length}.${contigcutoff}-mini.fa" > "$basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta" else - echo -e "$(date)\t$scriptname\tCannot run viral RAPSearch - necessary input file ($basef.$rapsearch_database.RAPSearch.e$ecutoff_Vir.m8) does not exist" - echo -e "$(date)\t$scriptname\tconcatenating RAPSearchvirus output and Contigs" + log "Cannot run viral RAPSearch - necessary input file ($basef.$rapsearch_database.RAPSearch.e$ecutoff_Vir.m8) does not exist" + log "concatenating RAPSearchvirus output and Contigs" fi - echo -e "$(date)\t$scriptname\t############# Cleanup RAPSearch Vir by RAPSearch to NR #################" + log "############# Cleanup RAPSearch Vir by RAPSearch to NR #################" if [ -f $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta ] then - echo -e "$(date)\t$scriptname\tStarting: RAPSearch to $RAPSearch_NR_db of $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta :" + log "Starting: RAPSearch to $RAPSearch_NR_db of $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta :" START16=$(date +%s) - echo -e "$(date)\t$scriptname\tParameters: rapsearch -q $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta -d $RAPSearch_NR_db -o $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR} -z $cores -e $ecutoff_NR -v 1 -b 1 -t N -a T" + log "Parameters: rapsearch -q $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta -d $RAPSearch_NR_db -o $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR} -z $cores -e $ecutoff_NR -v 1 -b 1 -t N -a T" rapsearch -q 
$basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta -d $RAPSearch_NR_db -o $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR} -z $cores -e $ecutoff_NR -v 1 -b 1 -t N -a T - echo -e "$(date)\t$scriptname\trapsearch to nr done" + log "rapsearch to nr done" sed -i '/^#/d' $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.m8 - echo -e "$(date)\t$scriptname\tremoved extra #" + log "removed extra #" END16=$(date +%s) diff=$(( END16 - START16 )) - echo -e "$(date)\t$scriptname\tRAPSearch to NR took $diff seconds." | tee -a timing.$basef.log - echo -e "$(date)\t$scriptname\tStarting: Seq retrieval and Taxonomy $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}" + log "RAPSearch to NR took $diff seconds." | tee -a timing.$basef.log + log "Starting: Seq retrieval and Taxonomy $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}" START17=$(date +%s) seqtk subseq $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta \ $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.m8 > \ $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.m8.fasta - echo -e "$(date)\t$scriptname\tretrieved sequences" + log "retrieved sequences" cat $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.m8.fasta | \ awk '{if (substr($0,1,1)==">"){if (p){print "\n";} print $0} else printf("%s",$0);p++;}END{print "\n"}' | \ sed '/^$/d' | \ @@ -1039,30 +1015,30 @@ then paste $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e$ecutoff_Vir.NR.e${ecutoff_NR}.m8 \ $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.m8.fasta.seq > \ 
$basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.addseq.m8 - echo -e "$(date)\t$scriptname\tmade addseq file" - echo -e "$(date)\t$scriptname\t############# RAPSearch Taxonomy" - echo -e "$(date)\t$scriptname\tParameters: taxonomy_lookup.pl $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.addseq.m8 blast prot $cores $taxonomy_db_directory" + log "made addseq file" + log "############# RAPSearch Taxonomy" + log "Parameters: taxonomy_lookup.pl $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.addseq.m8 blast prot $cores $taxonomy_db_directory" taxonomy_lookup.pl $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.addseq.m8 blast prot $cores $taxonomy_db_directory cp $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.addseq.all.annotated $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.annotated - echo -e "$(date)\t$scriptname\tretrieved taxonomy" + log "retrieved taxonomy" grep "Viruses" $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.annotated > $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.Viruses.annotated egrep "^contig" $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.annotated > $basef.Contigs.NR.RAPSearch.e${ecutoff_NR}.annotated - echo -e "$(date)\t$scriptname\textracted RAPSearch taxonomy" - echo -e "$(date)\t$scriptname\tStarting Readcount table" - echo -e "$(date)\t$scriptname\tParameters: table_generator.sh $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.Viruses.annotated RAP Y Y Y Y" + log "extracted RAPSearch taxonomy" + log "Starting Readcount table" + log "Parameters: table_generator.sh 
$basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.Viruses.annotated RAP Y Y Y Y" table_generator.sh $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.Viruses.annotated RAP Y Y Y Y - echo -e "$(date)\t$scriptname\tParameters: table_generator.sh $basef.Contigs.NR.RAPSearch.e${ecutoff_NR}.annotated RAP Y Y Y Y" + log "Parameters: table_generator.sh $basef.Contigs.NR.RAPSearch.e${ecutoff_NR}.annotated RAP Y Y Y Y" table_generator.sh $basef.Contigs.NR.RAPSearch.e${ecutoff_NR}.annotated RAP Y Y Y Y #allow contigs to be incorporated into coverage maps by making contig barcodes the same as non-contig barcodes (removing the @) sed 's/@//g' $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.Viruses.annotated > $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.Viruses.annotated.bar.inc - echo -e "$(date)\t$scriptname\tmaking coverage maps" + log "making coverage maps" # coverage_generator_bp.sh (divides each fasta file into $cores cores then runs BLASTn using one core each. 
- echo -e "$(date)\t$scriptname\tParameters: coverage_generator_bp.sh $basef.NT.snap.matched.fl.Viruses.annotated $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.Viruses.annotated.bar.inc $eBLASTn $cores 10 1 $basef" + log "Parameters: coverage_generator_bp.sh $basef.NT.snap.matched.fl.Viruses.annotated $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.Viruses.annotated.bar.inc $eBLASTn $cores 10 1 $basef" coverage_generator_bp.sh $basef.NT.snap.matched.fl.Viruses.annotated $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.Viruses.annotated.bar.inc $eBLASTn $cores 10 1 $basef awk '{print$1}' $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.annotated > $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.annotated.header awk '{print$1}' $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.annotated > $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.annotated.header - # find headers in viral rapsearch that are no longer found in rapsearch to nr + # find headers in viral rapsearch that are no longer found in rapsearch to nr sort $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.annotated.header \ $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.annotated.header | \ uniq -d | \ @@ -1077,38 +1053,38 @@ then cat $basef.not.in.NR.[a-z][a-z][a-z][a-z][a-z][a-z].annotated > $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.annotated.not.in.NR.annotated rm -r $basef.not.in.NR.[a-z][a-z][a-z][a-z][a-z][a-z] rm -r $basef.not.in.NR.[a-z][a-z][a-z][a-z][a-z][a-z].annotated - echo -e "$(date)\t$scriptname\tParameters: table_generator.sh $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.annotated.not.in.NR.annotated RAP N Y N N" + log "Parameters: table_generator.sh 
$basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.annotated.not.in.NR.annotated RAP N Y N N" table_generator.sh $basef.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.annotated.not.in.NR.annotated RAP N Y N N END17=$(date +%s) diff=$(( END17 - START17 )) - echo -e "$(date)\t$scriptname\tRAPSearch seq retrieval, taxonomy and readcount took $diff seconds" | tee -a timing.$basef.log + log "RAPSearch seq retrieval, taxonomy and readcount took $diff seconds" | tee -a timing.$basef.log else - echo -e "$(date)\t$scriptname\tCannot run RAPSearch to NR - necessary input file ($basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta) does not exist" + log "Cannot run RAPSearch to NR - necessary input file ($basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.m8.fasta) does not exist" fi fi ##################RAPSearch to NR ####### if [ "$rapsearch_database" == "NR" ] then - echo -e "$(date)\t$scriptname\t#################### RAPSearch to NR ###########" + log "#################### RAPSearch to NR ###########" cat "$basef.NT.snap.unmatched.uniq.fl.fasta" "all.$basef.NT.snap.unmatched_addVir_uniq.fasta.unitigs.cut${length}.${contigcutoff}-mini.fa" > "$basef.Contigs.NT.snap.unmatched.uniq.fl.fasta" if [ -f $basef.Contigs.NT.snap.unmatched.uniq.fl.fasta ] then - echo -e "$(date)\t$scriptname\t############# RAPSearch to NR #################" - echo -e "$(date)\t$scriptname\tStarting: RAPSearch to NR $basef.Contigs.NT.snap.unmatched.uniq.fl.fasta" + log "############# RAPSearch to NR #################" + log "Starting: RAPSearch to NR $basef.Contigs.NT.snap.unmatched.uniq.fl.fasta" START16=$(date +%s) - echo -e "$(date)\t$scriptname\tParameters:rapsearch -q $basef.Contigs.NT.snap.unmatched.uniq.fl.fasta -d $RAPSearch_NR_db -o $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR} -z $cores -e $ecutoff_NR -v 1 -b 1 -t N -a $RAPSearch_NR_fast_mode" + log "Parameters:rapsearch -q 
$basef.Contigs.NT.snap.unmatched.uniq.fl.fasta -d $RAPSearch_NR_db -o $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR} -z $cores -e $ecutoff_NR -v 1 -b 1 -t N -a $RAPSearch_NR_fast_mode" rapsearch -q "$basef.Contigs.NT.snap.unmatched.uniq.fl.fasta" -d "$RAPSearch_NR_db" -o "$basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}" -z $cores -e $ecutoff_NR -v 1 -b 1 -t N -a $RAPSearch_NR_fast_mode - echo -e "$(date)\t$scriptname\tRAPSearch to NR done" + log "RAPSearch to NR done" sed -i '/^#/d' $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.m8 - echo -e "$(date)\t$scriptname\tremoved extra #" + log "removed extra #" END16=$(date +%s) diff=$(( END16 - START16 )) - echo -e "$(date)\t$scriptname\tRAPSearch to NR took $diff seconds" | tee -a timing.$basef.log - echo -e "$(date)\t$scriptname\tStarting: Seq retrieval and Taxonomy $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}" + log "RAPSearch to NR took $diff seconds" | tee -a timing.$basef.log + log "Starting: Seq retrieval and Taxonomy $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}" START17=$(date +%s) seqtk subseq $basef.Contigs.NT.snap.unmatched.uniq.fl.fasta $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.m8 > $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.m8.fasta - echo -e "$(date)\t$scriptname\tretrieved sequences" + log "retrieved sequences" cat $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.m8.fasta | \ awk '{if (substr($0,1,1)==">"){if (p){print "\n";} print $0} else printf("%s",$0);p++;}END{print "\n"}' | \ sed '/^$/d' | \ @@ -1118,40 +1094,36 @@ then paste $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.m8 \ $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.m8.fasta.seq > \ 
$basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.addseq.m8 - echo -e "$(date)\t$scriptname\tmade addseq file" - echo -e "$(date)\t$scriptname\t############# RAPSearch Taxonomy" - echo -e "$(date)\t$scriptname\tParameters: taxonomy_lookup.pl $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.addseq.m8 blast prot $cores $taxonomy_db_directory" + log "made addseq file" + log "############# RAPSearch Taxonomy" + log "Parameters: taxonomy_lookup.pl $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.addseq.m8 blast prot $cores $taxonomy_db_directory" taxonomy_lookup.pl $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.addseq.m8 blast prot $cores $taxonomy_db_directory cp $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.addseq.all.annotated $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.annotated - echo -e "$(date)\t$scriptname\tretrieved taxonomy" + log "retrieved taxonomy" grep "Viruses" $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.annotated > $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.Viruses.annotated - egrep "^contig" $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.annotated > $basef.Contigs.$rapsearch_database.RAPSearch.e${ecutoff_NR}.annotated - echo -e "$(date)\t$scriptname\textracted RAPSearch taxonomy" - echo -e "$(date)\t$scriptname\tStarting Readcount table" - echo -e "$(date)\t$scriptname\tParameters: table_generator.sh $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.Viruses.annotated RAP Y Y Y Y" + egrep "^contig" $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.annotated > $basef.Contigs.$rapsearch_database.RAPSearch.e${ecutoff_NR}.annotated + log "extracted RAPSearch taxonomy" + log "Starting Readcount table" + log "Parameters: table_generator.sh 
$basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.Viruses.annotated RAP Y Y Y Y" table_generator.sh $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.Viruses.annotated RAP Y Y Y Y grep -v Viruses $basef.Contigs.$rapsearch_database.RAPSearch.e${ecutoff_NR}.annotated > $basef.Contigs.$rapsearch_database.RAPSearch.e${ecutoff_NR}.noVir.annotated - echo -e "$(date)\t$scriptname\tParameters: table_generator.sh $basef.Contigs.$rapsearch_database.RAPSearch.e${ecutoff_NR}.noVir.annotated RAP N Y N N" + log "Parameters: table_generator.sh $basef.Contigs.$rapsearch_database.RAPSearch.e${ecutoff_NR}.noVir.annotated RAP N Y N N" table_generator.sh $basef.Contigs.$rapsearch_database.RAPSearch.e${ecutoff_NR}.noVir.annotated RAP N Y N N sed 's/@//g' $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.Viruses.annotated > $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.Viruses.annotated.bar.inc - echo -e "$(date)\t$scriptname\tmaking coverage maps" - echo -e "$(date)\t$scriptname\tParameters: coverage_generator_bp.sh $basef.NT.snap.matched.fl.Viruses.annotated $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.Viruses.annotated.bar.inc $eBLASTn 10 10 1 $basef" + log "making coverage maps" + log "Parameters: coverage_generator_bp.sh $basef.NT.snap.matched.fl.Viruses.annotated $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.Viruses.annotated.bar.inc $eBLASTn 10 10 1 $basef" coverage_generator_bp.sh $basef.NT.snap.matched.fl.Viruses.annotated $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_NR}.Viruses.annotated.bar.inc $eBLASTn 10 10 1 $basef END17=$(date +%s) diff=$(( END17 - START17 )) - echo -e "$(date)\t$scriptname\tRAPSearch seq retrieval, taxonomy and table readcount and coverage took $diff seconds." 
| tee -a timing.$basef.log + log "RAPSearch seq retrieval, taxonomy and table readcount and coverage took $diff seconds." | tee -a timing.$basef.log else - echo -e "$(date)\t$scriptname\tCannot run RAPSearch to NR - necessary input file ($basef.Contigs.NT.snap.unmatched.uniq.fl.fasta) does not exist" + log "Cannot run RAPSearch to NR - necessary input file ($basef.Contigs.NT.snap.unmatched.uniq.fl.fasta) does not exist" fi fi - if [[ $dropcache == "Y" ]] - then - dropcache - fi fi ############################# OUTPUT FINAL COUNTS ############################# -echo -e "$(date)\t$scriptname\tStarting: generating readcounts.$basef.log report" +log "Starting: generating readcounts.$basef.log report" START_READCOUNT=$(date +%s) headerid_top=$(head -1 $basef.fastq | cut -c1-4) @@ -1162,8 +1134,8 @@ if [ "$headerid_top" == "$headerid_bottom" ] # We should adjust this code to check that all headers are unique, rather than just the first and last then headerid=$(head -1 $basef.fastq | cut -c1-4 | sed 's/@//g') - echo -e "$(date)\t$scriptname\theaderid_top $headerid_top = headerid_bottom $headerid_bottom and headerid = $headerid" - echo -e "$(date)\t$scriptname\tParameters: readcount.sh $basef $headerid Y $basef.fastq $basef.preprocessed.fastq $basef.preprocessed.s20.h250n25d12xfu.human.snap.unmatched.fastq $basef.NT.snap.matched.fulllength.all.annotated.sorted $basef.NT.snap.matched.fl.Viruses.annotated $basef.NT.snap.matched.fl.Bacteria.annotated $basef.NT.snap.matched.fl.nonChordatEuk.annotated $basef.NT.snap.unmatched.sam $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.Viruses.annotated" + log "headerid_top $headerid_top = headerid_bottom $headerid_bottom and headerid = $headerid" + log "Parameters: readcount.sh $basef $headerid Y $basef.fastq $basef.preprocessed.fastq $basef.preprocessed.s20.h250n25d12xfu.human.snap.unmatched.fastq $basef.NT.snap.matched.fulllength.all.annotated.sorted 
$basef.NT.snap.matched.fl.Viruses.annotated $basef.NT.snap.matched.fl.Bacteria.annotated $basef.NT.snap.matched.fl.nonChordatEuk.annotated $basef.NT.snap.unmatched.sam $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.Viruses.annotated" readcount.sh $basef $headerid Y $basef.fastq \ $basef.preprocessed.fastq \ $basef.preprocessed.s20.h250n25d12xfu.human.snap.unmatched.fastq \ @@ -1173,18 +1145,18 @@ then $basef.NT.snap.matched.fl.nonChordatEuk.annotated \ $basef.NT.snap.unmatched.sam \ $basef.Contigs.and.NTunmatched.$rapsearch_database.RAPSearch.e${ecutoff_Vir}.NR.e${ecutoff_NR}.Viruses.annotated - echo -e "$(date)\t$scriptname\tDone: generating readcounts.$basef.log report" + log "Done: generating readcounts.$basef.log report" END_READCOUNT=$(date +%s) diff_READCOUNT=$(( END_READCOUNT - START_READCOUNT )) - echo -e "$(date)\t$scriptname\tGenerating read count report Took $diff_READCOUNT seconds" | tee -a timing.$basef.log + log "Generating read count report Took $diff_READCOUNT seconds" | tee -a timing.$basef.log else - echo -e "$(date)\t$scriptname\treadcount.sh aborted due to non-unique header id" + log "readcount.sh aborted due to non-unique header id" fi -echo -e "$(date)\t$scriptname\t#################### SURPI PIPELINE COMPLETE ##################" +log "#################### SURPI PIPELINE COMPLETE ##################" END_PIPELINE=$(date +%s) diff_PIPELINE=$(( END_PIPELINE - START_PIPELINE )) -echo -e "$(date)\t$scriptname\tTotal run time of pipeline took $diff_PIPELINE seconds" | tee -a timing.$basef.log +log "Total run time of pipeline took $diff_PIPELINE seconds" | tee -a timing.$basef.log echo "Script and Parameters = $0 $@ " > $basef.pipeline_parameters.log echo "Raw Read quality = $quality" >> $basef.pipeline_parameters.log diff --git a/abyss_minimus.sh b/abyss_minimus.sh index ed14d85..99d62d7 100755 --- a/abyss_minimus.sh +++ b/abyss_minimus.sh @@ -15,8 +15,9 @@ # Copyright (C) 2014 Samia N Naccache - All 
Rights Reserved # SURPI has been released under a modified BSD license. # Please see license file for details. - scriptname=${0##*/} +source debug.sh +source logging.sh if [ $# -lt 5 ] then @@ -34,7 +35,7 @@ ignore_barcodes=$6 split_FASTA_size=100000 #### create list of barcodes in the fasta file in the following format: #barcode -echo -e "$(date)\t$scriptname\tDemultiplexing barcodes..." +log "Demultiplexing barcodes..." START_DEMULTIPLEX=$(date +%s) #The sed '/^#$/d' removes a blank #. This is needed in cases where barcodes are present along with reads that have no barcodes @@ -54,10 +55,10 @@ else #$inputfile.barcodes doesn't exist, or size=0, then we only have 1 barcode, echo "#" > $inputfile.barcodes fi -echo -e "$(date)\t$scriptname\tCompleted demultiplexing barcodes." +log "Completed demultiplexing barcodes." END_DEMULTIPLEX=$(date +%s) diff_DEMULTIPLEX=$(( END_DEMULTIPLEX - START_DEMULTIPLEX )) -echo -e "$(date)\t$scriptname\tBarcode demultiplex took $diff_DEMULTIPLEX s." +log "Barcode demultiplex took $diff_DEMULTIPLEX s." ### generate fasta file for every separate barcode (demultiplex) for f in `cat $inputfile.barcodes` ; do @@ -68,30 +69,33 @@ for f in `cat $inputfile.barcodes` ; do grep -E "$f(/|$)" $inputfile -A 1 --no-group-separator > bar$f.$inputfile fi # split demultiplexed fasta file into 100,000 read sub-fastas - echo -e "$(date)\t$scriptname\tSplitting FASTA into chunks of size: $split_FASTA_size." + log "Splitting FASTA into chunks of size: $split_FASTA_size." START_SPLIT=$(date +%s) cp bar$f.$inputfile bar$f.${inputfile}_n # So that the unsplit demultiplexed file is also denovo assembled # split_fasta.pl -i bar$f.$inputfile -o bar$f.$inputfile -n $split_FASTA_size - echo -e "$(date)\t$scriptname\tCompleted splitting FASTA file." + log "Completed splitting FASTA file." END_SPLIT=$(date +%s) diff_SPLIT=$(( $END_SPLIT - $START_SPLIT )) - echo -e "$(date)\t$scriptname\tSplitting FASTA took $diff_SPLIT s." 
- ### run abyss (deBruijn assembler) on each 100,000 read demultiplexed fasta file, including the unsplit demultiplexed file - echo -e "$(date)\t$scriptname\tRunning abyss on each $split_FASTA_size chunk..." + log "Splitting FASTA took $diff_SPLIT s." + ### run abyss (deBruijn assembler) on each 100,000 read demultiplexed fasta file, including the unsplit demultiplexed file + log "Running abyss on each $split_FASTA_size chunk..." START_ABYSS=$(date +%s) - for d in bar$f.${inputfile}_* ; do - abyss-pe k=$kmer name=$d.f se=$d np=$cores >& $d.abyss.log + for d in bar$f.${inputfile}_* ; do + echo $d + log "Command: abyss-pe k=$kmer name=$d.f se=$d np=$cores >& $d.abyss.log" + #abyss-pe k=$kmer name=$d.f se=$d np=$cores >& $d.abyss.log + abyss-pe k=$kmer name=$d.f se=$d >& $d.abyss.log done - echo -e "$(date)\t$scriptname\tCompleted running abyss on each $split_FASTA_size chunk." + log "Completed running abyss on each $split_FASTA_size chunk." END_ABYSS=$(date +%s) diff_ABYSS=$(( END_ABYSS - START_ABYSS )) - echo -e "$(date)\t$scriptname\tAbyss took $diff_ABYSS s." + log "Abyss took $diff_ABYSS s." 
### ### contigs from split files concatenated, after which reads smaller than the cutoff value are eliminated - echo -e "$(date)\t$scriptname\tStarting concatenating and cutoff of abyss output" + log "Starting concatenating and cutoff of abyss output" START_CATCONTIGS=$(date +%s) #concatenating contig files from different fasta splits, and adding kmer infromation and barcode information to contig headers @@ -99,47 +103,48 @@ for f in `cat $inputfile.barcodes` ; do # only contigs larger than $cutoff_post_Abyss are retained cat all.bar$f.$inputfile.unitigs.fa | awk 'NR%2==1 {x = $0} NR%2==0 { if (length($0) >= '$2') printf("%s\n%s\n",x,$0)}' > all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss.fa - echo -e "$(date)\t$scriptname\tDone concatenating and cutoff of abyss output" + log "Done concatenating and cutoff of abyss output" END_CATCONTIGS=$(date +%s) diff_CATCONTIGS=$(( END_CATCONTIGS - START_CATCONTIGS )) - echo -e "$(date)\t$scriptname\tConcatenating and cutoff of abyss output took $diff_CATCONTIGS s." + log "Concatenating and cutoff of abyss output took $diff_CATCONTIGS s." ### ### run Minimo (OLC assembler) - echo -e "$(date)\t$scriptname\tStarting Minimo..." + log "Starting Minimo..." START_MINIMO=$(date +%s) + log "Command: Minimo all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss.fa -D FASTA_EXP=1" Minimo all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss.fa -D FASTA_EXP=1 - echo -e "$(date)\t$scriptname\tCompleted Minimo." + log "Completed Minimo." END_MINIMO=$(date +%s) diff_MINIMO=$(( END_MINIMO - START_MINIMO )) - echo -e "$(date)\t$scriptname\tMinimo took $diff_MINIMO s." + log "Minimo took $diff_MINIMO s." 
########### - echo -e "$(date)\t$scriptname\tStarting cat barcode addition and cutoff of minimo output" + log "Starting cat barcode addition and cutoff of minimo output" START_MIN_PROCESS=$(date +%s) - # Minimo output gives more than one line per sequence, so here we linearize sequences (linearization protocol from here http://seqanswers.com/forums/showthread.php?t=27567 ) + # Minimo output gives more than one line per sequence, so here we linearize sequences (linearization protocol from here http://seqanswers.com/forums/showthread.php?t=27567 ) # then we add the relevant barcode to the end of the header contig. Contigs that survive untouched from abyss already have a barcode at the end of them, so that extra barcode is taken away - cat all-contigs.fa | awk '{if (substr($0,1,1)==">"){if (p){print "\n";} print $0} else printf("%s",$0);p++;}END{print "\n"}' | sed '/^$/d' | sed 's/ /_/g' | sed "s/$/"$f"/g;n" | sed "s/"$f""$f"/"$f"/g" | sed 's/#/#@/g' | sed 's/^>/>contig_/g' > all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss-minim.fa - # change generic name all-contigs.fa + cat all-contigs.fa | awk '{if (substr($0,1,1)==">"){if (p){print "\n";} print $0} else printf("%s",$0);p++;}END{print "\n"}' | sed '/^$/d' | sed 's/ /_/g' | sed "s/$/"$f"/g;n" | sed "s/"$f""$f"/"$f"/g" | sed 's/#/#@/g' | sed 's/^>/>contig_/g' > all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss-minim.fa + # change generic name all-contigs.fa mv all-contigs.fa all-contigs.fa.$f # only contigs larger than $cutoff_post_Minimo are retained cat all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss-minim.fa | awk 'NR%2==1 {x = $0} NR%2==0 { if (length($0) >= '$3') printf("%s\n%s\n",x,$0)}' > all.bar$f.$inputfile.unitigs.cut$cutoff_post_Abyss.${cutoff_post_Minimo}-mini.fa - echo -e "$(date)\t$scriptname\tDone cat barcode addition and cutoff of minimo output" + log "Done cat barcode addition and cutoff of minimo output" END_MIN_PROCESS=$(date +%s) diff_MIN_PROCESS=$(( END_MIN_PROCESS - START_MIN_PROCESS )) - 
echo -e "$(date)\t$scriptname\tcat barcode addition and cutoff of minimo output took $diff_MIN_PROCESS s" + log "cat barcode addition and cutoff of minimo output took $diff_MIN_PROCESS s" done ### ### concatenate deBruijn -> OLC contigs from all barcodes together -echo -e "$(date)\t$scriptname\tConcatenating all barcodes together..." +log "Concatenating all barcodes together..." START_CAT=$(date +%s) cat all.bar*.$inputfile.unitigs.cut$cutoff_post_Abyss.${cutoff_post_Minimo}-mini.fa > all.$inputfile.unitigs.cut$cutoff_post_Abyss.${cutoff_post_Minimo}-mini.fa -echo -e "$(date)\t$scriptname\tCompleted concatenating all barcodes." +log "Completed concatenating all barcodes." END_CAT=$(date +%s) diff_CAT=$(( END_CAT - START_CAT )) -echo -e "$(date)\t$scriptname\tBarcode concatenation took $diff_CAT s." +log "Barcode concatenation took $diff_CAT s." # cleaning up files by organizing directories, moving files into directories, and removing temporary files -mkdir $inputfile.dir +mkdir --parents $inputfile.dir mv bar*$inputfile*.fa $inputfile.dir if [ -e all.$inputfile.contigs.abyssmini.cut$cutoff_post_Abyss.$cutoff_post_Minimo.e1.NR.RAPSearch.m8 ] then @@ -148,7 +153,7 @@ fi rm -f all.$inputfile.contigs.abyssmini.cut$cutoff_post_Abyss.$cutoff_post_Minimo.e1.NR.RAPSearch.aln rm -f all.$inputfile.contigs.abyssmini.e1.NR.RAPSearch.m8.noheader rm -f all.$inputfile.contigs.abyssmini.e1.NR.RAPSearch.m8.noheader.seq -rm -f all.$inputfile.contigs.abyssmini.e1.NR.RAPSearch.m8.noheader +rm -f all.$inputfile.contigs.abyssmini.e1.NR.RAPSearch.m8.noheader rm -f all.$inputfile.contigs.abyssmini.e1.NR.RAPSearch.m8.noheader.ex.fa rm -f all.$inputfile.unitigs.cut$cutoff_post_Abyss-contigs.sortlen.seq rm -f all-contigs* diff --git a/compare_multiple_sam.py b/compare_multiple_sam.py index 1af36c8..ce151ed 100755 --- a/compare_multiple_sam.py +++ b/compare_multiple_sam.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # # compare_multiple_sam.py # @@ -9,7 +9,7 @@ # # Copyright (C) 
2014 Charles Y Chiu - All Rights Reserved # Permission to copy and modify is granted under the BSD license -# Last revised 3/21/2014 +# Last revised 3/21/2014 import sys @@ -61,7 +61,7 @@ lineList = [] for file in fileObjList: lineList.append(file.readline()) - + for file in fileObjList: file.close() outputFile.close() diff --git a/compare_sam.py b/compare_sam.py index 2bd65af..a8feb98 100755 --- a/compare_sam.py +++ b/compare_sam.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # # compare_sam.py # @@ -14,13 +14,13 @@ import os def logHeader(): - import os.path, sys, time - return "%s\t%s\t" % (time.strftime("%a %b %d %H:%M:%S %Z %Y"), os.path.basename(sys.argv[0])) + import os.path, sys, time + return "%s\t%s\t" % (time.strftime("%a %b %d %H:%M:%S %Z %Y"), os.path.basename(sys.argv[0])) usage = "compare_sam.py " if len(sys.argv) != 3: - print usage - sys.exit(0) + print usage + sys.exit(0) SAMfile1 = sys.argv[1] outputFile1 = sys.argv[2] @@ -41,63 +41,66 @@ def logHeader(): linenum += 1 while line1 != '': - data1 = line1.split() - - # check to see whether line is in the SAM header or alignment section - if (data1[0][0]!="@"): - #We are in the SAM alignment section - - edit_distance=int(data1[12].split(":")[2]) - - #using old snap (<=v0.15.4), could use length of line to determine if it was a hit or miss. - #new snap (>=v1.0) instead uses an edit distance of -1 to signify a miss. - - if (edit_distance >= 0): - #this is a hit - new_hits += 1 - sum_edistance_new += int(data1[12].split(":")[2]) - firstentry = data1[0].split("|") - if (len(firstentry)>=2): - # this has been previously flagged as a hit, and is also now a hit. Compare edit distances to find best hit. 
- edit_distance_previous=firstentry[1] - edit_distance_current = data1[12].split(":")[2] - - existing_hits += 1 - sum_edistance_existing += int(edit_distance_previous) - - if (int(edit_distance_current) <= int(edit_distance_previous)): - replacements += 1 - sum_data1 += int(edit_distance_current) - # need to pick the new gi, not the old gi, bug fix 1/27/13 - gi=data1[2].split("|")[1] - replacement_line = line1.replace("|" + firstentry[1] + "|" + firstentry[2],"|" + str(edit_distance_current) + "|" + str(gi),1) - outputFile.write(replacement_line) - else: - sum_data1 += int(edit_distance_previous) - outputFile.write(line1) - else: - edit_distance_current = data1[12].split(":")[2] - existing_hits += 1 - sum_data1 += int(edit_distance_current) - gi=data1[2].split("|")[1] - replacement_line = line1.replace(data1[0],data1[0] + "|" + str(edit_distance_current) + "|" + str(gi),1) - outputFile.write(replacement_line) - replacements+=1 - else: - #this is a miss - outputFile.write(line1) - firstentry = data1[0].split("|") - if (len(firstentry)>=2): - edit_distance_previous=firstentry[1] - existing_hits += 1 - sum_data1 += int(edit_distance_previous) - sum_edistance_existing += int(edit_distance_previous) - else: - #We are in the SAM header section - outputFile.write(line1) - - line1 = file1.readline() - linenum += 1 + data1 = line1.split() + + # check to see whether line is in the SAM header or alignment section + if (data1[0][0]!="@"): + #We are in the SAM alignment section + + edit_distance=int(data1[12].split(":")[2]) + mapped = not bool(int(data1[1]) & 0x4) + + #using old snap (<=v0.15.4), could use length of line to determine if it was a hit or miss. + #new snap (>=v1.0) instead uses an edit distance of -1 to signify a miss. + + # Not sure what above is saying. + # Use 0x4 mask in FLAG field to check for unmapped reads. 
+ if (mapped): + #this is a hit + new_hits += 1 + sum_edistance_new += edit_distance + firstentry = data1[0].split("|") + if (len(firstentry)>=2): + # this has been previously flagged as a hit, and is also now a hit. Compare edit distances to find best hit. + edit_distance_previous=firstentry[1] + edit_distance_current = data1[12].split(":")[2] + + existing_hits += 1 + sum_edistance_existing += int(edit_distance_previous) + + if (int(edit_distance_current) <= int(edit_distance_previous)): + replacements += 1 + sum_data1 += int(edit_distance_current) + # need to pick the new gi, not the old gi, bug fix 1/27/13 + gi=data1[2].split("|")[1] + replacement_line = line1.replace("|" + firstentry[1] + "|" + firstentry[2],"|" + str(edit_distance_current) + "|" + str(gi),1) + outputFile.write(replacement_line) + else: + sum_data1 += int(edit_distance_previous) + outputFile.write(line1) + else: + edit_distance_current = data1[12].split(":")[2] + existing_hits += 1 + sum_data1 += int(edit_distance_current) + gi=data1[2].split("|")[1] + replacement_line = line1.replace(data1[0],data1[0] + "|" + str(edit_distance_current) + "|" + str(gi),1) + outputFile.write(replacement_line) + replacements+=1 + else: + #this is a miss + outputFile.write(line1) + firstentry = data1[0].split("|") + if (len(firstentry)>=2): + edit_distance_previous=firstentry[1] + existing_hits += 1 + sum_data1 += int(edit_distance_previous) + sum_edistance_existing += int(edit_distance_previous) + else: + #We are in the SAM header section + outputFile.write(line1) + + line1 = file1.readline() + linenum += 1 file1.close() outputFile.close() diff --git a/coveragePlot.py b/coveragePlot.py index c95981e..0c48713 100755 --- a/coveragePlot.py +++ b/coveragePlot.py @@ -1,8 +1,8 @@ -#!/usr/bin/python +#!/usr/bin/env python # # coveragePlot.py # -# This program generates genomic coverage plots +# This program generates genomic coverage plots # Chiu Laboratory # University of California, San Francisco # January, 2014 @@ 
-47,7 +47,7 @@ def smart_truncate1(text, max_length=100, suffix='...'): # print "Installed version is: %s." % mpl_version -#load function is deprecated as of matplotlib v1.3.1, replaced with +#load function is deprecated as of matplotlib v1.3.1, replaced with if (LooseVersion(mpl_version) >= LooseVersion('1.3.1') ): data = np.loadtxt(dataFile) else: @@ -78,7 +78,7 @@ def smart_truncate1(text, max_length=100, suffix='...'): plot(data[:,0],data[:,1],color) xlabel("base position",fontsize=8) ylabel("fold coverage",fontsize=8) - title_text = sys.argv[2] + title_text = sys.argv[2] suptitle(title_text,fontsize=9) xMin, xMax, yMin, yMax = min(data[:,0]),max(data[:,0]),min(data[:,1]),max(data[:,1]) # add a 10% buffer to yMax @@ -105,7 +105,7 @@ def smart_truncate1(text, max_length=100, suffix='...'): plot(data[:,0],data[:,1],color) xlabel("base position",fontsize=8) ylabel("fold coverage",fontsize=8) - title_text = sys.argv[2] + title_text = sys.argv[2] suptitle(title_text,fontsize=9) xMin, xMax, yMin, yMax = min(data[:,0]),max(data[:,0]),min(data[:,1]),max(data[:,1]) yMax *= 1.1 @@ -122,7 +122,7 @@ def smart_truncate1(text, max_length=100, suffix='...'): plot(data[:,0],data[:,1],color) xlabel("base position",fontsize=8) ylabel("fold coverage",fontsize=8) - title_text = sys.argv[2] + title_text = sys.argv[2] suptitle(title_text,fontsize=9) xMin, xMax, yMin, yMax = min(data[:,0]),max(data[:,0]),min(data[:,1]),max(data[:,1]) yMax *= 1.1 diff --git a/coverage_generator_bp.sh b/coverage_generator_bp.sh index 9510a8d..6f13feb 100755 --- a/coverage_generator_bp.sh +++ b/coverage_generator_bp.sh @@ -93,7 +93,7 @@ do # create list of curated GIs $SNAP_file.$genus.gi.list.curatedgenome # First retain only gis that are complete genomes if no GIs with "complete genomes" are returned, we'll go to GIs "complete sequences", if no complete sequences, then we just go with the GIs we have - get_genbankfasta.pl -i bar.$bar.$SNAP_file.$genus.gi.list > bar.$bar.$SNAP_file.$genus.gi.headers + 
get_genbankfasta.pl -i bar.$bar.$SNAP_file.$genus.gi.list > bar.$bar.$SNAP_file.$genus.gi.headers egrep ">gi.*[Cc]omplete [Gg]enome" bar.$bar.$SNAP_file.$genus.gi.headers | awk -F "|" '{print$2}' | head -n $top_gis > bar.$bar.$SNAP_file.$genus.gi if [ -s bar.$bar.$SNAP_file.$genus.gi ] diff --git a/create_SURPI_data.sh b/create_SURPI_data.sh index 8e13c42..93a1cd9 100755 --- a/create_SURPI_data.sh +++ b/create_SURPI_data.sh @@ -15,7 +15,7 @@ # Copyright (C) 2014 Scot Federman - All Rights Reserved # SURPI has been released under a modified BSD license. # Please see license file for details. -# Last revised 7/7/2014 +# Last revised 7/7/2014 scriptname=${0##*/} green='\e[0;32m' @@ -23,8 +23,9 @@ red='\e[0;31m' endColor='\e[0m' DATE=$(date +%m%d%Y) -db_dir="NCBI_$DATE" -curated_dir="curated_$DATE" +echo $DATE > DATE.txt +db_dir="NCBI" +curated_dir="curated" cleanup_dir="rawdata" @@ -34,6 +35,7 @@ cleanup_dir="rawdata" #You should likely stick with the default values here. If changing them, then the corresponding #values will also need to be changed within the SURPI config file. reference_dir="reference" +subtraction_dir="Subtraction_SNAP" taxonomy_dir="taxonomy" RAPSearch_dir="RAPSearch" FAST_dir="FAST_SNAP" @@ -41,7 +43,7 @@ SNAP_nt_dir="COMP_SNAP" RiboClean_dir="RiboClean_SNAP" # SNAP_nt_chunks= # of chunks to split nt into, SNAP index will be created for each chunk. -# Currently, the minimum number of chunks is around 17-20 +# Currently, the minimum number of chunks is around 17-20 # SNAP 0.15.4 will not successfully index nt with less than 17 chunks, though this will vary a bit with each NT release. # I picked 20 here as a safe default. # This will likely have to be increased when using SNAP 1.0, which has different indexing characteristics, @@ -53,9 +55,6 @@ if [ ! -d "$reference_dir" ]; then mkdir "$reference_dir" fi -#set SNAP index Ofactor. See SNAP documentation for details. 
-Ofactor=1000 - #download NCBI data to $db_dir, curated data to $curated_dir echo -e "$(date)\t$scriptname\tdownload_SURPI_data.sh -d $db_dir -c $curated_dir" download_SURPI_data.sh -d "$db_dir" -c "$curated_dir" @@ -68,12 +67,10 @@ if [ ! -d "$reference_dir/$taxonomy_dir" ]; then fi create_taxonomy_db.sh -d "$db_dir" -mv gi_taxid_nucl.db "$reference_dir/$taxonomy_dir/gi_taxid_nucl_$DATE.db" -mv gi_taxid_prot.db "$reference_dir/$taxonomy_dir/gi_taxid_prot_$DATE.db" -mv names_nodes_scientific.db "$reference_dir/$taxonomy_dir/names_nodes_scientific_$DATE.db" -ln -s "gi_taxid_nucl_$DATE.db" "$reference_dir/$taxonomy_dir/gi_taxid_nucl.db" -ln -s "gi_taxid_prot_$DATE.db" "$reference_dir/$taxonomy_dir/gi_taxid_prot.db" -ln -s "names_nodes_scientific_$DATE.db" "$reference_dir/$taxonomy_dir/names_nodes_scientific.db" +mv gi_taxid_nucl.db "$reference_dir/$taxonomy_dir/gi_taxid_nucl.db" +mv gi_taxid_prot.db "$reference_dir/$taxonomy_dir/gi_taxid_prot.db" +mv names_nodes_scientific.db "$reference_dir/$taxonomy_dir/names_nodes_scientific.db" + # ##create RAPSearch nr db and move into $RAPSearch_dir # @@ -85,10 +82,10 @@ echo -e "$(date)\t$scriptname\tDecompressing nr..." pigz -dc -k "$db_dir/nr.gz" > nr echo -e "$(date)\t$scriptname\tStarting prerapsearch on nr..." -prerapsearch -d nr -n "rapsearch_nr_${DATE}_db_v2.12" +prerapsearch -d nr -n "rapsearch_nr_db_v2.12" echo -e "$(date)\t$scriptname\tCompleted prerapsearch on nr." -mv rapsearch_nr_${DATE}_db_v2.12 "$reference_dir/$RAPSearch_dir" -mv rapsearch_nr_${DATE}_db_v2.12.info "$reference_dir/$RAPSearch_dir" +mv rapsearch_nr_db_v2.12 "$reference_dir/$RAPSearch_dir" +mv rapsearch_nr_db_v2.12.info "$reference_dir/$RAPSearch_dir" # ## Index curated data @@ -111,13 +108,13 @@ pigz -dc -k "$curated_dir/rdp_typed_iso_goodq_9210seqs.fa.gz" > rdp_typed_iso_go echo -e "$(date)\t$scriptname\tIndexing curated data..." echo -e "$(date)\t$scriptname\tSNAP indexing hg19_rRNA_mito_Hsapiens_rna.fa..." 
-snap index hg19_rRNA_mito_Hsapiens_rna.fa snap_index_hg19_rRNA_mito_Hsapiens_rna -hg19 -O$Ofactor +snap index hg19_rRNA_mito_Hsapiens_rna.fa snap_index_hg19_rRNA_mito_Hsapiens_rna -hg19 -locationSize 5 echo -e "$(date)\t$scriptname\tSNAP indexing Bacterial_Refseq_05172012.CLEAN.LenFiltered.uniq.fa..." -snap index Bacterial_Refseq_05172012.CLEAN.LenFiltered.uniq.fa snap_index_Bacterial_Refseq_05172012.CLEAN.LenFiltered.uniq_s16 -s 16 -O$Ofactor +snap index Bacterial_Refseq_05172012.CLEAN.LenFiltered.uniq.fa snap_index_Bacterial_Refseq_05172012.CLEAN.LenFiltered.uniq_s16 -s 16 -locationSize 5 echo -e "$(date)\t$scriptname\tStarting prerapsearch on rapsearch_viral_aa_130628_db_v2.12.fasta..." prerapsearch -d rapsearch_viral_aa_130628_db_v2.12.fasta -n "rapsearch_viral_aa_130628_db_v2.12" echo -e "$(date)\t$scriptname\tSNAP indexing viruses-5-2012_trimmedgi-MOD_addedgi.fa..." -snap index viruses-5-2012_trimmedgi-MOD_addedgi.fa snap_index_viruses-5-2012_trimmedgi-MOD_addedgi -O$Ofactor +snap index viruses-5-2012_trimmedgi-MOD_addedgi.fa snap_index_viruses-5-2012_trimmedgi-MOD_addedgi -locationSize 5 #RiboClean additions if [ ! -d "$reference_dir/$RiboClean_dir" ]; then @@ -125,20 +122,24 @@ if [ ! -d "$reference_dir/$RiboClean_dir" ]; then fi echo -e "$(date)\t$scriptname\tSNAP indexing 18s_rRNA_gene_not_partial.fa..." -snap index 18s_rRNA_gene_not_partial.fa snap_index_18s_rRNA_gene_not_partial.fa -O$Ofactor +snap index 18s_rRNA_gene_not_partial.fa snap_index_18s_rRNA_gene_not_partial.fa -locationSize 5 echo -e "$(date)\t$scriptname\tSNAP indexing viruses-5-2012_trimmedgi-MOD_addedgi.fa..." -snap index 23s.fa snap_index_23sRNA -O$Ofactor +snap index 23s.fa snap_index_23sRNA -locationSize 5 echo -e "$(date)\t$scriptname\tSNAP indexing rdp_typed_iso_goodq_9210seqs.fa..." 
-snap index 28s_rRNA_gene_NOT_partial_18s_spacer_5.8s.fa snap_index_28s_rRNA_gene_NOT_partial_18s_spacer_5.8s.fa -O$Ofactor +snap index 28s_rRNA_gene_NOT_partial_18s_spacer_5.8s.fa snap_index_28s_rRNA_gene_NOT_partial_18s_spacer_5.8s.fa -locationSize 5 echo -e "$(date)\t$scriptname\tSNAP indexing viruses-5-2012_trimmedgi-MOD_addedgi.fa..." -snap index rdp_typed_iso_goodq_9210seqs.fa snap_index_rdp_typed_iso_goodq_9210seqs -O$Ofactor +snap index rdp_typed_iso_goodq_9210seqs.fa snap_index_rdp_typed_iso_goodq_9210seqs -locationSize 5 if [ ! -d "$reference_dir/$FAST_dir" ]; then mkdir "$reference_dir/$FAST_dir" fi +if [ ! -d "$reference_dir/$subtraction_dir" ]; then + mkdir "$reference_dir/$subtraction_dir" +fi + echo -e "$(date)\t$scriptname\tMoving curated data into place..." -mv snap_index_hg19_rRNA_mito_Hsapiens_rna "$reference_dir" +mv snap_index_hg19_rRNA_mito_Hsapiens_rna "$reference_dir/$subtraction_dir" mv snap_index_Bacterial_Refseq_05172012.CLEAN.LenFiltered.uniq_s16 "$reference_dir/$FAST_dir" mv rapsearch_viral_aa_130628_db_v2.12 "$reference_dir/$RAPSearch_dir" mv rapsearch_viral_aa_130628_db_v2.12.info "$reference_dir/$RAPSearch_dir" @@ -160,14 +161,14 @@ else fi echo -e "$(date)\t$scriptname\tStarting indexing of SNAP-nt..." echo -e "$(date)\t$scriptname\tcreate_snap_to_nt.sh -n $SNAP_nt_chunks -f nt" -create_snap_to_nt.sh -n "$SNAP_nt_chunks" -f "nt" -p "$DATE" +create_snap_to_nt.sh -n "$SNAP_nt_chunks" -f "nt" -p "prefix" echo -e "$(date)\t$scriptname\tCompleted indexing of SNAP-nt." if [ ! 
-d "$reference_dir/$SNAP_nt_dir" ]; then mkdir "$reference_dir/$SNAP_nt_dir" fi -mv "snap_index_${DATE}.nt."* "$reference_dir/$SNAP_nt_dir" +mv "snap_index_prefix.nt."* "$reference_dir/$SNAP_nt_dir" # ## Cleanup @@ -181,6 +182,6 @@ mv *.fa "$cleanup_dir" mv *.fasta "$cleanup_dir" mv nr "$cleanup_dir" mv nt "$cleanup_dir" -mv "$DATE"* "$cleanup_dir" +mv "prefix"* "$cleanup_dir" echo -e "$(date)\t$scriptname\t${green}Completed creation of SURPI reference data.${endColor}" diff --git a/create_snap_to_nt.sh b/create_snap_to_nt.sh index edb297e..3ea42a1 100755 --- a/create_snap_to_nt.sh +++ b/create_snap_to_nt.sh @@ -15,7 +15,7 @@ # Copyright (C) 2014 Scot Federman - All Rights Reserved # SURPI has been released under a modified BSD license. # Please see license file for details. -# Last revised 7/2/2014 +# Last revised 7/2/2014 scriptname=${0##*/} bold=$(tput bold) @@ -25,10 +25,7 @@ red='\e[0;31m' endColor='\e[0m' prefix=$(date +%m%d%Y) -#set SNAP index Ofactor. See SNAP documentation for details -Ofactor=1000 - -while getopts ":n:hs:f:" option; do +while getopts ":n:hp:s:f:" option; do case "${option}" in f) fasta_file_to_index=${OPTARG};; n) num_chunks=${OPTARG};; @@ -44,20 +41,20 @@ done if [[ ${HELP} -eq 1 || $# -lt 1 ]] then cat < gi_taxid_nucl.dmp pigz -dc -k "$db_directory/gi_taxid_prot.dmp.gz" > gi_taxid_prot.dmp # the below grep "fixes" the issue whereby aliases, mispellings, and other alternate names are returned. -# We could simply look for a name that is a "scientific name", +# We could simply look for a name that is a "scientific name", # but this shrinks the db a bit, speeding up lookups, and removes data we do not need at this time. echo -e "$(date)\t$scriptname\tRetaining scientific names..." 
grep "scientific name" names.dmp > names_scientificname.dmp diff --git a/crop_reads.csh b/crop_reads.csh index 4b5a56d..29fc65b 100755 --- a/crop_reads.csh +++ b/crop_reads.csh @@ -9,7 +9,7 @@ # # # will go from position 10 to 85 (or whatever size is available) -# +# # # $1 = file to crop # $2 = position to start crop ($start_nt) @@ -18,11 +18,11 @@ # Copyright (C) 2014 Charles Y Chiu - All Rights Reserved # SURPI has been released under a modified BSD license. # Please see license file for details. -# Last revised 1/26/2014 - +# Last revised 1/26/2014 + if ($#argv != 3) then echo "Usage: crop_reads.csh " exit(1) endif -cat $argv[1] | awk '(NR%2==1){print $0} (NR%2==0){print substr($0,'"$argv[2]"','"$argv[3]"')}' +cat $argv[1] | awk '(NR%2==1){print $0} (NR%2==0){print substr($0,'"$argv[2]"','"$argv[3]"')}' diff --git a/cutadapt_quality.csh b/cutadapt_quality.csh index 9198d9a..f29c2de 100755 --- a/cutadapt_quality.csh +++ b/cutadapt_quality.csh @@ -3,7 +3,7 @@ # cutadapt_quality.csh # # runs cutadapt to remove primer sequences from Illumina files -# also accepts a quality argument for Illumina / Sanger quality +# also accepts a quality argument for Illumina / Sanger quality # user specifies length cutoff # user specifies whether short reads less than length cutoff are kept; if so, they are converted to size=1 # @@ -19,7 +19,7 @@ # Copyright (C) 2014 Charles Chiu - All Rights Reserved # SURPI has been released under a modified BSD license. # Please see license file for details. 
- + if ($#argv != 7) then echo "Usage: cutadapt_quality.csh " exit(1) @@ -113,6 +113,6 @@ endif #set numreads_end = `egrep -c "@HWI|@M00|@SRR" $inputfile:r.cutadapt.fastq` #@ reads_removed = $numreads_start - $numreads_end -#echo $reads_removed" reads removed by cutadapt" +#echo $reads_removed" reads removed by cutadapt" #echo $numreads_end" reads at end of cutadapt" rm -f $TEMPDIR{$$} diff --git a/debug.sh b/debug.sh new file mode 100755 index 0000000..8a07c32 --- /dev/null +++ b/debug.sh @@ -0,0 +1,41 @@ +#!/bin/bash +set -e +trap _exit_trap EXIT +trap _err_trap ERR +_showed_traceback=f + +function _exit_trap +{ + local _ec="$?" + if [[ $_ec != 0 && "${_showed_traceback}" != t ]]; then + traceback 1 + fi +} + +function _err_trap +{ + local _ec="$?" + local _cmd="${BASH_COMMAND:-unknown}" + traceback 1 + _showed_traceback=t + echo "The command ${_cmd} exited with exit code ${_ec}." 1>&2 +} + +function traceback +{ + # Hide the traceback() call. + local -i start=$(( ${1:-0} + 1 )) + local -i end=${#BASH_SOURCE[@]} + local -i i=0 + local -i j=0 + + echo "Traceback (last called is first):" 1>&2 + for ((i=${start}; i < ${end}; i++)); do + j=$(( $i - 1 )) + local function="${FUNCNAME[$i]}" + local file="${BASH_SOURCE[$i]}" + local line="${BASH_LINENO[$j]}" + echo " ${function}() in ${file}:${line}" 1>&2 + done +} + diff --git a/download_SURPI_data.sh b/download_SURPI_data.sh index c2bf6a3..f20f32e 100755 --- a/download_SURPI_data.sh +++ b/download_SURPI_data.sh @@ -12,7 +12,7 @@ # Copyright (C) 2014 Scot Federman - All Rights Reserved # SURPI has been released under a modified BSD license. # Please see license file for details. 
-# Last revised 7/2/2014 +# Last revised 7/2/2014 scriptname=${0##*/} bold=$(tput bold) @@ -38,10 +38,10 @@ done if [[ ${HELP-} -eq 1 ]] then cat < " + echo "Usage: $scriptname " exit 65 fi @@ -36,7 +36,7 @@ if [ $inputfile_type = BLASTN ] then awk '{print$1}' $inputfile > $inputfile.header echo -e "$(date)\t$scriptname\tuniqued blastn file, replaced beginning with @" - + if [ $parentfile_type = FASTA ] then seqtk subseq $parentfile $inputfile.header > $outputfile @@ -59,12 +59,12 @@ then echo -e "$(date)\t$scriptname\tDone preparing input Fasta file " if [ $parentfile_type = FASTA ] - then + then seqtk subseq $parentfile $inputfile.header > $outputfile elif [ $parentfile_type = FASTQ ] then if [ $output_format = FASTQ ] - then + then cat $parentfile | fqextract $inputfile.header > $outputfile elif [ $output_format = FASTA ] then diff --git a/extractHeaderFromFastq.csh b/extractHeaderFromFastq.csh index d441aa2..5945279 100755 --- a/extractHeaderFromFastq.csh +++ b/extractHeaderFromFastq.csh @@ -14,7 +14,7 @@ # Copyright (C) 2014 Charles Y Chiu - All Rights Reserved # SURPI has been released under a modified BSD license. # Please see license file for details. -# Last revised 1/26/2014 +# Last revised 1/26/2014 if ($#argv != 4) then echo "Usage: extractHeaderFromFastq.csh
" diff --git a/extractHeaderFromFastq_ncores.sh b/extractHeaderFromFastq_ncores.sh index c0e63eb..f95e4ee 100755 --- a/extractHeaderFromFastq_ncores.sh +++ b/extractHeaderFromFastq_ncores.sh @@ -50,7 +50,7 @@ split -l $LinesPerCore -a 3 $parentfile $parentfile.SplitXS & awk '{print$1}' $queryfile > $queryfile.header & echo -e "$(date)\t$scriptname\textracting header from $queryfile" -for job in `jobs -p` +for job in $(jobs -p) do wait $job done diff --git a/extractSamFromSam.sh b/extractSamFromSam.sh index 31cfe1a..79b91e6 100755 --- a/extractSamFromSam.sh +++ b/extractSamFromSam.sh @@ -1,7 +1,7 @@ #!/bin/bash -# +# # extactSamFromSam.sh -# +# # extract SAM reads corresponding to a SAM header file from another SAM reference file and writes to # SAM output file # Chiu Laboratory @@ -40,7 +40,7 @@ else let "numlines = `wc -l basef | awk '{print $1}'`" let "LinesPerCore = numlines / $cores" echo -e "$(date)\t$scriptname\twill use $cores cores with $LinesPerCore entries per core" - + split -l $LinesPerCore $basef echo -e "$(date)\t$scriptname\textracting reads from $baseg using headers from $basef" diff --git a/fastq-extractBarcodedSRA.sh b/fastq-extractBarcodedSRA.sh index f990a01..1b2fd58 100755 --- a/fastq-extractBarcodedSRA.sh +++ b/fastq-extractBarcodedSRA.sh @@ -18,7 +18,7 @@ if [ $# -lt 1 ]; then exit fi -# +# # gets absolute path to SRA file # SRA_PATH=$(readlink -e $1) @@ -34,7 +34,7 @@ files=($header*.fastq) # restore barcode and R1/R2 designation to FASTQ headers for f in "${files[@]}" -do +do barcode=$(echo "$f" | sed 's/_[12]//g' | sed 's/.fastq//g' | sed 's/.*_//g') readnum=$(echo "$f" | sed 's/.*_\([12]\).fastq/\1/g' | sed '/'$header'/d') echo -e "$(date)\t$scriptname\tRestoring barcode # $barcode and R1/R2 designation to $f" diff --git a/get_genbankfasta.pl b/get_genbankfasta.pl index c4c2f22..995872e 100755 --- a/get_genbankfasta.pl +++ b/get_genbankfasta.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # # get_genbankfasta.pl # @@ -33,7 +33,7 
@@ if ($opt_h) { print < " + echo "Usage: $scriptname <.fasta> " echo "not uniquing Blastn anymore 9/13/13 given a fasta file and the GI or FASTA of the reference to assemble to -> makes an assembly. blastn e value in 1e-8 format . jedi has 16 cores available" exit fi @@ -59,7 +59,7 @@ done cat $1*.$2.$3.$4.blastn > $1.$2.$3.$4.Blastn -uniq_blastn_nopipes.sh $1.$2.$3.$4.Blastn +uniq_blastn_nopipes.sh $1.$2.$3.$4.Blastn extractAlltoFast.sh $1.$2.$3.$4.Blastn.uniq BLASTN $1 FASTA $1.$2.$3.$4.Blastn.uniq.ex.fa FASTA ####figuring out length of sequence in gi####### @@ -79,7 +79,7 @@ echo -e -n "$(date)\t$scriptname\t%Coverage = " echo "scale=6;100*$coverage/$gilength" | bc let "sumofcolumntwo = `awk '{print$2}' $1.$2.$3.$4.map | awk '{sum += $1} END{print sum}'`" -echo -e -n "$(date)\t$scriptname\tAverage depth of coverage (x) = " +echo -e -n "$(date)\t$scriptname\tAverage depth of coverage (x) = " echo "scale=6;$sumofcolumntwo/$gilength" | bc let "numberBlastnReads = `egrep -c "^SCS|^HWI|gi|^M00|^kmer|^SRR" $1.$2.$3.$4.Blastn.uniq`" echo -e "$(date)\t$scriptname\tNumber of reads contributing to assembly $numberBlastnReads" @@ -94,7 +94,7 @@ echo "Reference sequence length = $gilength bp" >> $1.$2.$3.$4.report echo "Coverage in bp = $coverage" >> $1.$2.$3.$4.report echo -n "%Coverage = " >> $1.$2.$3.$4.report echo "scale=6;100*$coverage/$gilength" | bc >> $1.$2.$3.$4.report -echo -n "Average depth of coverage = " >> $1.$2.$3.$4.report +echo -n "Average depth of coverage = " >> $1.$2.$3.$4.report echo "scale=6;$sumofcolumntwo/$gilength" | bc >> $1.$2.$3.$4.report #echo "Average depth of coverage = $averagedepthcoverage x " >> $1.$2.$3.$4.report echo "Number of reads contributing to assembly = $numberBlastnReads" >> $1.$2.$3.$4.report @@ -106,7 +106,7 @@ ps2pdf14 $1.$2.$3.$4.ps $1.$2.$3.$4.pdf #### Cleanup ######## rm -f $1*$2.$3.$4.blastn rm -f $1*$2.$3.$4.error -rm -f $3.$2.fasta.nhr +rm -f $3.$2.fasta.nhr rm -f $3.$2.fasta.nin rm -f $3.$2.fasta.nsq #rm -f 
$1.$2.$3.$4.Blastn diff --git a/preprocess.sh b/preprocess.sh index 443ba3d..032ba42 100755 --- a/preprocess.sh +++ b/preprocess.sh @@ -8,7 +8,7 @@ # January, 2014 # # cutadapt=>crop->dust removal (no uniq) *** -# +# # 12/20/12 - modified to switch to cutadapt for trimming # 12/31/12 - modified from Cshell to BASH version for timing # @@ -17,6 +17,8 @@ # Please see license file for details. scriptname=${0##*/} +source debug.sh +source logging.sh if [ $# != 10 ]; then echo "Usage: $scriptname " @@ -44,15 +46,15 @@ fi if [ $quality = "S" ] then - echo -e "$(date)\t$scriptname\tselected Sanger quality" + log "selected Sanger quality" else - echo -e "$(date)\t$scriptname\tselected Illumina quality" + log "selected Illumina quality" fi # fix header if space is present s=`head -1 $inputfile | awk '{if ($0 ~ / /) {print "SPACE"} else {print "NOSPACE"}}'` -echo -e "$(date)\t$scriptname\t$s in header" +log "$s in header" nopathf=${1##*/} basef=${nopathf%.fastq} @@ -60,7 +62,7 @@ basef=${nopathf%.fastq} #################### START OF PREPROCESSING, READ1 ######################### # run cutadapt, Read1 -echo -e "$(date)\t$scriptname\t********** running cutadapt, Read1 **********" +log "********** running cutadapt, Read1 **********" if [ $s == "SPACE" ] then sed "s/\([@HWI|@M00135|@SRR][^ ]*\) \(.\):.:0:\(.*\)/\1#\3\/\2/g" $inputfile > $basef.modheader.fastq @@ -76,12 +78,12 @@ fi END1=$(date +%s) diff=$(( $END1 - $START1 )) -echo -e "$(date)\t$scriptname\tDone cutadapt: CUTADAPT took $diff seconds" +log "Done cutadapt: CUTADAPT took $diff seconds" # run uniq, Read1 if [ $run_uniq == "Y" ] then - echo -e "$(date)\t$scriptname\t********** running uniq, Read1 **********" + log "********** running uniq, Read1 **********" START1=$(date +%s) if [ $quality = "S" ] @@ -93,31 +95,31 @@ then END1=$(date +%s) diff=$(( $END1 - $START1 )) - echo -e "$(date)\t$scriptname\tDone uniq: UNIQ took $diff seconds" + log "Done uniq: UNIQ took $diff seconds" fi # run crop, Read 1 -echo -e 
"$(date)\t$scriptname\t********** running crop, Read1 **********" +log "********** running crop, Read1 **********" START1=$(date +%s) -if [ $run_uniq == "Y" ] +if [ $run_uniq == "Y" ] then - echo -e "$(date)\t$scriptname\tWe will be using $crop_length as the length of the cropped read" + log "We will be using $crop_length as the length of the cropped read" crop_reads.csh $basef.cutadapt.uniq.fastq $start_nt $crop_length > $basef.cutadapt.uniq.cropped.fastq else - echo -e "$(date)\t$scriptname\tWe will be using $crop_length as the length of the cropped read" + log "We will be using $crop_length as the length of the cropped read" crop_reads.csh $basef.cutadapt.fastq $start_nt $crop_length > $basef.cutadapt.cropped.fastq fi END1=$(date +%s) diff=$(( $END1 - $START1 )) -echo -e "$(date)\t$scriptname\tDone crop: CROP took $diff seconds" +log "Done crop: CROP took $diff seconds" # run dust, Read1 -echo -e "$(date)\t$scriptname\t********** running dust, Read1 **********" +log "********** running dust, Read1 **********" START1=$(date +%s) -if [ $run_uniq == "Y" ] +if [ $run_uniq == "Y" ] then prinseq-lite.pl -fastq $basef.cutadapt.uniq.cropped.fastq -out_format 3 -out_good $basef.cutadapt.uniq.cropped.dusted -out_bad $basef.cutadapt.uniq.cropped.dusted.bad -log -lc_method dust -lc_threshold 7 mv -f $basef.cutadapt.uniq.cropped.dusted.fastq $basef.preprocessed.fastq @@ -128,4 +130,4 @@ fi END1=$(date +%s) diff=$(( $END1 - $START1 )) -echo -e "$(date)\t$scriptname\tDone dust: DUST took $diff seconds" +log "Done dust: DUST took $diff seconds" diff --git a/preprocess_ncores.sh b/preprocess_ncores.sh index 820e50a..1b8ba03 100755 --- a/preprocess_ncores.sh +++ b/preprocess_ncores.sh @@ -1,5 +1,5 @@ #!/bin/bash -# +# # preprocess_ncores.sh # # This script runs preprocessing across multiple cores (FASTA/FASTQ header modification, quality filtering, adapter trimming, and low-complexity filtering) @@ -13,8 +13,9 @@ # Please see license file for details. 
scriptname=${0##*/} +source logging.sh -if [ $# != 12 ]; then +if [ $# != 11 ]; then echo "Usage: $scriptname <# of cores> " exit fi @@ -25,13 +26,12 @@ quality=$2 run_uniq=$3 length_cutoff=$4 cores=$5 -cache_reset=$6 -keep_short_reads=$7 -adapter_set=$8 -start_nt=$9 -crop_length=${10} -temporary_files_directory=${11} -quality_cutoff=${12} +keep_short_reads=$6 +adapter_set=$7 +start_nt=$8 +crop_length=${9} +temporary_files_directory=${10} +quality_cutoff=${11} ### if [ ! -f $inputfile ] @@ -40,40 +40,33 @@ then exit fi -freemem=$(free -g | awk '{print $4}' | head -n 2 | tail -1) -echo -e "$(date)\t$scriptname\tThere is $freemem GB available free memory...[cutoff=$free_cache_cutoff GB]" -if [ $freemem -lt $free_cache_cutoff ] -then - echo -e "$(date)\t$scriptname\tClearing cache..." - dropcache -fi - START=$(date +%s) -echo -e "$(date)\t$scriptname\tSplitting $inputfile..." +log "Splitting $inputfile..." let "numlines = `wc -l $inputfile | awk '{print $1}'`" let "FASTQentries = numlines / 4" -echo -e "$(date)\t$scriptname\tThere are $FASTQentries FASTQ entries in $inputfile" +log "There are $FASTQentries FASTQ entries in $inputfile" let "LinesPerCore = numlines / $cores" -let "FASTQperCore = LinesPerCore / 4" +# Prevent leaving a small remainder file after split by adding 1. +let "FASTQperCore = LinesPerCore / 4 + 1" let "SplitPerCore = FASTQperCore * 4" -echo -e "$(date)\t$scriptname\twill use $cores cores with $FASTQperCore entries per core" +log "will use $cores cores with $FASTQperCore entries per core" split -l $SplitPerCore $inputfile END_SPLIT=$(date +%s) diff_SPLIT=$(( END_SPLIT - START )) -echo -e "$(date)\t$scriptname\tDone splitting: " -echo -e "$(date)\t$scriptname\tSPLITTING took $diff_SPLIT seconds" +log "Done splitting: " +log "SPLITTING took $diff_SPLIT seconds" -echo -e "$(date)\t$scriptname\tRunning preprocess script for each chunk..." +log "Running preprocess script for each chunk..." for f in x?? 
do mv $f $f.fastq - echo -e "$(date)\t$scriptname\tpreprocess.sh $f.fastq $quality N $length_cutoff $keep_short_reads $adapter_set $start_nt $crop_length $temporary_files_directory >& $f.preprocess.log &" + log "preprocess.sh $f.fastq $quality N $length_cutoff $keep_short_reads $adapter_set $start_nt $crop_length $temporary_files_directory >& $f.preprocess.log &" preprocess.sh $f.fastq $quality N $length_cutoff $keep_short_reads $adapter_set $start_nt $crop_length $temporary_files_directory $quality_cutoff >& $f.preprocess.log & done @@ -82,7 +75,7 @@ do wait $job done -echo -e "$(date)\t$scriptname\tDone preprocessing for each chunk..." +log "Done preprocessing for each chunk..." nopathf2=${1##*/} basef2=${nopathf2%.fastq} @@ -108,20 +101,20 @@ do cat $basef.preprocessed.fastq >> $basef2.preprocessed.fastq rm -f $basef.preprocessed.fastq cat $basef.cutadapt.cropped.dusted.bad.fastq >> $basef2.cutadapt.cropped.dusted.bad.fastq - rm -f $basef.cutadapt.cropped.dusted.bad.fastq + rm -f $basef.cutadapt.cropped.dusted.bad.fastq rm -f $f rm -f $basef.modheader.fastq rm -f $basef.cutadapt.summary.log rm -f $basef.adapterinfo.log - rm -f $basef.cutadapt.cropped.fastq + rm -f $basef.cutadapt.cropped.fastq done -echo -e "$(date)\t$scriptname\tDone concatenating output..." +log "Done concatenating output..." 
if [ $run_uniq == "Y" ]; # selecting unique reads then - echo -e "$(date)\t$scriptname\tSelecting unique reads" + log "Selecting unique reads" START_UNIQ=$(date +%s) # selecting unique reads sed "n;n;n;d" $basef2.preprocessed.fastq | sed "n;n;d" | sed "s/^@/>/g" > $basef2.preprocessed.fasta @@ -130,28 +123,28 @@ then cp -f $basef2.uniq.fastq $basef2.preprocessed.fastq END_UNIQ=$(date +%s) diff_UNIQ=$(( END_UNIQ - START_UNIQ )) - echo -e "$(date)\t$scriptname\tUNIQ took $diff_UNIQ seconds" + log "UNIQ took $diff_UNIQ seconds" else - echo -e "$(date)\t$scriptname\tIncluding duplicates (did not run UNIQ)" + log "Including duplicates (did not run UNIQ)" fi END=$(date +%s) diff_TOTAL=$(( END - START )) let "avgtime1=`grep CUTADAPT $basef2.preprocess.log | awk '{print $12}' | sort -n | awk '{ a[i++]=$1} END {print a[int(i/2)];}'`" -echo -e "$(date)\t$scriptname\tmedian CUTADAPT time per core: $avgtime1 seconds" +log "median CUTADAPT time per core: $avgtime1 seconds" if [ $run_uniq = "Y" ]; then let "avgtime2 = $diff_UNIQ" - echo -e "$(date)\t$scriptname\tUNIQ time: $diff_UNIQ seconds" + log "UNIQ time: $diff_UNIQ seconds" else let "avgtime2=0" fi let "avgtime3=`grep DUST $basef2.preprocess.log | awk '{print $12}' | sort -n | awk '{ a[i++]=$1} END {print a[int(i/2)];}'`" -echo -e "$(date)\t$scriptname\tmedian DUST time per core: $avgtime3 seconds" +log "median DUST time per core: $avgtime3 seconds" let "totaltime = diff_SPLIT + avgtime1 + avgtime2 + avgtime3" -echo -e "$(date)\t$scriptname\tTOTAL TIME: $totaltime seconds" +log "TOTAL TIME: $totaltime seconds" -echo -e "$(date)\t$scriptname\tTOTAL CLOCK TIME (INCLUDING OVERHEAD): $diff_TOTAL seconds" +log "TOTAL CLOCK TIME (INCLUDING OVERHEAD): $diff_TOTAL seconds" diff --git a/ribo_snap_bac_euk.sh b/ribo_snap_bac_euk.sh index 261e3de..5a8a080 100755 --- a/ribo_snap_bac_euk.sh +++ b/ribo_snap_bac_euk.sh @@ -44,12 +44,12 @@ fasta_to_fastq $inputfile.fasta > $inputfile.fakq # if Bacteria.annotated file h crop_reads.csh 
$inputfile.fakq 10 75 > $inputfile.fakq.crop # snap against large ribosomal subunit -snap single $SNAP_index_Large $inputfile.fakq.crop -o $inputfile.noLargeS.unmatched.sam -t $cores -x -f -h 250 -d 18 -n 200 -F u +snap single $SNAP_index_Large -fastq $inputfile.fakq.crop -o $inputfile.noLargeS.unmatched.sam -t $cores -x -f -h 250 -d 18 -n 200 -F u egrep -v "^@" $inputfile.noLargeS.unmatched.sam | awk '{if($3 == "*") print "@"$1"\n"$10"\n""+"$1"\n"$11}' > $(echo "$inputfile".noLargeS.unmatched.sam | sed 's/\(.*\)\..*/\1/').fastq echo -e "$(date)\t$scriptname\tDone: first snap alignment" # snap against small ribosomal subunit -snap single $SNAP_index_Small $inputfile.noLargeS.unmatched.fastq -o $inputfile.noSmallS_LargeS.sam -t $cores -h 250 -d 18 -n 200 -F u +snap single $SNAP_index_Small -fastq $inputfile.noLargeS.unmatched.fastq -o $inputfile.noSmallS_LargeS.sam -t $cores -h 250 -d 18 -n 200 -F u echo -e "$(date)\t$scriptname\tDone: second snap alignment" # convert snap unmatched to ribo output to header format @@ -58,8 +58,8 @@ awk '{print$1}' $inputfile.noSmallS_LargeS.sam | sed '/^@/d' > $inputfile.noSmal # retrieve reads from original $inputfile extractSamFromSam.sh $inputfile.noSmallS_LargeS.header.sam $inputfile $basef.noRibo.annotated -echo -e "$(date)\t$scriptname\tCreated $inputfile.noRibo.annotated" -table_generator.sh $basef.noRibo.annotated SNAP N Y N N +echo -e "$(date)\t$scriptname\tCreated $inputfile.noRibo.annotated" +table_generator.sh $basef.noRibo.annotated SNAP N Y N N rm -f $inputfile.noLargeS.sam rm -f $inputfile.noLargeS.matched.sam diff --git a/slave/setup_slave.sh b/slave/setup_slave.sh index e9a52a8..58a5261 100755 --- a/slave/setup_slave.sh +++ b/slave/setup_slave.sh @@ -22,7 +22,7 @@ if [ ! 
-d $bin_folder ] then mkdir $bin_folder fi - + CWD=$(pwd) # set timezone diff --git a/snap_nt.sh b/snap_nt.sh index 50c31b3..c010cff 100755 --- a/snap_nt.sh +++ b/snap_nt.sh @@ -1,22 +1,23 @@ #!/bin/bash -# +# # snap_nt.sh # # This script runs SNAP against the NT database # Chiu Laboratory # University of California, San Francisco # -# +# # Copyright (C) 2014 Charles Chiu - All Rights Reserved # SURPI has been released under a modified BSD license. # Please see license file for details. - -expected_args=6 scriptname=${0##*/} +source debug.sh +source logging.sh +expected_args=5 if [ $# -lt $expected_args ] then - echo "Usage: $scriptname " + echo "Usage: $scriptname " exit 65 fi @@ -24,76 +25,76 @@ fi inputfile=$1 SNAP_NT_index_directory=$2 cores=$3 -free_cache_cutoff=$4 -SNAP_d_cutoff=$5 -snap=$6 +SNAP_d_cutoff=$4 +snap=$5 ### -echo -e "$(date)\t$scriptname\tStarting SNAP to NT" +log "Starting SNAP to NT" START1=$(date +%s) -echo -e "$(date)\t$scriptname\tInput file: $inputfile" +log "Input file: $inputfile" nopathf=${inputfile##*/} # remove the path to file -echo -e "$(date)\t$scriptname\tAfter removing path: $nopathf" +log "After removing path: $nopathf" basef=${nopathf%.fastq} # remove FASTQextension -echo -e "$(date)\t$scriptname\tAfter removing FASTQ extension: $basef" - -echo -e "$(date)\t$scriptname\tMapping $basef to NT..." - -rm -f $basef.prev.sam -rm -f $basef.tmp.sam -rm -f $basef.tmp2.sam -rm -f $basef.NT.sam - +log "After removing FASTQ extension: $basef" + +log "Mapping $basef to NT..." 
+
+cleanup() {
+	rm -f $basef.prev.sam || true
+	rm -f $basef.tmp.sam || true
+	rm -f $basef.tmp2.sam || true
+	rm -f $basef.NT.sam || true
+	rm -f $basef.snapNT.log || true
+	rm -f $basef.timeNT.log || true
+}
+cleanup
 counter=0
-for snap_index in $SNAP_NT_index_directory/* ; do
-	freemem=$(free -g | awk '{print $4}' | head -n 2 | tail -1)
-	echo -e "$(date)\t$scriptname\tThere is $freemem GB available free memory...[cutoff=$free_cache_cutoff GB]"
-	if [ $freemem -lt $free_cache_cutoff ]
-	then
-		echo -e "$(date)\t$scriptname\tClearing cache..."
-		dropcache
-	fi
-	nopathsnap_index=${snap_index##*/} # remove the path to file
-	echo -e "$(date)\t$scriptname\tFound $snap_index ... processing ..."
+for snap_index_basename in $(ls -1v "$SNAP_NT_index_directory") ; do
+	nopathsnap_index=$snap_index_basename # basename of current index chunk; used in per-chunk .sam names below
+	snap_index="$SNAP_NT_index_directory/$snap_index_basename"
+	log "Found $snap_index_basename ... processing ..."
 	START2=$(date +%s)
 	if [[ $counter -eq 0 ]]
 	then
 		#running first SNAP chunk
-		/usr/bin/time -o $basef.snap.log $snap single $snap_index $basef.fastq -o $basef.tmp.sam -t $cores -x -f -h 250 -d $SNAP_d_cutoff -n 25 > $basef.time.log
+		/usr/bin/time -o $basef.time.log $snap single $snap_index $basef.fastq -o $basef.$nopathsnap_index.sam -t $cores -x -f -h 250 -d $SNAP_d_cutoff -n 25 > $basef.snap.log
+		ln --symbolic --force $basef.$nopathsnap_index.sam $basef.tmp.sam
 #		cp $basef.tmp.sam temp.sam
 	else #running 2nd SNAP chunk through last SNAP chunk
-		/usr/bin/time -o $basef.snap.log $snap single $snap_index $basef.tmp.fastq -o $basef.tmp.sam -t $cores -x -f -h 250 -d $SNAP_d_cutoff -n 25 > $basef.time.log
+		/usr/bin/time -o $basef.time.log $snap single $snap_index $basef.tmp.fastq -o $basef.$nopathsnap_index.sam -t $cores -x -f -h 250 -d $SNAP_d_cutoff -n 25 > $basef.snap.log
+		ln --symbolic --force $basef.$nopathsnap_index.sam $basef.tmp.sam
 	fi
 	cat $basef.snap.log >> $basef.snapNT.log
 	cat $basef.time.log >> $basef.timeNT.log
-	
+
compare_sam.py $basef.tmp.sam $basef.prev.sam cat $basef.prev.sam | egrep -v "^@" | awk '{print "@"$1"\n"$10"\n""+"$1"\n"$11}' > $basef.tmp.fastq counter=1 - + END2=$(date +%s) - echo -e "$(date)\t$scriptname\tDone with $snap_index." + log "Done with $snap_index." diff=$(( $END2 - $START2 )) - echo -e "$(date)\t$scriptname\tMapping of $snap_index took $diff seconds." + log "Mapping of $snap_index took $diff seconds." done # need to restore the hits -update_sam.py $basef.prev.sam $basef.NT.sam +update_sam.py $basef.prev.sam $basef.NT.sam rm -f $basef.tmp.sam rm -f $basef.tmp.fastq rm -f $basef.prev.sam +rm -f $basef.$nopathsnap_index.* END1=$(date +%s) -echo -e "$(date)\t$scriptname\tDone with SNAP_NT" +log "Done with SNAP_NT" diff=$(( $END1 - $START1 )) -echo -e "$(date)\t$scriptname\toutput written to $basef.NT.sam" -echo -e "$(date)\t$scriptname\tSNAP_NT took $diff seconds" +log "output written to $basef.NT.sam" +log "SNAP_NT took $diff seconds" diff --git a/snap_nt_combine.sh b/snap_nt_combine.sh index 08b71d7..cb700aa 100755 --- a/snap_nt_combine.sh +++ b/snap_nt_combine.sh @@ -19,7 +19,7 @@ # Copyright (C) 2014 Charles Chiu - All Rights Reserved # Permission to copy and modify is granted under the BSD license -expected_args=6 +expected_args=5 scriptname=${0##*/} if [ $# -lt $expected_args ] @@ -32,9 +32,8 @@ fi inputfile=$1 SNAP_NT_index_directory=$2 cores=$3 -free_cache_cutoff=$4 -SNAP_d_cutoff=$5 -simultaneous_SNAPs=$6 +SNAP_d_cutoff=$4 +simultaneous_SNAPs=$5 ### echo -e "$(date)\t$scriptname\tStarting SNAP to NT" @@ -55,14 +54,6 @@ for snap_index in $SNAP_NT_index_directory/*; do nopathsnap_index=${snap_index##*/} # remove the path to file echo -e "$(date)\t$scriptname\tStarting SNAP on $nopathsnap_index" - freemem=$(free -g | awk '{print $4}' | head -n 2 | tail -1) - echo -e "$(date)\t$scriptname\tThere is $freemem GB available free memory...[cutoff=$free_cache_cutoff GB]" - if [ $freemem -lt $free_cache_cutoff ] - then - echo -e 
"$(date)\t$scriptname\tClearing cache..." - dropcache - fi - START_SNAP=$(date +%s) /usr/bin/time -o $basef.$nopathsnap_index.snap.log snap single $snap_index $basef.fastq -o $basef.$nopathsnap_index.sam -t $cores -x -f -h 250 -d $SNAP_d_cutoff -n 25 > $basef.$nopathsnap_index.time.log SNAP_DONE=$(date +%s) diff --git a/split_fasta.pl b/split_fasta.pl index 060cb92..f67f660 100755 --- a/split_fasta.pl +++ b/split_fasta.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl #split_fasta.pl version 1.0 #This script accepts a file consisting of multiple FASTA formatted sequence records. #It splits the file into multiple new files, each consisting of a subset of the original records. @@ -80,7 +80,7 @@ #write record to file print (OUTFILE ">$sequenceTitle\n"); print (OUTFILE "$sequenceEntry\n"); - $seqCount++; + $seqCount++; $seqThisFile++; if ($seqThisFile == $numberToCopy) { diff --git a/start_slaves.sh b/start_slaves.sh index 622df47..e428d77 100755 --- a/start_slaves.sh +++ b/start_slaves.sh @@ -33,14 +33,14 @@ file_with_slave_ips=$7 placement_group=$8 ### -#This is the time (in seconds) this script waits after starting up the AWS machines. +#This is the time (in seconds) this script waits after starting up the AWS machines. #It allows the instances time to start up. In practice, 120s appears to be sufficient. WAIT_TIME=120 #folder where slave scripts are located slave_folder="/usr/local/bin/surpi/slave" #script located on slave to run -slave_script="slave_setup.sh" +slave_script="slave_setup.sh" #this parameter is currently tied to the $keypair used during slave_setup.sh. should be cleaned up prior to release pemkey="/home/ubuntu/.ssh/surpi.pem" @@ -106,7 +106,7 @@ do done # instance_id_list=( $result ) # instance_id_size=${#instance_id_list[@]} -# +# # if [ "$instance_id_size" != "$number_of_instances" ]; then # echo "ERROR: could not create $number_of_instances instances" # echo "$number_of_instances instances were created." 
@@ -150,7 +150,7 @@ do echo -e "$(date)\t$scriptname\t$attached" >> slave.$COUNTER.log 2>&1 #verify $attached here to verify that EBS was attached - + echo -e "$(date)\t$scriptname\tconnecting and running $slave_script script..." >> slave.$COUNTER.log 2>&1 ssh -o StrictHostKeyChecking=no -i $pemkey ubuntu@$host "/home/ubuntu/$slave_script" >> slave.$COUNTER.log 2>&1 & let COUNTER=COUNTER+1 diff --git a/table_generator.sh b/table_generator.sh index 240d3b1..4491367 100755 --- a/table_generator.sh +++ b/table_generator.sh @@ -7,7 +7,7 @@ # University of California, San Francisco # January, 2014 # -# input file: annotated sam or annotated -m 8 file with taxonomies provided in the following format at the end of the .sam and -m 8 file: +# input file: annotated sam or annotated -m 8 file with taxonomies provided in the following format at the end of the .sam and -m 8 file: # "gi# --family --genus --species" # Output files are tab delimited files ending in .counttable whereby rows represent taxonomic annotations at various levels (family, genus, species, gi) # Columns represent individual barcodes found in the dataset, and cells contain the number of reads @@ -20,7 +20,7 @@ scriptname=${0##*/} if [ $# -lt 6 ]; then - echo "Usage: $scriptname " + echo "Usage: $scriptname " exit fi @@ -33,7 +33,7 @@ genus=$5 family=$6 ### -###substitute forward slash with @_ because forward slash in species name makes it ungreppable. using @_ because @ is used inside the contig barcode (ie. #@13 is barcode 13, contig generated) +###substitute forward slash with @_ because forward slash in species name makes it ungreppable. using @_ because @ is used inside the contig barcode (ie. 
#@13 is barcode 13, contig generated) create_tab_delimited_table.pl -f $file_type $inputfile | sed 's/ /_/g' | sed 's/,/_/g' | sed 's/\//@_/g' > $inputfile.tempsorted echo -e "$(date)\t$scriptname\tdone creating $inputfile.tempsorted" @@ -61,7 +61,7 @@ then done echo -e "GI\tSpecies\tGenus\tFamily(@=contigbarcode)" > $inputfile.header cat $inputfile.header $inputfile.gi.uniq.columntable > $inputfile.gi.counttable_temp - paste $inputfile.gi.counttable_temp bar*.$inputfile.gi.output > $inputfile.gi.counttable + paste $inputfile.gi.counttable_temp bar*.$inputfile.gi.output > $inputfile.gi.counttable sed -i 's/@_/ /g' $inputfile.gi.counttable echo -e "$(date)\t$scriptname\tdone generating gi counttable" fi @@ -84,7 +84,7 @@ then done echo -e "Species\tGenus\tFamily(@=contigbarcode)" > $inputfile.header cat $inputfile.header $inputfile.species.uniq.columntable > $inputfile.species.counttable_temp - paste $inputfile.species.counttable_temp bar*.$inputfile.species.output > $inputfile.species.counttable + paste $inputfile.species.counttable_temp bar*.$inputfile.species.output > $inputfile.species.counttable sed -i 's/@_/ /g' $inputfile.species.counttable echo -e "$(date)\t$scriptname\tdone generating species counttable" fi @@ -106,7 +106,7 @@ then done echo -e "Genus\tFamily(@=contigbarcode)" > $inputfile.header cat $inputfile.header $inputfile.genus.uniq.columntable > $inputfile.genus.counttable_temp - paste $inputfile.genus.counttable_temp bar*.$inputfile.genus.output > $inputfile.genus.counttable + paste $inputfile.genus.counttable_temp bar*.$inputfile.genus.output > $inputfile.genus.counttable sed -i 's/@_/ /g' $inputfile.genus.counttable echo -e "$(date)\t$scriptname\tdone generating genus counttable" fi diff --git a/taxonomy_lookup.pl b/taxonomy_lookup.pl index 79d0eac..35d7b85 100755 --- a/taxonomy_lookup.pl +++ b/taxonomy_lookup.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl # # taxonomy_lookup.pl # @@ -12,6 +12,7 @@ # Please see license file for 
details. use strict; +use warnings; #use diagnostics; use Time::HiRes qw[gettimeofday tv_interval]; use DBI; @@ -80,12 +81,11 @@ # sort/uniq gi file my $startsort = [gettimeofday()]; -system ("sort --parallel=$cores -u $basef_inputfile.gi > $basef_inputfile.gi.uniq"); - +system ("sort -u $basef_inputfile.gi > $basef_inputfile.gi.uniq"); my $sorttime = tv_interval($startsort); print localtime()."\t$0\tTime to sort -u gi file: $sorttime seconds\n"; -# Parallelization can occur at this point in the code. Since the file in now sorted, it can be split into n chunks +# Parallelization can occur at this point in the code. Since the file in now sorted, it can be split into n chunks # with no overlap. my %taxonomy; @@ -110,19 +110,19 @@ chomp; my ($family, $genus, $species, $lineage) = ("") x 4; my $result = taxonomy_fgsl($_, $seq_type); - + if ($result =~ /family--(.*?)\t/) { $family = $1; } - + if ($result =~ /genus--(.*?)\t/) { $genus = $1; } - + if ($result =~ /species--(.*?)\t/) { $species = $1; } - + if ($result =~ /lineage--(.*)$/) { $lineage = $1; } @@ -228,7 +228,7 @@ sub taxonomy_fgsl { $sth->execute(($taxid)); if (@noderow = $sth->fetchrow_array) { (my $tid, my $parent, my $rank) = @noderow; - + if ($namerow = $name_return->fetchrow_array) { $name = trim($namerow); } diff --git a/taxonomy_lookup_embedded.pl b/taxonomy_lookup_embedded.pl index 5709a89..d8e7a51 100755 --- a/taxonomy_lookup_embedded.pl +++ b/taxonomy_lookup_embedded.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl # # taxonomy_lookup_embedded.pl # @@ -9,9 +9,10 @@ # Copyright (C) 2014 Scot Federman - All Rights Reserved # SURPI has been released under a modified BSD license. # Please see license file for details. -# Last revised 1/26/2014 +# Last revised 1/26/2014 use DBI; +use warnings; use strict; use Getopt::Std; use Time::HiRes qw[gettimeofday tv_interval]; @@ -36,12 +37,12 @@ if ($opt_h) { print <