From efa073341be9a17501613ed0e8bea2106769ea0f Mon Sep 17 00:00:00 2001 From: Pengfei Xuan Date: Mon, 21 Sep 2015 22:26:03 -0400 Subject: [PATCH] Improve align() by using process substitution (replace named pipe). - simplify implementation - avoid creating intermediate named pipes - improve performance --- bin/speedseq | 90 ++++++++++++++++++++-------------------------------- 1 file changed, 35 insertions(+), 55 deletions(-) diff --git a/bin/speedseq b/bin/speedseq index cd3a1de..8d13c98 100755 --- a/bin/speedseq +++ b/bin/speedseq @@ -209,7 +209,7 @@ global options: ;; R) RG="$OPTARG" - RG_FMT="-R '$OPTARG'" + RG_FMT="-R $OPTARG" ;; p) INTERLEAVED=1 @@ -353,21 +353,11 @@ global options: if [[ $VERBOSE -eq 1 ]] then - echo " - mkdir -p $TEMP_DIR/full $TEMP_DIR/spl $TEMP_DIR/disc - mkfifo $TEMP_DIR/spl_pipe $TEMP_DIR/disc_pipe" + echo "mkdir -p $TEMP_DIR/full $TEMP_DIR/spl $TEMP_DIR/disc" fi # create temp files mkdir -p $TEMP_DIR/full $TEMP_DIR/spl $TEMP_DIR/disc - if [[ ! -e $TEMP_DIR/spl_pipe ]] - then - mkfifo $TEMP_DIR/spl_pipe - fi - if [[ ! 
-e $TEMP_DIR/disc_pipe ]] - then - mkfifo $TEMP_DIR/disc_pipe - fi # alignment command if [[ "$INTERLEAVED" -eq 1 ]] @@ -376,63 +366,53 @@ global options: then echo -e " $BWA mem -t $THREADS -p $INS_DIST $RG_FMT $REF $FQ | \\ - $SAMBLASTER $INCLUDE_DUPS --addMateTags --maxSplitCount $MAX_SPLIT_COUNT --minNonOverlap $MIN_NON_OVERLAP --splitterFile $TEMP_DIR/spl_pipe --discordantFile $TEMP_DIR/disc_pipe | \\ + $SAMBLASTER $INCLUDE_DUPS --addMateTags --maxSplitCount $MAX_SPLIT_COUNT --minNonOverlap $MIN_NON_OVERLAP \\ + --splitterFile >(gawk '{ if (\$0~\"^@\") { print; next } else { \$10=\"*\"; \$11=\"*\"; print } }' OFS=\"\\t\" | \\ + $SAMBAMBA view -S -f bam -l 0 /dev/stdin | \\ + $SAMBAMBA sort -t 4 -m 1G --tmpdir=$TEMP_DIR/spl -o $OUTPUT.splitters.bam /dev/stdin) \\ + --discordantFile >(gawk '{ if (\$0~\"^@\") { print; next } else { \$10=\"*\"; \$11=\"*\"; print } }' OFS=\"\\t\" | \\ + $SAMBAMBA view -S -f bam /dev/stdin | \\ + $SAMBAMBA sort -t 4 -m 1G --tmpdir=$TEMP_DIR/disc -o $OUTPUT.discordants.bam /dev/stdin) | \\ $SAMBAMBA view -S -f bam -l 0 /dev/stdin | \\ - $SAMBAMBA sort -t $THREADS -m $((${SORT_MEM}-2))G --tmpdir=$TEMP_DIR/full -o $OUTPUT.bam /dev/stdin - - gawk '{ if (\$0~\"^@\") { print; next } else { \$10=\"*\"; \$11=\"*\"; print } }' OFS=\"\\t\" $TEMP_DIR/spl_pipe | \\ - $SAMBAMBA view -S -f bam -l 0 /dev/stdin | \\ - $SAMBAMBA sort -t 4 -m 1G --tmpdir=$TEMP_DIR/spl -o $OUTPUT.splitters.bam /dev/stdin - gawk '{ if (\$0~\"^@\") { print; next } else { \$10=\"*\"; \$11=\"*\"; print } }' OFS=\"\\t\" $TEMP_DIR/disc_pipe | \\ - $SAMBAMBA view -S -f bam /dev/stdin | \\ - $SAMBAMBA sort -t 4 -m 1G --tmpdir=$TEMP_DIR/disc -o $OUTPUT.discordants.bam /dev/stdin" + $SAMBAMBA sort -t $THREADS -m $((${SORT_MEM}-2))G --tmpdir=$TEMP_DIR/full -o $OUTPUT.bam /dev/stdin" fi - - echo " - $BWA mem -t $THREADS -p $INS_DIST $RG_FMT $REF $FQ | \ - $SAMBLASTER $INCLUDE_DUPS --addMateTags --maxSplitCount $MAX_SPLIT_COUNT --minNonOverlap $MIN_NON_OVERLAP --splitterFile 
$TEMP_DIR/spl_pipe --discordantFile $TEMP_DIR/disc_pipe | \ - $SAMBAMBA view -S -f bam -l 0 /dev/stdin | \ - $SAMBAMBA sort -t $THREADS -m $((${SORT_MEM}-2))G --tmpdir=$TEMP_DIR/full -o $OUTPUT.bam /dev/stdin - - gawk '{ if (\$0~\"^@\") { print; next } else { \$10=\"*\"; \$11=\"*\"; print } }' OFS=\"\\t\" $TEMP_DIR/spl_pipe | \ + $BWA mem -t $THREADS -p $INS_DIST $RG_FMT $REF $FQ | \ + $SAMBLASTER $INCLUDE_DUPS --addMateTags --maxSplitCount $MAX_SPLIT_COUNT --minNonOverlap $MIN_NON_OVERLAP \ + --splitterFile >(gawk '{ if ($0~"^@") { print; next } else { $10="*"; $11="*"; print } }' OFS="\t" | \ $SAMBAMBA view -S -f bam -l 0 /dev/stdin | \ - $SAMBAMBA sort -t 4 -m 1G --tmpdir=$TEMP_DIR/spl -o $OUTPUT.splitters.bam /dev/stdin - gawk '{ if (\$0~\"^@\") { print; next } else { \$10=\"*\"; \$11=\"*\"; print } }' OFS=\"\\t\" $TEMP_DIR/disc_pipe | \ + $SAMBAMBA sort -t 4 -m 1G --tmpdir=$TEMP_DIR/spl -o $OUTPUT.splitters.bam /dev/stdin) \ + --discordantFile >(gawk '{ if ($0~"^@") { print; next } else { $10="*"; $11="*"; print } }' OFS="\t" | \ $SAMBAMBA view -S -f bam /dev/stdin | \ - $SAMBAMBA sort -t 4 -m 1G --tmpdir=$TEMP_DIR/disc -o $OUTPUT.discordants.bam /dev/stdin - " | $PARALLEL -j 3 + $SAMBAMBA sort -t 4 -m 1G --tmpdir=$TEMP_DIR/disc -o $OUTPUT.discordants.bam /dev/stdin) | \ + $SAMBAMBA view -S -f bam -l 0 /dev/stdin | \ + $SAMBAMBA sort -t $THREADS -m $((${SORT_MEM}-2))G --tmpdir=$TEMP_DIR/full -o $OUTPUT.bam /dev/stdin else if [[ $VERBOSE -eq 1 ]] then echo -e " $BWA mem -t $THREADS $INS_DIST $RG_FMT $REF $FQ1 $FQ2 | \\ - $SAMBLASTER $INCLUDE_DUPS --addMateTags --maxSplitCount $MAX_SPLIT_COUNT --minNonOverlap $MIN_NON_OVERLAP --splitterFile $TEMP_DIR/spl_pipe --discordantFile $TEMP_DIR/disc_pipe | \\ + $SAMBLASTER $INCLUDE_DUPS --addMateTags --maxSplitCount $MAX_SPLIT_COUNT --minNonOverlap $MIN_NON_OVERLAP \\ + --splitterFile >(gawk '{ if (\$0~\"^@\") { print; next } else { \$10=\"*\"; \$11=\"*\"; print } }' OFS=\"\\t\" | \\ + $SAMBAMBA view -S -f bam -l 0 
/dev/stdin | \\ + $SAMBAMBA sort -t 4 -m 1G --tmpdir=$TEMP_DIR/spl -o $OUTPUT.splitters.bam /dev/stdin) \\ + --discordantFile >(gawk '{ if (\$0~\"^@\") { print; next } else { \$10=\"*\"; \$11=\"*\"; print } }' OFS=\"\\t\" | \\ + $SAMBAMBA view -S -f bam /dev/stdin | \\ + $SAMBAMBA sort -t 4 -m 1G --tmpdir=$TEMP_DIR/disc -o $OUTPUT.discordants.bam /dev/stdin) | \\ $SAMBAMBA view -S -f bam -l 0 /dev/stdin | \\ - $SAMBAMBA sort -t $THREADS -m $((${SORT_MEM}-2))G --tmpdir=$TEMP_DIR/full -o $OUTPUT.bam /dev/stdin - - gawk '{ if (\$0~\"^@\") { print; next } else { \$10=\"*\"; \$11=\"*\"; print } }' OFS=\"\\t\" $TEMP_DIR/spl_pipe | \\ - $SAMBAMBA view -S -f bam -l 0 /dev/stdin | \\ - $SAMBAMBA sort -t 4 -m 1G --tmpdir=$TEMP_DIR/spl -o $OUTPUT.splitters.bam /dev/stdin - gawk '{ if (\$0~\"^@\") { print; next } else { \$10=\"*\"; \$11=\"*\"; print } }' OFS=\"\\t\" $TEMP_DIR/disc_pipe | \\ - $SAMBAMBA view -S -f bam /dev/stdin | \\ - $SAMBAMBA sort -t 4 -m 1G --tmpdir=$TEMP_DIR/disc -o $OUTPUT.discordants.bam /dev/stdin" + $SAMBAMBA sort -t $THREADS -m $((${SORT_MEM}-2))G --tmpdir=$TEMP_DIR/full -o $OUTPUT.bam /dev/stdin" fi - - echo " - $BWA mem -t $THREADS $INS_DIST $RG_FMT $REF $FQ1 $FQ2 | \ - $SAMBLASTER $INCLUDE_DUPS --addMateTags --maxSplitCount $MAX_SPLIT_COUNT --minNonOverlap $MIN_NON_OVERLAP --splitterFile $TEMP_DIR/spl_pipe --discordantFile $TEMP_DIR/disc_pipe | \ - $SAMBAMBA view -S -f bam -l 0 /dev/stdin | \ - $SAMBAMBA sort -t $THREADS -m $((${SORT_MEM}-2))G --tmpdir=$TEMP_DIR/full -o $OUTPUT.bam /dev/stdin - - gawk '{ if (\$0~\"^@\") { print; next } else { \$10=\"*\"; \$11=\"*\"; print } }' OFS=\"\\t\" $TEMP_DIR/spl_pipe | \ + $BWA mem -t $THREADS $INS_DIST $RG_FMT $REF $FQ1 $FQ2 | \ + $SAMBLASTER $INCLUDE_DUPS --addMateTags --maxSplitCount $MAX_SPLIT_COUNT --minNonOverlap $MIN_NON_OVERLAP \ + --splitterFile >(gawk '{ if ($0~"^@") { print; next } else { $10="*"; $11="*"; print } }' OFS="\t" | \ $SAMBAMBA view -S -f bam -l 0 /dev/stdin | \ - $SAMBAMBA sort -t 4 
-m 1G --tmpdir=$TEMP_DIR/spl -o $OUTPUT.splitters.bam /dev/stdin - gawk '{ if (\$0~\"^@\") { print; next } else { \$10=\"*\"; \$11=\"*\"; print } }' OFS=\"\\t\" $TEMP_DIR/disc_pipe | \ + $SAMBAMBA sort -t 4 -m 1G --tmpdir=$TEMP_DIR/spl -o $OUTPUT.splitters.bam /dev/stdin) \ + --discordantFile >(gawk '{ if ($0~"^@") { print; next } else { $10="*"; $11="*"; print } }' OFS="\t" | \ $SAMBAMBA view -S -f bam /dev/stdin | \ - $SAMBAMBA sort -t 4 -m 1G --tmpdir=$TEMP_DIR/disc -o $OUTPUT.discordants.bam /dev/stdin - " | $PARALLEL -j 3 + $SAMBAMBA sort -t 4 -m 1G --tmpdir=$TEMP_DIR/disc -o $OUTPUT.discordants.bam /dev/stdin) | \ + $SAMBAMBA view -S -f bam -l 0 /dev/stdin | \ + $SAMBAMBA sort -t $THREADS -m $((${SORT_MEM}-2))G --tmpdir=$TEMP_DIR/full -o $OUTPUT.bam /dev/stdin fi - + # index the files if [[ $VERBOSE -eq 1 ]] then