1- /* -*- Mode : C++ ; indent-tabs-mode: nil ; c-file-style: "stroustrup" -*-
1+ /* -*- mode : C++ ; indent-tabs-mode: nil ; c-file-style: "stroustrup" -*-
22
33 Project: samblaster
44 Fast mark duplicates in read-ID grouped SAM file.
1010
1111 License Information:
1212
13- Copyright 2013-2015 Gregory G. Faust
13+ Copyright 2013-2016 Gregory G. Faust
1414
1515 Licensed under the MIT license (the "License");
1616 You may not use this file except in compliance with the License.
@@ -121,7 +121,7 @@ inline UINT64 diffTVs (struct timeval * startTV, struct timeval * endTV)
121121// and also so that we can keep a freelist of them.
122122
123123// We need to pre-define these for the SAM specific fields.
124- typedef UINT32 pos_t ; // Type for reference offsets.
124+ typedef UINT32 pos_t ; // Type for reference offsets (pos can be negative w/ softclips, but still works fine as unsigned rollover) .
125125typedef UINT64 sgn_t ; // Type for signatures for offsets and lengths.
126126// And the type itself for the next pointer.
127127typedef struct splitLine splitLine_t;
@@ -142,6 +142,8 @@ struct splitLine
142142 int flag;
143143 pos_t pos;
144144 int seqNum;
145+ pos_t binPos;
146+ int binNum;
145147 int SQO;
146148 int EQO;
147149 int sclip;
@@ -370,6 +372,34 @@ inline void writeLine(splitLine_t * line, FILE * output)
370372 outputString (line->buffer , output);
371373}
372374
375+ // Check the first line of a file (e.g. input) for bam signature.
376+ void checkBAMfile (splitLine_t * line)
377+ {
378+ // If the file is a bam file, we can't rely on fields.
379+ // So, look at the underlying buffer for the line.
380+
381+ // First define the signature to look for.
382+ int values[] = {31 , -117 , 8 , 4 , 66 , 67 , 2 };
383+ int offsets[] = {0 , 1 , 2 , 3 , 12 , 13 , 14 };
384+ int count = 7 ;
385+
386+ // Check for empty file or likely a sam file with a header.
387+ // This is necessary, as the @HD line may not be long enough to check for the BAM signature.
388+ if (line == NULL ) fatalError (" samblaster: Input file is empty. Exiting.\n " );
389+ if (line->buffer [0 ] == ' @' ) return ;
390+
391+ // If a SAM file has no header, an alignment row should easily be long enough.
392+ if (line->bufLen <= offsets[count-1 ]) fatalError (" samblaster: Input file is empty. Exiting.\n " );
393+ // Check for the BAM signature.
394+ for (int i=0 ; i<count; i++)
395+ {
396+ if ((int )line->buffer [offsets[i]] != values[i]) return ;
397+ }
398+
399+ // If we are here, we almost certainly have a bamfile.
400+ fatalError (" samblaster: Input file appears to be in BAM format. SAM input is required. Exiting.\n " );
401+ }
402+
373403// /////////////////////////////////////////////////////////////////////////////
374404// SAM and signature set related structures.
375405// /////////////////////////////////////////////////////////////////////////////
@@ -518,9 +548,12 @@ struct state_struct
518548 char * unmappedClippedFileName;
519549 sigSet_t * sigs;
520550 seqMap_t seqs;
551+ UINT32 * seqLens;
552+ UINT64 * seqOffs;
521553 splitLine_t ** splitterArray;
522554 int splitterArrayMaxSize;
523555 UINT32 sigArraySize;
556+ int binCount;
524557 int minNonOverlap;
525558 int maxSplitCount;
526559 int minIndelSize;
@@ -585,31 +618,38 @@ void deleteState(state_t * s)
585618 {
586619 free ((char *)(iter->first ));
587620 }
621+ if (s->seqLens != NULL ) free (s->seqLens );
622+ if (s->seqOffs != NULL ) free (s->seqOffs );
588623 delete s;
589624}
590625
591626
592627// /////////////////////////////////////////////////////////////////////////////
593628// Signatures
594629// /////////////////////////////////////////////////////////////////////////////
595-
630+ // We now calculate signatures as offsets into a super contig that includes the entire genome.
631+ // And partition it into equaly sized bins.
632+ // This performs better on genomes with large numbers of small contigs,
633+ // without performance degradation on more standard genomes.
634+ // Thanks to https://github.com/carsonhh for the suggestion.
635+ // ///////////////////////////////////////////////////////////////////////////
596636inline sgn_t calcSig (splitLine_t * first, splitLine_t * second)
597637{
598638 // Total nonsense to get the compiler to actually work.
599- UINT64 t1 = first->pos ;
639+ UINT64 t1 = first->binPos ;
600640 UINT64 t2 = t1 << 32 ;
601- UINT64 final = t2 | second->pos ;
641+ UINT64 final = t2 | second->binPos ;
602642 return (sgn_t )final ;
603643}
604644
605- inline UINT32 calcSigArrOff (splitLine_t * first, splitLine_t * second, seqMap_t & seqs )
645+ inline UINT32 calcSigArrOff (splitLine_t * first, splitLine_t * second, int binCount )
606646{
607- UINT32 s1 = (first->seqNum * 2 ) + (isReverseStrand (first) ? 1 : 0 );
608- UINT32 s2 = (second->seqNum * 2 ) + (isReverseStrand (second) ? 1 : 0 );
609- UINT32 retval = (s1 * seqs. size () * 2 ) + s2;
647+ UINT32 s1 = (first->binNum * 2 ) + (isReverseStrand (first) ? 1 : 0 );
648+ UINT32 s2 = (second->binNum * 2 ) + (isReverseStrand (second) ? 1 : 0 );
649+ UINT32 retval = (s1 * binCount * 2 ) + s2;
610650#ifdef DEBUG
611651 fprintf (stderr, " 1st %d %d -> %d 2nd %d %d -> %d count %lu result %d\n " ,
612- first->seqNum , isReverseStrand (first), s1, second->seqNum , isReverseStrand (second), s2, seqs. size () , retval);
652+ first->binNum , isReverseStrand (first), s1, second->binNum , isReverseStrand (second), s2, binCount , retval);
613653#endif
614654 return retval;
615655}
@@ -737,6 +777,8 @@ inline int getEndDiag(splitLine_t * line)
737777// /////////////////////////////////////////////////////////////////////////////
738778// Process SAM Blocks
739779// /////////////////////////////////////////////////////////////////////////////
780+ #define BIN_SHIFT 27 // bin window is 27 bits wide
781+ #define BIN_MASK ((1 << 27 )-1 ) // bin window is 27 bits wide
740782
741783// This is apparently no longer called.
742784void outputSAMBlock (splitLine_t * block, FILE * output)
@@ -773,8 +815,9 @@ inline void swapPtrs(splitLine_t ** first, splitLine_t ** second)
773815void brokenBlock (splitLine_t *block, int count)
774816{
775817 char * temp;
776- asprintf (&temp, " samblaster: Can't find first and/or second of pair in sam block of length %d for id: %s\n %s\n " ,
777- count, block->fields [QNAME], " samblaster: Are you sure the input is sorted by read ids?" );
818+ asprintf (&temp, " samblaster: Can't find first and/or second of pair in sam block of length %d for id: %s\n %s%s:%s\n %s" ,
819+ count, block->fields [QNAME], " samblaster: At location: " , block->fields [RNAME], block->fields [POS],
820+ " samblaster: Are you sure the input is sorted by read ids?" );
778821 fatalError (temp);
779822}
780823
@@ -841,21 +884,27 @@ void markDupsDiscordants(splitLine_t * block, state_t * state)
841884 int mask = (FIRST_SEG | SECOND_SEG);
842885 // Process the first of the pair.
843886 // Get the list of reads that match the second of the pair.
844- int count = fillSplitterArray<false >(block, state, second->flag & mask, true );
845- for (int i=0 ; i<count; ++i)
887+ if (isMapped (first))
846888 {
847- splitLine_t * line = state->splitterArray [i];
848- addTag (line, " MC:Z:" , first->fields [CIGAR]);
849- addTag (line, " MQ:i:" , first->fields [MAPQ]);
889+ int count = fillSplitterArray<false >(block, state, second->flag & mask, true );
890+ for (int i=0 ; i<count; ++i)
891+ {
892+ splitLine_t * line = state->splitterArray [i];
893+ addTag (line, " MC:Z:" , first->fields [CIGAR]);
894+ addTag (line, " MQ:i:" , first->fields [MAPQ]);
895+ }
850896 }
851897 // Process the second of the pair.
852898 // Get the list of reads that match the first of the pair.
853- count = fillSplitterArray<false >(block, state, first->flag & mask, true );
854- for (int i=0 ; i<count; ++i)
899+ if (isMapped (second))
855900 {
856- splitLine_t * line = state->splitterArray [i];
857- addTag (line, " MC:Z:" , second->fields [CIGAR]);
858- addTag (line, " MQ:i:" , second->fields [MAPQ]);
901+ count = fillSplitterArray<false >(block, state, first->flag & mask, true );
902+ for (int i=0 ; i<count; ++i)
903+ {
904+ splitLine_t * line = state->splitterArray [i];
905+ addTag (line, " MC:Z:" , second->fields [CIGAR]);
906+ addTag (line, " MQ:i:" , second->fields [MAPQ]);
907+ }
859908 }
860909 }
861910
@@ -878,17 +927,26 @@ void markDupsDiscordants(splitLine_t * block, state_t * state)
878927 // Calculate and store the second position and sequence name.
879928 calcOffsets (second);
880929 second->seqNum = getSeqNum (second, RNAME, state);
930+ UINT64 seqOff = state->seqOffs [second->seqNum ]; // genome relative position
931+ second->binNum = (seqOff + second->pos ) >> BIN_SHIFT;
932+ second->binPos = (seqOff + second->pos ) & BIN_MASK;
933+
881934 if (orphan)
882935 {
883936 // We have an orphan, so we just zero out the pos and seqnum
884937 first->pos = 0 ;
885938 first->seqNum = 0 ;
939+ first->binNum = 0 ;
940+ first->binPos = 0 ;
886941 }
887942 else
888943 {
889944 // Not an orphan, so handle first on its own.
890945 calcOffsets (first);
891946 first->seqNum = getSeqNum (first, RNAME, state);
947+ seqOff = state->seqOffs [first->seqNum ]; // genome relative position
948+ first->binNum = (seqOff + first->pos ) >> BIN_SHIFT;
949+ first->binPos = (seqOff + first->pos ) & BIN_MASK;
892950 }
893951
894952 // The fact of which alignment is first or second in the template is not relevant for determining dups.
@@ -900,7 +958,7 @@ void markDupsDiscordants(splitLine_t * block, state_t * state)
900958 // Now find the signature of the pair.
901959 sgn_t sig = calcSig (first, second);
902960 // Calculate the offset into the signatures array.
903- UINT32 off = calcSigArrOff (first, second, state->seqs );
961+ UINT32 off = calcSigArrOff (first, second, state->binCount );
904962 // Attempt insert into the sigs structure.
905963 // The return value will tell us if it was already there.
906964 bool insert = hashTableInsert (&(state->sigs [off]), sig);
@@ -1136,14 +1194,14 @@ void processSAMBlock(splitLine_t * block, state_t * state)
11361194 splitCount += 1 ;
11371195 }
11381196 }
1197+
11391198 disposeSplitLines (block);
11401199}
11411200
11421201
11431202// /////////////////////////////////////////////////////////////////////////////
11441203// Main Routine with helpers.
11451204// /////////////////////////////////////////////////////////////////////////////
1146-
11471205void printPGsamLine (FILE * f, state_t * s)
11481206{
11491207 if (f == NULL ) return ;
@@ -1394,24 +1452,52 @@ int main (int argc, char *argv[])
13941452
13951453 // Read in the SAM header and create the seqs structure.
13961454 state->seqs [strdup (" *" )] = 0 ;
1455+ state->seqLens = (UINT32*)calloc (1 , sizeof (UINT32)); // initialize to 0
1456+ state->seqOffs = (UINT64*)calloc (1 , sizeof (UINT64)); // initialize to 0
13971457 int count = 1 ;
1458+ UINT64 totalLen = 0 ;
13981459 splitLine_t * line;
1399- while (true )
1460+ // Read the first line to prime the loop, and also to allow checking for malformed input.
1461+ line = readLine (state->inputFile );
1462+ checkBAMfile (line);
1463+ while (line != NULL && line->fields [0 ][0 ] == ' @' )
14001464 {
1401- line = readLine (state->inputFile );
1402- // Check if we have exhausted the header.
1403- if (line == NULL || line->fields [0 ][0 ] != ' @' ) break ;
14041465 // Process input line to see if it defines a sequence.
14051466 if (streq (line->fields [0 ], " @SQ" ))
14061467 {
1468+ char * seqID = NULL ;
1469+ int seqNum = 0 ;
1470+ UINT32 seqLen = 0 ;
1471+ UINT64 seqOff = 0 ;
14071472 for (int i=1 ; i<line->numFields ; ++i)
14081473 {
14091474 if (strncmp (line->fields [i], " SN:" , 3 ) == 0 )
14101475 {
1411- // Unless we are marking dups, we don't need to use sequence numbers.
1412- if (!state-> acceptDups ) state-> seqs [ strdup (line-> fields [i]+ 3 )] = count;
1476+ seqID = line-> fields [i]+ 3 ;
1477+ seqNum = count;
14131478 count += 1 ;
14141479 }
1480+ else if (strncmp (line->fields [i], " LN:" , 3 ) == 0 )
1481+ {
1482+ seqLen = (UINT32)str2int (line->fields [i]+3 );
1483+ seqOff = totalLen;
1484+ totalLen += (UINT64)(seqLen+1 );
1485+ }
1486+ }
1487+
1488+ // Unless we are marking dups, we don't need to use sequence numbers.
1489+ if (!state->acceptDups )
1490+ {
1491+ // grow seqLens and seqOffs arrays
1492+ if (seqNum % 32768 == 1 )
1493+ {
1494+ state->seqLens = (UINT32*)realloc (state->seqLens , (seqNum+32768 )*sizeof (UINT32));
1495+ state->seqOffs = (UINT64*)realloc (state->seqOffs , (seqNum+32768 )*sizeof (UINT64));
1496+ }
1497+
1498+ state->seqs [strdup (seqID)] = seqNum;
1499+ state->seqLens [seqNum] = seqLen;
1500+ state->seqOffs [seqNum] = seqOff;
14151501 }
14161502 }
14171503 // Output the header line.
@@ -1421,7 +1507,11 @@ int main (int argc, char *argv[])
14211507 if (state->splitterFile != NULL )
14221508 writeLine (line, state->splitterFile );
14231509 disposeSplitLines (line);
1510+
1511+ // Read in next line.
1512+ line = readLine (state->inputFile );
14241513 }
1514+
14251515 // Output the @PG header lines.
14261516 printPGsamLine (state->outputFile , state);
14271517 printPGsamLine (state->discordantFile , state);
@@ -1443,12 +1533,14 @@ int main (int argc, char *argv[])
14431533 if (!state->acceptDups )
14441534 {
14451535 // Make sure we can handle the number of sequences.
1446- if (count >= (1 << 15 ))
1536+ int binCount = (totalLen >> BIN_SHIFT)+1 ;
1537+ if (binCount >= (1 << 15 ))
14471538 {
14481539 fatalError (" samblaster: Too many sequences in header of input sam file. Exiting.\n " );
14491540 }
14501541
1451- state->sigArraySize = count * count * 4 ;
1542+ state->binCount = binCount;
1543+ state->sigArraySize = binCount * binCount * 4 ;
14521544 state->sigs = (sigSet_t *) malloc (state->sigArraySize * sizeof (sigSet_t));
14531545 if (state->sigs == NULL ) fatalError (" samblaster: Unable to allocate signature set array." );
14541546 for (UINT32 i=0 ; i<state->sigArraySize ; i++) hashTableInit (&(state->sigs [i]));
0 commit comments