Skip to content

Commit dadeb37

Browse files
committed
Merge branch 's23test'
2 parents 3fe403f + b7ad2cf commit dadeb37

File tree

6 files changed

+130
-38
lines changed

6 files changed

+130
-38
lines changed

LICENSE.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
The MIT License (MIT)
22

3-
Copyright (c) 2013,2014 Gregory G. Faust
3+
Copyright (c) 2013-2016 Gregory G. Faust
44

55
Permission is hereby granted, free of charge, to any person obtaining a copy
66
of this software and associated documentation files (the "Software"), to deal

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Determine the samblaster build number
2-
BUILDNUM = 22
2+
BUILDNUM = 23
33
# INTERNAL = TRUE
44

55
OBJS = samblaster.o sbhash.o

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ Click the preceeding link or download the file from this repository.
1111

1212
---
1313

14-
**Current version:** 0.1.22
14+
**Current version:** 0.1.23
1515

1616
Support for Linux and OSX (Version 10.7 or higher).
1717

samblaster.cpp

Lines changed: 125 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/* -*- Mode: C++ ; indent-tabs-mode: nil ; c-file-style: "stroustrup" -*-
1+
/* -*- mode: C++ ; indent-tabs-mode: nil ; c-file-style: "stroustrup" -*-
22
33
Project: samblaster
44
Fast mark duplicates in read-ID grouped SAM file.
@@ -10,7 +10,7 @@
1010
1111
License Information:
1212
13-
Copyright 2013-2015 Gregory G. Faust
13+
Copyright 2013-2016 Gregory G. Faust
1414
1515
Licensed under the MIT license (the "License");
1616
You may not use this file except in compliance with the License.
@@ -121,7 +121,7 @@ inline UINT64 diffTVs (struct timeval * startTV, struct timeval * endTV)
121121
// and also so that we can keep a freelist of them.
122122

123123
// We need to pre-define these for the SAM specific fields.
124-
typedef UINT32 pos_t; // Type for reference offsets.
124+
typedef UINT32 pos_t; // Type for reference offsets (pos can be negative w/ softclips, but still works fine as unsigned rollover).
125125
typedef UINT64 sgn_t; // Type for signatures for offsets and lengths.
126126
// And the type itself for the next pointer.
127127
typedef struct splitLine splitLine_t;
@@ -142,6 +142,8 @@ struct splitLine
142142
int flag;
143143
pos_t pos;
144144
int seqNum;
145+
pos_t binPos;
146+
int binNum;
145147
int SQO;
146148
int EQO;
147149
int sclip;
@@ -370,6 +372,34 @@ inline void writeLine(splitLine_t * line, FILE * output)
370372
outputString(line->buffer, output);
371373
}
372374

375+
// Check the first line of a file (e.g. input) for bam signature.
376+
void checkBAMfile(splitLine_t * line)
377+
{
378+
// If the file is a bam file, we can't rely on fields.
379+
// So, look at the underlying buffer for the line.
380+
381+
// First define the signature to look for.
382+
int values[] = {31, -117, 8, 4, 66, 67, 2};
383+
int offsets[] = {0, 1, 2, 3, 12, 13, 14};
384+
int count = 7;
385+
386+
// Check for empty file or likely a sam file with a header.
387+
// This is necessary, as the @HD line may not be long enough to check for the BAM signature.
388+
if (line == NULL) fatalError("samblaster: Input file is empty. Exiting.\n");
389+
if (line->buffer[0] == '@') return;
390+
391+
// If a SAM file has no header, an alignment row should easily be long enough.
392+
if (line->bufLen <= offsets[count-1]) fatalError("samblaster: Input file is empty. Exiting.\n");
393+
// Check for the BAM signature.
394+
for (int i=0; i<count; i++)
395+
{
396+
if ((int)line->buffer[offsets[i]] != values[i]) return;
397+
}
398+
399+
// If we are here, we almost certainly have a bamfile.
400+
fatalError("samblaster: Input file appears to be in BAM format. SAM input is required. Exiting.\n");
401+
}
402+
373403
///////////////////////////////////////////////////////////////////////////////
374404
// SAM and signature set related structures.
375405
///////////////////////////////////////////////////////////////////////////////
@@ -518,9 +548,12 @@ struct state_struct
518548
char * unmappedClippedFileName;
519549
sigSet_t * sigs;
520550
seqMap_t seqs;
551+
UINT32 * seqLens;
552+
UINT64 * seqOffs;
521553
splitLine_t ** splitterArray;
522554
int splitterArrayMaxSize;
523555
UINT32 sigArraySize;
556+
int binCount;
524557
int minNonOverlap;
525558
int maxSplitCount;
526559
int minIndelSize;
@@ -585,31 +618,38 @@ void deleteState(state_t * s)
585618
{
586619
free((char *)(iter->first));
587620
}
621+
if (s->seqLens != NULL) free(s->seqLens);
622+
if (s->seqOffs != NULL) free(s->seqOffs);
588623
delete s;
589624
}
590625

591626

592627
///////////////////////////////////////////////////////////////////////////////
593628
// Signatures
594629
///////////////////////////////////////////////////////////////////////////////
595-
630+
// We now calculate signatures as offsets into a super contig that includes the entire genome.
631+
// And partition it into equaly sized bins.
632+
// This performs better on genomes with large numbers of small contigs,
633+
// without performance degradation on more standard genomes.
634+
// Thanks to https://github.com/carsonhh for the suggestion.
635+
/////////////////////////////////////////////////////////////////////////////
596636
inline sgn_t calcSig(splitLine_t * first, splitLine_t * second)
597637
{
598638
// Total nonsense to get the compiler to actually work.
599-
UINT64 t1 = first->pos;
639+
UINT64 t1 = first->binPos;
600640
UINT64 t2 = t1 << 32;
601-
UINT64 final = t2 | second->pos;
641+
UINT64 final = t2 | second->binPos;
602642
return (sgn_t)final;
603643
}
604644

605-
inline UINT32 calcSigArrOff(splitLine_t * first, splitLine_t * second, seqMap_t & seqs)
645+
inline UINT32 calcSigArrOff(splitLine_t * first, splitLine_t * second, int binCount)
606646
{
607-
UINT32 s1 = (first->seqNum * 2) + (isReverseStrand(first) ? 1 : 0);
608-
UINT32 s2 = (second->seqNum * 2) + (isReverseStrand(second) ? 1 : 0);
609-
UINT32 retval = (s1 * seqs.size() * 2) + s2;
647+
UINT32 s1 = (first->binNum * 2) + (isReverseStrand(first) ? 1 : 0);
648+
UINT32 s2 = (second->binNum * 2) + (isReverseStrand(second) ? 1 : 0);
649+
UINT32 retval = (s1 * binCount * 2) + s2;
610650
#ifdef DEBUG
611651
fprintf(stderr, "1st %d %d -> %d 2nd %d %d -> %d count %lu result %d\n",
612-
first->seqNum, isReverseStrand(first), s1, second->seqNum, isReverseStrand(second), s2, seqs.size(), retval);
652+
first->binNum, isReverseStrand(first), s1, second->binNum, isReverseStrand(second), s2, binCount, retval);
613653
#endif
614654
return retval;
615655
}
@@ -737,6 +777,8 @@ inline int getEndDiag(splitLine_t * line)
737777
///////////////////////////////////////////////////////////////////////////////
738778
// Process SAM Blocks
739779
///////////////////////////////////////////////////////////////////////////////
780+
#define BIN_SHIFT 27 //bin window is 27 bits wide
781+
#define BIN_MASK ((1 << 27)-1) //bin window is 27 bits wide
740782

741783
// This is apparently no longer called.
742784
void outputSAMBlock(splitLine_t * block, FILE * output)
@@ -773,8 +815,9 @@ inline void swapPtrs(splitLine_t ** first, splitLine_t ** second)
773815
void brokenBlock(splitLine_t *block, int count)
774816
{
775817
char * temp;
776-
asprintf(&temp, "samblaster: Can't find first and/or second of pair in sam block of length %d for id: %s\n%s\n",
777-
count, block->fields[QNAME], "samblaster: Are you sure the input is sorted by read ids?");
818+
asprintf(&temp, "samblaster: Can't find first and/or second of pair in sam block of length %d for id: %s\n%s%s:%s\n%s",
819+
count, block->fields[QNAME], "samblaster: At location: ", block->fields[RNAME], block->fields[POS],
820+
"samblaster: Are you sure the input is sorted by read ids?");
778821
fatalError(temp);
779822
}
780823

@@ -841,21 +884,27 @@ void markDupsDiscordants(splitLine_t * block, state_t * state)
841884
int mask = (FIRST_SEG | SECOND_SEG);
842885
// Process the first of the pair.
843886
// Get the list of reads that match the second of the pair.
844-
int count = fillSplitterArray<false>(block, state, second->flag & mask, true);
845-
for (int i=0; i<count; ++i)
887+
if (isMapped(first))
846888
{
847-
splitLine_t * line = state->splitterArray[i];
848-
addTag(line, " MC:Z:", first->fields[CIGAR]);
849-
addTag(line, " MQ:i:", first->fields[MAPQ]);
889+
int count = fillSplitterArray<false>(block, state, second->flag & mask, true);
890+
for (int i=0; i<count; ++i)
891+
{
892+
splitLine_t * line = state->splitterArray[i];
893+
addTag(line, " MC:Z:", first->fields[CIGAR]);
894+
addTag(line, " MQ:i:", first->fields[MAPQ]);
895+
}
850896
}
851897
// Process the second of the pair.
852898
// Get the list of reads that match the first of the pair.
853-
count = fillSplitterArray<false>(block, state, first->flag & mask, true);
854-
for (int i=0; i<count; ++i)
899+
if (isMapped(second))
855900
{
856-
splitLine_t * line = state->splitterArray[i];
857-
addTag(line, " MC:Z:", second->fields[CIGAR]);
858-
addTag(line, " MQ:i:", second->fields[MAPQ]);
901+
count = fillSplitterArray<false>(block, state, first->flag & mask, true);
902+
for (int i=0; i<count; ++i)
903+
{
904+
splitLine_t * line = state->splitterArray[i];
905+
addTag(line, " MC:Z:", second->fields[CIGAR]);
906+
addTag(line, " MQ:i:", second->fields[MAPQ]);
907+
}
859908
}
860909
}
861910

@@ -878,17 +927,26 @@ void markDupsDiscordants(splitLine_t * block, state_t * state)
878927
// Calculate and store the second position and sequence name.
879928
calcOffsets(second);
880929
second->seqNum = getSeqNum(second, RNAME, state);
930+
UINT64 seqOff = state->seqOffs[second->seqNum]; //genome relative position
931+
second->binNum = (seqOff + second->pos) >> BIN_SHIFT;
932+
second->binPos = (seqOff + second->pos) & BIN_MASK;
933+
881934
if (orphan)
882935
{
883936
// We have an orphan, so we just zero out the pos and seqnum
884937
first->pos = 0;
885938
first->seqNum = 0;
939+
first->binNum = 0;
940+
first->binPos = 0;
886941
}
887942
else
888943
{
889944
// Not an orphan, so handle first on its own.
890945
calcOffsets(first);
891946
first->seqNum = getSeqNum(first, RNAME, state);
947+
seqOff = state->seqOffs[first->seqNum]; //genome relative position
948+
first->binNum = (seqOff + first->pos) >> BIN_SHIFT;
949+
first->binPos = (seqOff + first->pos) & BIN_MASK;
892950
}
893951

894952
// The fact of which alignment is first or second in the template is not relevant for determining dups.
@@ -900,7 +958,7 @@ void markDupsDiscordants(splitLine_t * block, state_t * state)
900958
// Now find the signature of the pair.
901959
sgn_t sig = calcSig(first, second);
902960
// Calculate the offset into the signatures array.
903-
UINT32 off = calcSigArrOff(first, second, state->seqs);
961+
UINT32 off = calcSigArrOff(first, second, state->binCount);
904962
// Attempt insert into the sigs structure.
905963
// The return value will tell us if it was already there.
906964
bool insert = hashTableInsert(&(state->sigs[off]), sig);
@@ -1136,14 +1194,14 @@ void processSAMBlock(splitLine_t * block, state_t * state)
11361194
splitCount += 1;
11371195
}
11381196
}
1197+
11391198
disposeSplitLines(block);
11401199
}
11411200

11421201

11431202
///////////////////////////////////////////////////////////////////////////////
11441203
// Main Routine with helpers.
11451204
///////////////////////////////////////////////////////////////////////////////
1146-
11471205
void printPGsamLine(FILE * f, state_t * s)
11481206
{
11491207
if (f == NULL) return;
@@ -1394,24 +1452,52 @@ int main (int argc, char *argv[])
13941452

13951453
// Read in the SAM header and create the seqs structure.
13961454
state->seqs[strdup("*")] = 0;
1455+
state->seqLens = (UINT32*)calloc(1, sizeof(UINT32)); //initialize to 0
1456+
state->seqOffs = (UINT64*)calloc(1, sizeof(UINT64)); //initialize to 0
13971457
int count = 1;
1458+
UINT64 totalLen = 0;
13981459
splitLine_t * line;
1399-
while (true)
1460+
// Read the first line to prime the loop, and also to allow checking for malformed input.
1461+
line = readLine(state->inputFile);
1462+
checkBAMfile(line);
1463+
while (line != NULL && line->fields[0][0] == '@')
14001464
{
1401-
line = readLine(state->inputFile);
1402-
// Check if we have exhausted the header.
1403-
if (line == NULL || line->fields[0][0] != '@') break;
14041465
// Process input line to see if it defines a sequence.
14051466
if (streq(line->fields[0], "@SQ"))
14061467
{
1468+
char * seqID = NULL;
1469+
int seqNum = 0;
1470+
UINT32 seqLen = 0;
1471+
UINT64 seqOff = 0;
14071472
for (int i=1; i<line->numFields; ++i)
14081473
{
14091474
if (strncmp(line->fields[i], "SN:", 3) == 0)
14101475
{
1411-
// Unless we are marking dups, we don't need to use sequence numbers.
1412-
if (!state->acceptDups) state->seqs[strdup(line->fields[i]+3)] = count;
1476+
seqID = line->fields[i]+3;
1477+
seqNum = count;
14131478
count += 1;
14141479
}
1480+
else if (strncmp(line->fields[i], "LN:", 3) == 0)
1481+
{
1482+
seqLen = (UINT32)str2int(line->fields[i]+3);
1483+
seqOff = totalLen;
1484+
totalLen += (UINT64)(seqLen+1);
1485+
}
1486+
}
1487+
1488+
// Unless we are marking dups, we don't need to use sequence numbers.
1489+
if (!state->acceptDups)
1490+
{
1491+
// grow seqLens and seqOffs arrays
1492+
if(seqNum % 32768 == 1)
1493+
{
1494+
state->seqLens = (UINT32*)realloc(state->seqLens, (seqNum+32768)*sizeof(UINT32));
1495+
state->seqOffs = (UINT64*)realloc(state->seqOffs, (seqNum+32768)*sizeof(UINT64));
1496+
}
1497+
1498+
state->seqs[strdup(seqID)] = seqNum;
1499+
state->seqLens[seqNum] = seqLen;
1500+
state->seqOffs[seqNum] = seqOff;
14151501
}
14161502
}
14171503
// Output the header line.
@@ -1421,7 +1507,11 @@ int main (int argc, char *argv[])
14211507
if (state->splitterFile != NULL)
14221508
writeLine(line, state->splitterFile);
14231509
disposeSplitLines(line);
1510+
1511+
// Read in next line.
1512+
line = readLine(state->inputFile);
14241513
}
1514+
14251515
// Output the @PG header lines.
14261516
printPGsamLine(state->outputFile, state);
14271517
printPGsamLine(state->discordantFile, state);
@@ -1443,12 +1533,14 @@ int main (int argc, char *argv[])
14431533
if (!state->acceptDups)
14441534
{
14451535
// Make sure we can handle the number of sequences.
1446-
if (count >= (1 << 15))
1536+
int binCount = (totalLen >> BIN_SHIFT)+1;
1537+
if (binCount >= (1 << 15))
14471538
{
14481539
fatalError("samblaster: Too many sequences in header of input sam file. Exiting.\n");
14491540
}
14501541

1451-
state->sigArraySize = count * count * 4;
1542+
state->binCount = binCount;
1543+
state->sigArraySize = binCount * binCount * 4;
14521544
state->sigs = (sigSet_t *) malloc(state->sigArraySize * sizeof(sigSet_t));
14531545
if (state->sigs == NULL) fatalError("samblaster: Unable to allocate signature set array.");
14541546
for (UINT32 i=0; i<state->sigArraySize; i++) hashTableInit(&(state->sigs[i]));

sbhash.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
1111
License Information:
1212
13-
Copyright 2013-2015 Gregory G. Faust
13+
Copyright 2013-2016 Gregory G. Faust
1414
1515
Licensed under the MIT license (the "License");
1616
You may not use this file except in compliance with the License.

sbhash.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
1111
License Information:
1212
13-
Copyright 2013-2015 Gregory G. Faust
13+
Copyright 2013-2016 Gregory G. Faust
1414
1515
Licensed under the MIT license (the "License");
1616
You may not use this file except in compliance with the License.

0 commit comments

Comments
 (0)