From 525ca02b42156d4faf8a83dcb88f1aaf78197b89 Mon Sep 17 00:00:00 2001 From: GregFaust Date: Sun, 31 May 2015 21:25:58 -0400 Subject: [PATCH 1/3] Pre-release Version A of samblaster 0.1.22 - Add -M option. Use of -M is backward compatible with the samblaster behavior since release 0.1.15 in which both flag values 0x100 and 0x800 were treated as supplementary alignments for the purposes of identifying split-reads. The new default behavior is to treat only reads flagged with 0x800 as supplementary. The -M option can (and should) be used with older alignment files when split reads were marked 0x100, or ones produced by recent versions of BWA MEM using its -M option. - Bug fixes for buffer overruns that sometimes occurred while adding the duplicate flag or mate tag information to output lines. - The 'N' CIGAR op is now supported, as well as multiple clip ops at the beginning or end of a CIGAR string. - samblaster now outputs an @PG line in the header of all output sam files. - samblaster may now support headers with up to 32,000 contigs if the system that it is run on can allocate sufficiently large arrays. --- Makefile | 4 +- samblaster.cpp | 156 +++++++++++++++++++++++++++++++++++++------------ sbhash.cpp | 18 ++++-- sbhash.h | 4 +- 4 files changed, 134 insertions(+), 48 deletions(-) diff --git a/Makefile b/Makefile index 3227f08..91b0f1c 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ # Determine the samblaster build number -BUILDNUM = 21 -# INTERNAL = TRUE +BUILDNUM = 22 +INTERNAL = TRUE OBJS = samblaster.o sbhash.o diff --git a/samblaster.cpp b/samblaster.cpp index ba00253..0aca808 100644 --- a/samblaster.cpp +++ b/samblaster.cpp @@ -10,7 +10,7 @@ License Information: - Copyright 2013,2014 Gregory G. Faust + Copyright 2013-2015 Gregory G. Faust Licensed under the MIT license (the "License"); You may not use this file except in compliance with the License. 
@@ -264,6 +264,24 @@ void unsplitSplitLine(splitLine_t * line) line->split = false; } +// Resize the buffer of a splitLine. +// Since the new buffer may not be in the same place, we need to first unsplit, resize, then resplit. +void resizeSplitLine(splitLine_t * line, int newsize) +{ + // First unsplit it. + unsplitSplitLine(line); + // Resize the buffer, giving a little extra room. + line->maxBufLen = newsize + 50; + line->buffer = (char *)realloc(line->buffer, line->maxBufLen); + if (line->buffer == NULL) + { + fatalError("samblaster: Failed to reallocate to a larger read buffer size.\n"); + } + // Now resplit the line. + splitSplitLine(line, line->numFields); +} + + // Change a field into a value. // This will be tough given how we output lines. // So, we might have to try a few things. @@ -282,9 +300,10 @@ void changeFieldSplitLine(splitLine_t * line, int fnum, char * newValue) // This should never happen, but to be robust we need to check. // It is messy to fix it, as all the field ptrs will now be wrong. // For now, punt. - if ((size_t)(line->bufLen + move) > line->maxBufLen) + if ((size_t)(line->bufLen + move) >= line->maxBufLen) { - fatalError("samblaster: New buffer length exceeds maximum while changing field value.\n"); + resizeSplitLine(line, line->bufLen + move); + fp = line->fields[fnum]; } // Calculate the size of the tail that is still needed. int distance = 1 + line->bufLen - (fp - line->buffer) - oldLen; @@ -304,18 +323,20 @@ void addTag(splitLine_t * line, const char * header, const char * val) int hl = strlen(header); int vl = strlen(val); // Make sure everything will fit. - char * ptr = line->buffer + line->bufLen - 1; - line->bufLen += hl + vl; - if ((size_t)line->bufLen > line->maxBufLen) + int newlen = line->bufLen + hl + vl; + if ((size_t)newlen >= line->maxBufLen) { - fatalError("samblaster: New buffer length exceeds maximum while adding Mate tags.\n"); + resizeSplitLine(line, newlen); } // Copy over the header and the value. 
+ char * ptr = line->buffer + line->bufLen - 1; ptr = (char *)mempcpy(ptr, header, hl); ptr = (char *)mempcpy(ptr, val, vl); // Add the null terminator for the field, and for the record. ptr[0] = 0; ptr[1] = 0; + // Fix the buffer length. + line->bufLen = newlen; } @@ -369,9 +390,9 @@ inline void writeLine(splitLine_t * line, FILE * output) // Define SAM flag accessors. -#define MULTI_SEGS 0x1 -#define FIRST_SEG 0x40 -#define SECOND_SEG 0x80 +#define MULTI_SEGS 0x1 +#define FIRST_SEG 0x40 +#define SECOND_SEG 0x80 inline bool checkFlag(splitLine_t * line, int bits) { return ((line->flag & bits) != 0); } inline void setFlag(splitLine_t * line, int bits) { line->flag |= bits; } @@ -398,8 +419,21 @@ inline bool isFirstRead(splitLine_t * line) { return checkFlag(line, FIRST_SEG); inline bool isSecondRead(splitLine_t * line) { return checkFlag(line, SECOND_SEG); } +// These determine alignment type. +// Things may get more complicated than this once we have alternate contigs such as in build 38 of human genome. +inline bool isPrimaryAlignment(splitLine_t * line) +{ return !(checkFlag(line, 0x100) || checkFlag(line, 0x800)); } + +// We have to handle secondary and complementary alignments differently depending on compatMode. +// So, we store which bits are being included in each. 
+ +int complementaryBits = 0x800; +inline bool isComplementaryAlignment(splitLine_t * line) +{ return checkFlag(line, complementaryBits); } + +int secondaryBits = 0x100; inline bool isSecondaryAlignment(splitLine_t * line) -{ return (checkFlag(line, 0x100) || checkFlag(line, 0x800)); } +{ return checkFlag(line, secondaryBits); } inline bool isDuplicate(splitLine_t * line) { return checkFlag(line, 0x400); } @@ -486,7 +520,7 @@ struct state_struct seqMap_t seqs; splitLine_t ** splitterArray; int splitterArrayMaxSize; - int sigArraySize; + UINT32 sigArraySize; int minNonOverlap; int maxSplitCount; int minIndelSize; @@ -497,6 +531,7 @@ struct state_struct bool excludeDups; bool removeDups; bool addMateTags; + bool compatMode; bool quiet; }; typedef struct state_struct state_t; @@ -524,6 +559,7 @@ state_t * makeState () s->excludeDups = false; s->removeDups = false; s->addMateTags = false; + s->compatMode = false; s->quiet = false; // Start this as -1 to indicate we don't know yet. // Once we are outputting our first line, we will decide. @@ -537,7 +573,12 @@ state_t * makeState () void deleteState(state_t * s) { free(s->splitterArray); - if (s->sigs != NULL) delete[] s->sigs; + if (s->sigs != NULL) + { + // delete[] s->sigs; + for (UINT32 i=0; isigArraySize; i++) deleteHashTable(&(s->sigs[i])); + free (s->sigs); + } for (seqMap_t::iterator iter = s->seqs.begin(); iter != s->seqs.end(); ++iter) { free((char *)(iter->first)); @@ -559,11 +600,11 @@ inline sgn_t calcSig(splitLine_t * first, splitLine_t * second) return (sgn_t)final; } -inline int calcSigArrOff(splitLine_t * first, splitLine_t * second, seqMap_t & seqs) +inline UINT32 calcSigArrOff(splitLine_t * first, splitLine_t * second, seqMap_t & seqs) { - int s1 = (first->seqNum * 2) + (isReverseStrand(first) ? 1 : 0); - int s2 = (second->seqNum * 2) + (isReverseStrand(second) ? 1 : 0); - int retval = (s1 * seqs.size() * 2) + s2; + UINT32 s1 = (first->seqNum * 2) + (isReverseStrand(first) ? 
1 : 0); + UINT32 s2 = (second->seqNum * 2) + (isReverseStrand(second) ? 1 : 0); + UINT32 retval = (s1 * seqs.size() * 2) + s2; #ifdef DEBUG fprintf(stderr, "1st %d %d -> %d 2nd %d %d -> %d count %lu result %d\n", first->seqNum, isReverseStrand(first), s1, second->seqNum, isReverseStrand(second), s2, seqs.size(), retval); @@ -638,13 +679,14 @@ void calcOffsets(splitLine_t * line) { line->raLen += opLen; line->qaLen += opLen; + first = false; } else if (opCode == 'S' || opCode == 'H') { - if (first) line->sclip = opLen; - else line->eclip = opLen; + if (first) line->sclip += opLen; + else line->eclip += opLen; } - else if (opCode == 'D') + else if (opCode == 'D' || opCode == 'N') { line->raLen += opLen; } @@ -656,7 +698,6 @@ void calcOffsets(splitLine_t * line) { fprintf(stderr, "Unknown opcode '%c' in CIGAR string: '%s'\n", opCode, line->fields[CIGAR]); } - first = false; } line->rapos = str2pos(line->fields[POS]); if (isForwardStrand(line)) @@ -743,6 +784,7 @@ UINT64 splitCount = 0; UINT64 unmapClipCount = 0; // This is the main workhorse that determines if lines are dups or not. +template int fillSplitterArray(splitLine_t * block, state_t * state, int mask, bool flagValue); void markDupsDiscordants(splitLine_t * block, state_t * state) { @@ -754,8 +796,8 @@ void markDupsDiscordants(splitLine_t * block, state_t * state) count += 1; // Do this conversion once and store the result. line->flag = str2int(line->fields[FLAG]); - // We don't make our duplicate decisions based on secondaries. - if (isSecondaryAlignment(line)) continue; + // We make our duplicate decisions based solely on primary alignments. + if (!isPrimaryAlignment(line)) continue; // Allow unpaired reads to go through (as the second so that signature is correct). // According to the SAM spec, this must be checked first. 
if (!isPaired(line)) second = line; @@ -795,7 +837,7 @@ void markDupsDiscordants(splitLine_t * block, state_t * state) int mask = (FIRST_SEG | SECOND_SEG); // Process the first of the pair. // Get the list of reads that match the second of the pair. - int count = fillSplitterArray(block, state, second->flag & mask, true); + int count = fillSplitterArray(block, state, second->flag & mask, true); for (int i=0; isplitterArray[i]; @@ -804,7 +846,7 @@ void markDupsDiscordants(splitLine_t * block, state_t * state) } // Process the second of the pair. // Get the list of reads that match the first of the pair. - count = fillSplitterArray(block, state, first->flag & mask, true); + count = fillSplitterArray(block, state, first->flag & mask, true); for (int i=0; isplitterArray[i]; @@ -854,7 +896,7 @@ void markDupsDiscordants(splitLine_t * block, state_t * state) // Now find the signature of the pair. sgn_t sig = calcSig(first, second); // Calculate the offset into the signatures array. - int off = calcSigArrOff(first, second, state->seqs); + UINT32 off = calcSigArrOff(first, second, state->seqs); // Attempt insert into the sigs structure. // The return value will tell us if it was already there. bool insert = hashTableInsert(&(state->sigs[off]), sig); @@ -894,6 +936,7 @@ int compQOs(const void * p1, const void * p2) return (l1->SQO - l2->SQO); } +template int fillSplitterArray(splitLine_t * block, state_t * state, int mask, bool flagValue) { // Count the secondaries we have for this read (if any), and store their ptrs into an array. @@ -901,7 +944,8 @@ int fillSplitterArray(splitLine_t * block, state_t * state, int mask, bool flagV for (splitLine_t * line = block; line != NULL; line = line->next) { // For all the ones that are the current read of interest.... - if (checkFlag(line, mask) == flagValue) + // Check if they are a primary or complementary alignment. 
+ if (checkFlag(line, mask) == flagValue && !(excludeSecondaries && isSecondaryAlignment(line))) { // Add the ptr to this line to the sort array. // If it won't fit, double the array size. @@ -921,7 +965,7 @@ int fillSplitterArray(splitLine_t * block, state_t * state, int mask, bool flagV void markSplitterUnmappedClipped(splitLine_t * block, state_t * state, int mask, bool flagValue) { // Count the secondaries we have for this read (if any), and store their ptrs into an array. - int count = fillSplitterArray(block, state, mask, flagValue); + int count = fillSplitterArray(block, state, mask, flagValue); // We have the lines of interest in an array. // Decide what to do next based on the number of reads. @@ -950,16 +994,13 @@ void markSplitterUnmappedClipped(splitLine_t * block, state_t * state, int mask, // See if we need to process for splitters. if (state->splitterFile == NULL || count > state->maxSplitCount) return; + // Calculate the query positions (for sorting) and do other preprocessing. for (int i=0; isplitterArray[i]; - // If it is not a secondary. - if (!isSecondaryAlignment(line)) - { - // Make sure the primary is mapped! - if (isUnmapped(line)) return; - } + // Make sure the primary is mapped! + if (isPrimaryAlignment(line) && isUnmapped(line)) return; calcOffsets(line); } @@ -1092,9 +1133,27 @@ void processSAMBlock(splitLine_t * block, state_t * state) // Main Routine with helpers. 
/////////////////////////////////////////////////////////////////////////////// +void printPGsamLine(FILE * f, state_t * s) +{ + if (f == NULL) return; + fprintf(f, "@PG\tID:SAMBLASTER\tVN:0.1.%d.PRE.A\tCL:samblaster -i %s -o %s", BUILDNUM, s->inputFileName, s->outputFileName); + if (s->compatMode) fprintf(f, " -M"); + if (s->acceptDups) fprintf(f, " --acceptDupMarks"); + if (s->removeDups) fprintf(f, " --removeDups"); + else if (s->excludeDups && (s->discordantFile != NULL || s->splitterFile != NULL || s->unmappedClippedFile != NULL)) fprintf(f, " --excludeDups"); + if (s->addMateTags) fprintf(f, " --addMateTags"); + if (s->discordantFile != NULL) fprintf(f, " -d %s", s->discordantFileName); + if (s->splitterFile != NULL) + fprintf(f, " -s %s --maxSplitCount %d --maxUnmappedBases %d --minIndelSize %d --minNonOverlap %d", + s->splitterFileName, s->maxSplitCount, s->maxUnmappedBases, s->minIndelSize, s->minNonOverlap); + if (s->unmappedClippedFile != NULL) + fprintf(f, " -u %s --minClipSize %d", s->unmappedClippedFileName, s->minClip); + fprintf(f, "\n"); +} + void printVersionString() { - fprintf(stderr, "samblaster: Version 0.1.%d\n", BUILDNUM); + fprintf(stderr, "samblaster: Version 0.1.%d.PRE.A\n", BUILDNUM); } void printUsageString() @@ -1107,7 +1166,8 @@ void printUsageString() "Usage:\n" "For use as a post process on an aligner (eg. 
bwa mem):\n" - " bwa mem index samp.r1.fq samp.r2.fq | samblaster [-e] [-d samp.disc.sam] [-s samp.split.sam] | samtools view -Sb - > samp.out.bam\n" + " bwa mem samp.r1.fq samp.r2.fq | samblaster [-e] [-d samp.disc.sam] [-s samp.split.sam] | samtools view -Sb - > samp.out.bam\n" + " bwa mem -M samp.r1.fq samp.r2.fq | samblaster -M [-e] [-d samp.disc.sam] [-s samp.split.sam] | samtools view -Sb - > samp.out.bam\n" "For use with a pre-existing bam file to pull split, discordant and/or unmapped reads:\n" " samtools view -h samp.bam | samblaster [-a] [-e] [-d samp.disc.sam] [-s samp.split.sam] [-u samp.umc.fasta] -o /dev/null\n\n" @@ -1124,6 +1184,7 @@ void printUsageString() "-e --excludeDups Exclude reads marked as duplicates from discordant, splitter, and/or unmapped file.\n" "-r --removeDups Remove duplicates reads from all output files. (Implies --excludeDups).\n" " --addMateTags Add MC and MQ tags to all output paired-end SAM lines.\n" + "-M Run in compatibility mode; both 0x100 and 0x800 are considered chimeric. Similar to BWA MEM -M option.\n" " --maxSplitCount INT Maximum number of split alignments for a read to be included in splitter file. [2]\n" " --maxUnmappedBases INT Maximum number of un-aligned bases between two alignments to be included in splitter file. [50]\n" " --minIndelSize INT Minimum structural variant feature size for split alignments to be included in splitter file. [50]\n" @@ -1191,6 +1252,14 @@ int main (int argc, char *argv[]) { state->addMateTags = true; } + else if (streq(argv[argi],"-M")) + { + state->compatMode = true; + // In compatibility mode, both 0x100 and 0x800 are considered chimeric and can be used as splitters. + // None will be secondary. + complementaryBits = (0x100 | 0x800); + secondaryBits = 0; + } else if (streq(argv[argi],"--maxSplitCount")) { argi++; @@ -1336,6 +1405,10 @@ int main (int argc, char *argv[]) writeLine(line, state->splitterFile); disposeSplitLines(line); } + // Output the @PG header lines. 
+ printPGsamLine(state->outputFile, state); + printPGsamLine(state->discordantFile, state); + printPGsamLine(state->splitterFile, state); // Make sure we have a header. if (count == 1 && !state->acceptDups) @@ -1352,9 +1425,16 @@ int main (int argc, char *argv[]) // We can now calculate the size of the signatures array, and intialize it. if (!state->acceptDups) { + // Make sure we can handle the number of sequences. + if (count >= (1 << 15)) + { + fatalError("samblaster: Too many sequences in header of input sam file. Exiting.\n"); + } + state->sigArraySize = count * count * 4; - state->sigs = new sigSet_t[state->sigArraySize]; - for (int i=0; isigArraySize; i++) hashTableInit(&(state->sigs[i])); + state->sigs = (sigSet_t *) malloc(state->sigArraySize * sizeof(sigSet_t)); + if (state->sigs == NULL) fatalError("samblaster: Unable to allocate signature set array."); + for (UINT32 i=0; isigArraySize; i++) hashTableInit(&(state->sigs[i])); } // Now start processing the alignment records. diff --git a/sbhash.cpp b/sbhash.cpp index 54be214..4c9bc4f 100644 --- a/sbhash.cpp +++ b/sbhash.cpp @@ -10,7 +10,7 @@ License Information: - Copyright 2013,2014 Gregory G. Faust + Copyright 2013-2015 Gregory G. Faust Licensed under the MIT license (the "License"); You may not use this file except in compliance with the License. 
@@ -73,7 +73,7 @@ struct LBMallocBlock char * pushNewLBMallocBlock(int blockSize, LBMallocBlock_t **blockArrayPtr) { char * newBlock = blockMalloc(blockSize); - LBMallocBlock_t * newMallocBlock = (LBMallocBlock_t *)malloc(sizeof(LBMallocBlock_t)); + LBMallocBlock_t * newMallocBlock = (LBMallocBlock_t *)malloc(sizeof(LBMallocBlock_t)); if (newMallocBlock == NULL) fatalError("samblaster: Insufficeint memory available to allocate (more) objects."); newMallocBlock->size = blockSize; newMallocBlock->block = newBlock; @@ -177,7 +177,7 @@ inline bool isValue(UINT64 value) } #define numOfSizes 27 -static UINT32 hashTableSizes [] = {0, 23, 47, 97, 199, 409, 823, 1741, 3739, 7517, 15173, 30727, 62233, 126271, 256279, 520241, 1056323, +static UINT32 hashTableSizes [] = {0, 23, 47, 97, 199, 409, 823, 1741, 3739, 7517, 15173, 30727, 62233, 126271, 256279, 520241, 1056323, 2144977, 4355707, 8844859, 17961079, 36473443, 74066549, 150406843, 305431229, 620239453, 1259520799}; inline UINT32 hash(UINT64 value) @@ -189,7 +189,7 @@ void hashTableInit(hashTable_t * ht, int size) { ht->entries = 0; ht->size = size; - if (size == 0) + if (size == 0) { ht->table = (UINT64 *)NULL; return; @@ -212,13 +212,19 @@ hashTable::~hashTable() if (table != NULL) free(table); } +// C style delete. +void deleteHashTable(hashTable_t * ht) +{ + if (ht->table != NULL) free(ht->table); +} + void resizeHashTable(hashTable_t * ht) { // Find out what size table is next. int newsize = 0; - for (int i=0; isize) + if (hashTableSizes[i] == ht->size) { newsize = hashTableSizes[i+1]; break; diff --git a/sbhash.h b/sbhash.h index a300015..16d0ec9 100644 --- a/sbhash.h +++ b/sbhash.h @@ -10,7 +10,7 @@ License Information: - Copyright 2013,2014 Gregory G. Faust + Copyright 2013-2015 Gregory G. Faust Licensed under the MIT license (the "License"); You may not use this file except in compliance with the License. 
@@ -58,7 +58,7 @@ struct hashTable }; hashTable_t * makeHashTable(); -void deleteHashTable(); +void deleteHashTable(hashTable_t * ht); bool hashTableInsert(hashTable_t * ht, UINT64 value); void hashTableInit(hashTable_t * ht, int size=0); void freeHashTableNodes(); From da880691f2bc5acf03dc33813cefcd05e4b2cfd6 Mon Sep 17 00:00:00 2001 From: GregFaust Date: Thu, 18 Jun 2015 13:44:32 -0400 Subject: [PATCH 2/3] First attempt at final version of samblaster 0.1.22 - Add --ignoreUnmated option - First attempt at updated README.md file --- Makefile | 2 +- README.md | 40 +++++++++++++++++++++++++++++----------- samblaster.cpp | 37 ++++++++++++++++++++++++++++++------- 3 files changed, 60 insertions(+), 19 deletions(-) diff --git a/Makefile b/Makefile index 91b0f1c..15e8c08 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ # Determine the samblaster build number BUILDNUM = 22 -INTERNAL = TRUE +# INTERNAL = TRUE OBJS = samblaster.o sbhash.o diff --git a/README.md b/README.md index f43e4e3..dc48ac0 100644 --- a/README.md +++ b/README.md @@ -11,9 +11,9 @@ Click the preceeding link or download the file from this repository. --- -**Current version:** 0.1.21 +**Current version:** 0.1.22 -Support for Linux and OSX. +Support for Linux and OSX (Version 10.7 or higher). ##Summary *samblaster* is a fast and flexible program for marking duplicates in __read-id grouped1__ paired-end SAM files. @@ -37,7 +37,7 @@ See the [SAM File Format Specification](http://samtools.sourceforge.net/SAMv1.pd By default, *samblaster* reads SAM input from **stdin** and writes SAM to **stdout**. Input SAM file usually contain paired end data (see [Duplicate Identification](#DupIdentification) below), must contain a sequence header, and must be __read-id grouped1__. By default, the output SAM file will contain all the alignments in the same order as the input, with duplicates marked with SAM FLAG 0x400. The **--removeDups** option will instead remove duplicate alignments from the output file. 
-__1A read-id grouped__ SAM file is one in which all alignments for a read-id are grouped together in adjacent lines. +__1A read-id grouped__ SAM file is one in which all alignments for a read-id (QNAME) are grouped together in adjacent lines. Aligners naturally produce such files. They can also be created by sorting a SAM file by read-id. But as shown below, sorting the input to *samblaster* by read-id is not required if the alignments are already grouped. @@ -46,12 +46,17 @@ But as shown below, sorting the input to *samblaster* by read-id is not required To take input alignments directly from _bwa mem_ and output to _samtools view_ to compress SAM to BAM: ``` -bwa mem index samp.r1.fq samp.r2.fq | samblaster | samtools view -Sb - > samp.out.bam +bwa mem samp.r1.fq samp.r2.fq | samblaster | samtools view -Sb - > samp.out.bam +``` + +When using the *bwa mem* **-M** option, also use the *samblaster* **-M** option: +``` +bwa mem -M samp.r1.fq samp.r2.fq | samblaster -M | samtools view -Sb - > samp.out.bam ``` To additionally output discordant read pairs and split read alignments: ``` -bwa mem index samp.r1.fq samp.r2.fq | samblaster -e -d samp.disc.sam -s samp.split.sam | samtools view -Sb - > samp.out.bam +bwa mem samp.r1.fq samp.r2.fq | samblaster -e -d samp.disc.sam -s samp.split.sam | samtools view -Sb - > samp.out.bam ``` To pull split reads and discordants read pairs from a pre-existing BAM file with duplicates already marked: @@ -76,18 +81,31 @@ Other Options: -e --excludeDups Exclude reads marked as duplicates from discordant, splitter, and/or unmapped file. -r --removeDups Remove duplicates reads from all output files. (Implies --excludeDups). --addMateTags Add MC and MQ tags to all output paired-end SAM lines. + --ignoreUnmated Suppress abort on unmated alignments. Use only when sure input is read-id grouped and alignments have been filtered. +
--ignoreUnmated is not recommended for general use. It disables checks in samblaster that detect incorrectly sorted input.
+-M Run in compatibility mode; both 0x100 and 0x800 are considered supplemental (chimeric). Similar to bwa mem -M option. See details below. --maxSplitCount INT Maximum number of split alignments for a read to be included in splitter file. [2] --maxUnmappedBases INT Maximum number of un-aligned bases between two alignments to be included in splitter file. [50] --minIndelSize INT Minimum structural variant feature size for split alignments to be included in splitter file. [50] --minNonOverlap INT Minimum non-overlaping base pairs between two alignments for a read to be included in splitter file. [20] --minClipSize INT Minumum number of bases a mapped read must be clipped to be included in unmapped file. [20] - -h --help Print samblaster help to stderr. -q --quiet Output fewer statistics. --version Print samblaster version number to stderr. ``` +--- +**ALIGNMENT TYPE DEFINITIONS:** +Below, we will use the following definitions for alignment types. +Starting with *samblaster* release 0.1.22, these definitions are affected by the use of the **-M** option. +By default, *samblaster* will use the current definitions of alignment types as specified in the [SAM Specification](http://samtools.sourceforge.net/SAMv1.pdf). +Namely, alignments marked with FLAG 0x100 are considered *secondary*, while those marked with FLAG 0x800 are considered *supplemental*. +If the **-M** option is specified, then both FLAG 0x100 and 0x800 are considered *supplemental*, and no alignments are considered *secondary*. +In either case, a *primary* alignment is one that is neither *secondary* nor *supplemental*. +Only *primary* and *supplemental* alignments are used to find chimeric (split-read) mappings. +The **-M** flag is used for backward compatibility with older SAM/BAM files in which "chimeric" alignments were marked with FLAG 0x100, and should also be used with output from more recent runs of *bwa mem* using its **-M** option. 
+ --- **DUPLICATE IDENTIFICATION:** A **duplicate** read pair is defined as a pair that has the same *signature* for each mapped read as a previous read pair in the input SAM file. The *signature* is comprised of the combination of the sequence name, strand, and the reference offset where the 5' end of the read would fall if the read were fully aligned (not clipped) at its 5' end. The 5' aligned reference position is calculated using a combination of the POS field, the strand, and the CIGAR string. This definition of *signature* matches that used by *Picard MarkDuplicates*. @@ -95,7 +113,7 @@ A **duplicate** read pair is defined as a pair that has the same *signature* for 1. For pairs in which both reads are mapped, both signatures must match. 2. For pairs in which only one side is mapped (an "orphan"), the signature of the mapped read must match a previously seen orphan. In an orphan pair, the unmapped read need not appear in the input file. In addition, mapped non-paired single read alignments will be treated the same as an orphan pair with a missing unmapped read. 3. No doubly unmapped pair will be marked as a duplicate. -4. Any *secondary* alignment (FLAG 0x100 or 0x800) associated with a duplicate primary alignment will also be marked as a duplicate. +4. Any *secondary* or *supplemental* alignment associated with a duplicate *primary* alignment will also be marked as a duplicate. --- **DISCORDANT READ PAIR IDENTIFICATION:** @@ -103,14 +121,14 @@ A **discordant** read pair is one which meets all of the following criteria: 1. Both side of the read pair are mapped (neither FLAG 0x4 or 0x8 is set). 2. The *properly paired* FLAG (0x2) is not set. -3. Secondary alignments (FLAG 0x100 or 0x800) are never output as discordant, although a discordant read pair can have secondary alignments associated with them. +3. *Secondary* or *supplemental* alignments are never output as discordant, although a discordant read pair can have such alignments associated with them. 4. 
Duplicate read pairs that meet the above criteria will be output as discordant unless the **-e** option is used. --- **SPLIT READ IDENTIFICATION:** **Split Read** alignments are derived from a single read when one portion of the read aligns to a different region of the reference genome than another portion of the read. Such pairs of alignments often define a structural variant (SV) breakpoint, and are therefore useful input to SV detection algorithms such as [LUMPY](https://github.com/arq5x/lumpy-sv/). *samblaster* uses the following strategy to identify split reads alignments. -1. Identify reads that have between two and **--maxSplitCount** alignments. +1. Identify reads that have between two and **--maxSplitCount** *primary* and *supplemental* alignments. 2. Sort these alignments by their strand-normalized position along the read. 3. Two alignments are output as splitters if they are adjacent on the read, and meet these criteria: - each covers at least **--minNonOverlap** base pairs of the read that the other does not. @@ -120,10 +138,10 @@ A **discordant** read pair is one which meets all of the following criteria: --- **UNMAPPED/CLIPPED READ IDENTIFICATION:** -An **unmapped** or **clipped** read is one that is unaligned over all or part of its length respectively. The lack of a full alignment may be caused by a SV breakpoint that falls within the read. Therefore, *samblaster* will optionally output such reads to a FASTQ file for re-alignment by a tool, such as [YAHA](http://faculty.virginia.edu/irahall/yaha/), geared toward finding split-read mappings. *samblaster* applies the following strategy to identify and output unmapped/clipped reads: +An **unmapped** or **clipped** read is a *primary* alignment that is unaligned over all or part of its length respectively. The lack of a full alignment may be caused by a SV breakpoint that falls within the read. 
Therefore, *samblaster* will optionally output such reads to a FASTQ file for re-alignment by a tool, such as [YAHA](https://github.com/GregoryFaust/yaha/), geared toward finding split-read mappings. *samblaster* applies the following strategy to identify and output unmapped/clipped reads: 1. An **unmapped** read has the *unmapped read* FLAG set (0x4). -2. A **clipped** read is a mapped read with a CIGAR string that begins or ends with at least **--minClipSize** unaligned bases (CIGAR code S or H), and is not from a read that has one or more *secondary* alignments (FLAG 0x100). +2. A **clipped** read is a mapped read with a CIGAR string that begins or ends with at least **--minClipSize** unaligned bases (CIGAR code S and/or H), and is not from a read that has one or more *supplemental* alignments. 3. In order for *samblaster* to output the entire sequence for clipped reads, the input SAM file must have soft clipped primary alignments. 4. *samblaster* will output unmapped/clipped reads into a FASTQ file if QUAL information is available in the input file, and a FASTA file if not. 5. Unmapped/clipped reads that are part of a duplicate read pair will be output unless the **-e** option is used. diff --git a/samblaster.cpp b/samblaster.cpp index 0aca808..a77e8ac 100644 --- a/samblaster.cpp +++ b/samblaster.cpp @@ -532,6 +532,7 @@ struct state_struct bool removeDups; bool addMateTags; bool compatMode; + bool ignoreUnmated; bool quiet; }; typedef struct state_struct state_t; @@ -560,6 +561,7 @@ state_t * makeState () s->removeDups = false; s->addMateTags = false; s->compatMode = false; + s->ignoreUnmated = false; s->quiet = false; // Start this as -1 to indicate we don't know yet. // Once we are outputting our first line, we will decide. @@ -782,6 +784,7 @@ UINT64 dupCount = 0; UINT64 discCount = 0; UINT64 splitCount = 0; UINT64 unmapClipCount = 0; +UINT64 unmatedCount = 0; // This is the main workhorse that determines if lines are dups or not. 
template @@ -805,18 +808,19 @@ void markDupsDiscordants(splitLine_t * block, state_t * state) else if (isFirstRead(line)) first = line; else if (isSecondRead(line)) second = line; } + // Figure out what type of "pair" we have. - // First get rid of the useless case of having no first AND no second. - if (first == NULL && second == NULL) brokenBlock(block, count); - // Now see if we have orphan with the unmapped read missing. bool orphan = false; bool dummyFirst = false; + // First get rid of the useless case of having no first AND no second. + if (first == NULL && second == NULL) goto outOfHere; + // Now see if we have orphan with the unmapped read missing. if (first == NULL || second == NULL) { // Get the NULL one in the first slot. if (second == NULL) swapPtrs(&first, &second); // If the only read says its paired, and it is unmapped or its mate is mapped, something is wrong. - if (isPaired(second) && (isUnmapped(second) || isNextMapped(second))) brokenBlock(block, count); + if (isPaired(second) && (isUnmapped(second) || isNextMapped(second))) goto outOfHere; // If the only read we have is unmapped, then it can't be a dup. if (isUnmapped(second)) return; // Now MAKE a dummy record for the first read, but don't put it into the block. @@ -926,6 +930,11 @@ void markDupsDiscordants(splitLine_t * block, state_t * state) first->discordant = true; second->discordant = true; } + return; + +outOfHere: + if (state->ignoreUnmated) {unmatedCount += 1; return;} + else brokenBlock(block, count); } // Sort ascending in SQO. @@ -975,6 +984,8 @@ void markSplitterUnmappedClipped(splitLine_t * block, state_t * state, int mask, if (state->unmappedClippedFile == NULL) return; // Process unmapped or clipped. splitLine_t * line = state->splitterArray[0]; + // Unmapped or clipped alignments should be primary. 
+ if (!isPrimaryAlignment(line)) return; if (isUnmapped(line)) { line->unmappedClipped = true; @@ -1136,12 +1147,13 @@ void processSAMBlock(splitLine_t * block, state_t * state) void printPGsamLine(FILE * f, state_t * s) { if (f == NULL) return; - fprintf(f, "@PG\tID:SAMBLASTER\tVN:0.1.%d.PRE.A\tCL:samblaster -i %s -o %s", BUILDNUM, s->inputFileName, s->outputFileName); + fprintf(f, "@PG\tID:SAMBLASTER\tVN:0.1.%d\tCL:samblaster -i %s -o %s", BUILDNUM, s->inputFileName, s->outputFileName); if (s->compatMode) fprintf(f, " -M"); if (s->acceptDups) fprintf(f, " --acceptDupMarks"); if (s->removeDups) fprintf(f, " --removeDups"); else if (s->excludeDups && (s->discordantFile != NULL || s->splitterFile != NULL || s->unmappedClippedFile != NULL)) fprintf(f, " --excludeDups"); if (s->addMateTags) fprintf(f, " --addMateTags"); + if (s->ignoreUnmated) fprintf(f, " --ignoreUnmated"); if (s->discordantFile != NULL) fprintf(f, " -d %s", s->discordantFileName); if (s->splitterFile != NULL) fprintf(f, " -s %s --maxSplitCount %d --maxUnmappedBases %d --minIndelSize %d --minNonOverlap %d", @@ -1153,7 +1165,7 @@ void printPGsamLine(FILE * f, state_t * s) void printVersionString() { - fprintf(stderr, "samblaster: Version 0.1.%d.PRE.A\n", BUILDNUM); + fprintf(stderr, "samblaster: Version 0.1.%d\n", BUILDNUM); } void printUsageString() @@ -1167,7 +1179,7 @@ void printUsageString() "Usage:\n" "For use as a post process on an aligner (eg. 
bwa mem):\n" " bwa mem samp.r1.fq samp.r2.fq | samblaster [-e] [-d samp.disc.sam] [-s samp.split.sam] | samtools view -Sb - > samp.out.bam\n" - " bwa mem -M samp.r1.fq samp.r2.fq | samblaster -M [-e] [-d samp.disc.sam] [-s samp.split.sam] | samtools view -Sb - > samp.out.bam\n" + " bwa mem -M samp.r1.fq samp.r2.fq | samblaster -M [-e] [-d samp.disc.sam] [-s samp.split.sam] | samtools view -Sb - > samp.out.bam\n" "For use with a pre-existing bam file to pull split, discordant and/or unmapped reads:\n" " samtools view -h samp.bam | samblaster [-a] [-e] [-d samp.disc.sam] [-s samp.split.sam] [-u samp.umc.fasta] -o /dev/null\n\n" @@ -1184,6 +1196,7 @@ void printUsageString() "-e --excludeDups Exclude reads marked as duplicates from discordant, splitter, and/or unmapped file.\n" "-r --removeDups Remove duplicates reads from all output files. (Implies --excludeDups).\n" " --addMateTags Add MC and MQ tags to all output paired-end SAM lines.\n" + " --ignoreUnmated Suppress abort on unmated alignments. Use only when sure input is read-id grouped and alignments have been filtered.\n" "-M Run in compatibility mode; both 0x100 and 0x800 are considered chimeric. Similar to BWA MEM -M option.\n" " --maxSplitCount INT Maximum number of split alignments for a read to be included in splitter file. [2]\n" " --maxUnmappedBases INT Maximum number of un-aligned bases between two alignments to be included in splitter file. [50]\n" @@ -1252,6 +1265,10 @@ int main (int argc, char *argv[]) { state->addMateTags = true; } + else if (streq(argv[argi],"--ignoreUnmated")) + { + state->ignoreUnmated = true; + } else if (streq(argv[argi],"-M")) { state->compatMode = true; @@ -1476,6 +1493,12 @@ int main (int argc, char *argv[]) } // Output stats. 
+ if (state->ignoreUnmated) + { + fprintf(stderr, "samblaster: Found %"PRIu64" of %"PRIu64" (%4.2f%%) read ids unmated\n", + unmatedCount, idCount, ((double)100)*unmatedCount/idCount); + if (unmatedCount > 0) fprintf(stderr, "samblaster: Please double check that input file is read-id (QNAME) grouped\n"); + } if (state->removeDups) { fprintf(stderr, "samblaster: Removed %"PRIu64" of %"PRIu64" (%4.2f%%) read ids as duplicates", From 7f07ecf4b715d1a6f7d51608d8a57a0b721daeca Mon Sep 17 00:00:00 2001 From: Gregory Faust Date: Thu, 18 Jun 2015 15:16:59 -0400 Subject: [PATCH 3/3] Update README.md --- README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index dc48ac0..bd37907 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ cp samblaster /usr/local/bin/. ##Usage See the [SAM File Format Specification](http://samtools.sourceforge.net/SAMv1.pdf) for details about the SAM alignment format. -By default, *samblaster* reads SAM input from **stdin** and writes SAM to **stdout**. Input SAM file usually contain paired end data (see [Duplicate Identification](#DupIdentification) below), must contain a sequence header, and must be __read-id grouped<sup>1</sup>__. +By default, *samblaster* reads SAM input from **stdin** and writes SAM to **stdout**. Input SAM files usually contain paired end data (see [Duplicate Identification](#DupIdentification) below), must contain a sequence header, and must be __read-id grouped<sup>1</sup>__. By default, the output SAM file will contain all the alignments in the same order as the input, with duplicates marked with SAM FLAG 0x400. The **--removeDups** option will instead remove duplicate alignments from the output file. __<sup>1</sup>A read-id grouped__ SAM file is one in which all alignments for a read-id (QNAME) are grouped together in adjacent lines.
@@ -67,7 +67,7 @@ samtools view -h samp.bam | samblaster -a -e -d samp.disc.sam -s samp.split.sam --- **OPTIONS:** Default values enclosed in square brackets [] -``` +<pre>
 Input/Output Options:
 -i --input           FILE Input sam file [stdin].
 -o --output          FILE Output sam file for all input alignments [stdout].
@@ -82,8 +82,8 @@ Other Options:
 -r --removeDups           Remove duplicates reads from all output files. (Implies --excludeDups).
    --addMateTags          Add MC and MQ tags to all output paired-end SAM lines.
    --ignoreUnmated        Suppress abort on unmated alignments. Use only when sure input is read-id grouped and alignments have been filtered.
-                          
--ignoreUnmated is not recommended for general use. It disables checks in samblaster that detect incorrectly sorted input.
--M Run in compatibility mode; both 0x100 and 0x800 are considered supplemental (chimeric). Similar to bwa mem -M option. See details below. + --ignoreUnmated is not recommended for general use. It disables checks that detect incorrectly sorted input. +-M Compatibility mode (details below); both FLAG 0x100 and 0x800 denote supplemental (chimeric). Similar to bwa mem -M option. --maxSplitCount INT Maximum number of split alignments for a read to be included in splitter file. [2] --maxUnmappedBases INT Maximum number of un-aligned bases between two alignments to be included in splitter file. [50] --minIndelSize INT Minimum structural variant feature size for split alignments to be included in splitter file. [50] @@ -93,7 +93,7 @@ Other Options: -h --help Print samblaster help to stderr. -q --quiet Output fewer statistics. --version Print samblaster version number to stderr. -``` +</pre>
--- **ALIGNMENT TYPE DEFINITIONS:** @@ -101,8 +101,8 @@ Below, we will use the following definitions for alignment types. Starting with *samblaster* release 0.1.22, these definitions are affected by the use of the **-M** option. By default, *samblaster* will use the current definitions of alignment types as specified in the [SAM Specification](http://samtools.sourceforge.net/SAMv1.pdf). Namely, alignments marked with FLAG 0x100 are considered *secondary*, while those marked with FLAG 0x800 are considered *supplemental*. -If the **-M** option is specified, then both FLAG 0x100 and 0x800 are considered *supplemental*, and no alignments are considered *secondary*. -In either case, a *primary* alignment is one that is neither *secondary* nor *supplemental*. +If the **-M** option is specified, alignments marked with either FLAG 0x100 or 0x800 are considered *supplemental*, and no alignments are considered *secondary*. +A *primary* alignment is always one that is neither *secondary* nor *supplemental*. Only *primary* and *supplemental* alignments are used to find chimeric (split-read) mappings. The **-M** flag is used for backward compatibility with older SAM/BAM files in which "chimeric" alignments were marked with FLAG 0x100, and should also be used with output from more recent runs of *bwa mem* using its **-M** option.