Skip to content

Commit

Permalink
Add support for reading and writing splitting BAM index files. (#1138)
Browse files Browse the repository at this point in the history
* Add support for reading and writing SBI files for BAMs.
* SBI files are splitting BAM indexes, which are used to seek to a specific record number in a BAM. This is different from the existing BAI index, which is used to query by genomic coordinates; seeking by record number is useful for reading BAMs in a distributed parallel system like Apache Spark.
  • Loading branch information
tomwhite authored and lbergelson committed Dec 11, 2018
1 parent 37f0789 commit 28dde96
Show file tree
Hide file tree
Showing 6 changed files with 794 additions and 0 deletions.
17 changes: 17 additions & 0 deletions src/main/java/htsjdk/samtools/BAMFileReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,15 @@ static long findVirtualOffsetOfFirstRecord(final File bam) throws IOException {
return offset;
}

/**
 * Finds the virtual file offset of the first record in a BAM by reading through its header
 * and sequence records.
 *
 * <p>A {@link BAMFileReader} is constructed over the supplied stream (with a {@code null}
 * second stream, {@link ValidationStringency#SILENT} and a {@link DefaultSAMRecordFactory})
 * and the first-record pointer it recorded is returned. The stream is not closed here;
 * closing it remains the caller's responsibility.
 *
 * @param seekableStream stream positioned over the BAM data to inspect
 * @return the reader's {@code mFirstRecordPointer} for the given stream
 * @throws IOException if constructing the reader over the stream fails
 */
static long findVirtualOffsetOfFirstRecord(final SeekableStream seekableStream) throws IOException {
    return new BAMFileReader(
            seekableStream,
            (SeekableStream) null,
            false,
            false,
            ValidationStringency.SILENT,
            new DefaultSAMRecordFactory()).mFirstRecordPointer;
}

/**
* If true, writes the source of every read into the source SAMRecords.
* @param enabled true to write source information into each SAMRecord.
Expand Down Expand Up @@ -944,6 +953,14 @@ public CloseableIterator<SAMRecord> createIndexIterator(final QueryInterval[] in
return new BAMQueryFilteringIterator(iterator, new BAMQueryMultipleIntervalsIteratorFilter(intervals, contained));
}

/**
 * Reports the current position of the underlying compressed stream.
 *
 * @return the value of {@link BlockCompressedInputStream#getFilePointer()} for the
 *         compressed input stream this reader is using
 * @see BlockCompressedInputStream#getFilePointer()
 */
public long getVirtualFilePointer() {
    final long virtualPointer = mCompressedInputStream.getFilePointer();
    return virtualPointer;
}

/**
* Iterate over the SAMRecords defined by the sections of the file described in the ctor argument.
*/
Expand Down
90 changes: 90 additions & 0 deletions src/main/java/htsjdk/samtools/BAMSBIIndexer.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
/*
* The MIT License
*
* Copyright (c) 2018 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package htsjdk.samtools;

import htsjdk.samtools.cram.io.InputStreamUtils;
import htsjdk.samtools.seekablestream.SeekablePathStream;
import htsjdk.samtools.seekablestream.SeekableStream;
import htsjdk.samtools.util.BlockCompressedInputStream;
import htsjdk.samtools.util.IOUtil;

import java.io.EOFException;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.file.Files;
import java.nio.file.Path;

/**
 * Produces SBI (splitting BAM index) files for BAM files, in the form understood by {@link SBIIndex}.
 */
public final class BAMSBIIndexer {

    /**
     * Indexes the BAM file at the given path. The index is written to the path obtained by
     * adding {@link SBIIndex#FILE_EXTENSION} to the BAM path; both the BAM stream and the
     * index stream opened here are closed before returning.
     *
     * @param bamFile     the path to the BAM file
     * @param granularity write the offset of every n-th alignment to the index
     * @throws IOException as per java IO contract
     */
    public static void createIndex(final Path bamFile, final long granularity) throws IOException {
        final Path sbiPath = IOUtil.addExtension(bamFile, SBIIndex.FILE_EXTENSION);
        try (SeekableStream bamStream = new SeekablePathStream(bamFile);
             OutputStream indexStream = Files.newOutputStream(sbiPath)) {
            createIndex(bamStream, indexStream, granularity);
        }
    }

    /**
     * Indexes the BAM read from the given stream. Starting at the first record, each record's
     * start offset is handed to an {@link SBIIndexWriter} and the record body is skipped, until
     * the end of the stream is signalled by an {@link EOFException}.
     *
     * <p>The input stream is wrapped in a {@link BlockCompressedInputStream} that is closed
     * when this method returns.
     *
     * @param in          a seekable stream for reading the BAM file from
     * @param out         the stream to write the index to
     * @param granularity write the offset of every n-th alignment to the index
     * @throws IOException as per java IO contract
     */
    public static void createIndex(final SeekableStream in, final OutputStream out, final long granularity) throws IOException {
        final long firstRecordOffset = SAMUtils.findVirtualOffsetOfFirstRecordInBam(in);
        try (BlockCompressedInputStream bgzfIn = new BlockCompressedInputStream(in)) {
            bgzfIn.seek(firstRecordOffset);

            // Four bytes hold a record's length field; BAM is little-endian, so view them that way.
            final byte[] lengthBytes = new byte[4];
            final ByteBuffer lengthView = ByteBuffer.wrap(lengthBytes).order(ByteOrder.LITTLE_ENDIAN);
            final SBIIndexWriter writer = new SBIIndexWriter(out, granularity);

            // Offset at which the most recent read attempt began; after the loop this is the
            // position where end-of-stream was hit.
            long position = firstRecordOffset;
            try {
                for (;;) {
                    position = bgzfIn.getFilePointer();
                    // The length of the remainder of the record (`block_size` in the SAM spec).
                    InputStreamUtils.readFully(bgzfIn, lengthBytes, 0, 4);
                    final int remaining = lengthView.getInt(0);
                    // Record where this alignment starts, then jump to the start of the next one.
                    writer.processRecord(position);
                    InputStreamUtils.skipFully(bgzfIn, remaining);
                }
            } catch (EOFException endOfStream) {
                // An EOF from either the length read or the skip ends the scan.
            }
            writer.finish(position, in.length());
        }
    }
}
13 changes: 13 additions & 0 deletions src/main/java/htsjdk/samtools/SAMUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
*/
package htsjdk.samtools;

import htsjdk.samtools.seekablestream.SeekableStream;
import htsjdk.samtools.util.BinaryCodec;
import htsjdk.samtools.util.CigarUtil;
import htsjdk.samtools.util.CloserUtil;
Expand Down Expand Up @@ -685,6 +686,18 @@ public static long findVirtualOffsetOfFirstRecordInBam(final File bamFile) {
}
}

/**
 * Returns the virtual file offset of the first record in a BAM file - i.e. the virtual file
 * offset after skipping over the text header and the sequence records.
 *
 * <p>Delegates to {@link BAMFileReader#findVirtualOffsetOfFirstRecord(SeekableStream)} and
 * converts its checked {@link IOException} into an unchecked exception.
 *
 * @param seekableStream the stream to read the BAM from
 * @return the virtual file offset of the first record
 * @throws RuntimeEOFException wrapping any {@link IOException} raised while reading
 */
public static long findVirtualOffsetOfFirstRecordInBam(final SeekableStream seekableStream) {
    final long firstRecordOffset;
    try {
        firstRecordOffset = BAMFileReader.findVirtualOffsetOfFirstRecord(seekableStream);
    } catch (final IOException ioe) {
        throw new RuntimeEOFException(ioe);
    }
    return firstRecordOffset;
}

/**
* Given a Cigar, Returns blocks of the sequence that have been aligned directly to the
* reference sequence. Note that clipped portions, and inserted and deleted bases (vs. the reference)
Expand Down
Loading

0 comments on commit 28dde96

Please sign in to comment.