Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix aligned PathSeq input getting filtered by WellformedReadFilter #3453

Merged
merged 1 commit into from
Aug 21, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package org.broadinstitute.hellbender.tools.spark.pathseq;

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMSequenceDictionary;
import org.apache.logging.log4j.Logger;
import org.apache.spark.api.java.JavaRDD;
import org.broadinstitute.hellbender.utils.Utils;
Expand Down Expand Up @@ -58,4 +60,30 @@ public static int pathseqGetRecommendedNumReducers(final String inputPath, final
}
return 1 + (int) (BucketUtils.dirSize(inputPath) / targetPartitionSize);
}

/**
 * Clones the supplied header and replaces the clone's sequence dictionary with an empty one.
 * Before clearing, a warning is logged if the header content disagrees with --isHostAligned:
 * either the flag is set but the header lists no sequences, or the flag is unset but the
 * header lists at least one sequence.
 *
 * @param inputHeader header to clone; must not be null
 * @param filterArgs  filter arguments providing the alignedInput flag; must not be null
 * @param logger      logger that receives the warnings; must not be null
 * @return the cloned header, carrying an empty sequence dictionary
 */
public static SAMFileHeader checkAndClearHeaderSequences(final SAMFileHeader inputHeader, final PSFilterArgumentCollection filterArgs, final Logger logger) {

    Utils.nonNull(inputHeader, "Cannot check and clear null input header");
    Utils.nonNull(filterArgs, "Cannot check header against null filter arguments");
    Utils.nonNull(logger, "Cannot check header using null logger");

    //Operate on a copy: clearing the dictionary of the caller's header would make WellformedReadFilter reject aligned reads
    final SAMFileHeader clearedHeader = inputHeader.clone();

    final SAMSequenceDictionary existingDictionary = clearedHeader.getSequenceDictionary();
    final boolean hasSequences = existingDictionary != null && !existingDictionary.isEmpty();

    if (filterArgs.alignedInput) {
        if (!hasSequences) {
            logger.warn("--isHostAligned is true but the BAM header contains no sequences");
        }
    } else if (hasSequences) {
        logger.warn("--isHostAligned is false but there are one or more sequences in the BAM header");
    }

    //Swap in an empty dictionary on the copy only
    clearedHeader.setSequenceDictionary(new SAMSequenceDictionary());
    return clearedHeader;
}
}
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package org.broadinstitute.hellbender.tools.spark.pathseq;

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMSequenceDictionary;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.broadinstitute.barclay.argparser.Argument;
Expand Down Expand Up @@ -79,16 +78,8 @@ public boolean requiresReads() {
@Override
protected void runTool(final JavaSparkContext ctx) {

final SAMFileHeader header = getHeaderForReads();
if (filterArgs.alignedInput && (header.getSequenceDictionary() == null || header.getSequenceDictionary().isEmpty())) {
logger.warn("--isHostAligned is true but the BAM header contains no sequences");

}
if (!filterArgs.alignedInput && header.getSequenceDictionary() != null && !header.getSequenceDictionary().isEmpty()) {
logger.warn("--isHostAligned is false but there are one or more sequences in the BAM header");
}
filterArgs.doReadFilterArgumentWarnings(getCommandLineParser().getPluginDescriptor(GATKReadFilterPluginDescriptor.class), logger);
header.setSequenceDictionary(new SAMSequenceDictionary());
final SAMFileHeader header = PSUtils.checkAndClearHeaderSequences(getHeaderForReads(), filterArgs, logger);

final PSFilter filter = new PSFilter(ctx, filterArgs, getReads(), header);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import com.google.cloud.dataflow.sdk.options.PipelineOptions;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMSequenceDictionary;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;
Expand Down Expand Up @@ -91,15 +90,8 @@ public boolean requiresReads() {
@Override
protected void runTool(final JavaSparkContext ctx) {

SAMFileHeader header = getHeaderForReads();
if (filterArgs.alignedInput && (header.getSequenceDictionary() == null || header.getSequenceDictionary().isEmpty())) {
logger.warn("--isHostAligned is true but the BAM header contains no sequences");
}
if (!filterArgs.alignedInput && header.getSequenceDictionary() != null && !header.getSequenceDictionary().isEmpty()) {
logger.warn("--isHostAligned is false but there are one or more sequences in the BAM header");
}
filterArgs.doReadFilterArgumentWarnings(getCommandLineParser().getPluginDescriptor(GATKReadFilterPluginDescriptor.class), logger);
header.setSequenceDictionary(new SAMSequenceDictionary());
SAMFileHeader header = PSUtils.checkAndClearHeaderSequences(getHeaderForReads(), filterArgs, logger);

//Filter
final PSFilter filter = new PSFilter(ctx, filterArgs, getReads(), header);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.broadinstitute.hellbender.tools.spark.pathseq;

import htsjdk.samtools.*;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.broadinstitute.hellbender.engine.spark.SparkContextFactory;
Expand Down Expand Up @@ -85,4 +86,21 @@ public void testLogItemizedWarning() {
PSUtils.logItemizedWarning(logger, items, "Test warning statement");
}

/**
 * Verifies that checkAndClearHeaderSequences returns a header that has no sequences, differs from
 * the input header, and becomes equal to the input again once the original dictionary is restored.
 */
@Test
public void testCheckAndClearHeaderSequences() {
    //Input header: one sequence, one read group, one program record
    final List<SAMSequenceRecord> sequences = new ArrayList<>(1);
    sequences.add(new SAMSequenceRecord("rec1", 1));
    final SAMSequenceDictionary originalDict = new SAMSequenceDictionary(sequences);
    final SAMFileHeader original = new SAMFileHeader(originalDict);
    original.addReadGroup(new SAMReadGroupRecord("rg1"));
    original.addProgramRecord(new SAMProgramRecord("pg1"));

    final PSFilterArgumentCollection defaultArgs = new PSFilterArgumentCollection();
    final SAMFileHeader cleared = PSUtils.checkAndClearHeaderSequences(original, defaultArgs, logger);

    //The returned header must have no sequences and therefore differ from the input
    final SAMSequenceDictionary clearedDict = cleared.getSequenceDictionary();
    Assert.assertTrue(clearedDict == null || clearedDict.isEmpty());
    Assert.assertNotEquals(cleared, original);

    //Restoring the dictionary should make the two headers equal again
    cleared.setSequenceDictionary(originalDict);
    Assert.assertEquals(cleared, original);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,9 @@ public String getTestedClassName() {
return PathSeqPipelineSpark.class.getSimpleName();
}

@SuppressWarnings("unchecked")
@Test(groups = "spark")
public void testPathSeqPipeline() throws Exception {
public void runTest( final File inputBam, final File expectedBam, final File expectedScores, final boolean isHostAligned) throws Exception {
final File outputBam = createTempFile("pathseqPipelineTestOutput", ".bam");
final File outputScores = createTempFile("pathseqPipelineTestOutput", ".txt");
final File inputBam = getTestFile("pipeline_input.bam");

final String baseResourcePath = "src/test/resources/" + PathSeqBuildKmers.class.getPackage().getName().replace(".", "/");
final String kmerLibraryPath = baseResourcePath + "/hg19mini.hss";
Expand All @@ -38,18 +35,35 @@ public void testPathSeqPipeline() throws Exception {
args.addFileArgument("scoresOutputPath", outputScores);
args.addArgument("kmerLibraryPath", kmerLibraryPath);
args.addArgument("filterBwaImage", filterImagePath);
args.addBooleanArgument("isHostAligned", isHostAligned);
args.addFileArgument("pathogenBwaImage", pathogenBwaImage);
args.addFileArgument("pathogenFasta", pathogenFasta);
args.addFileArgument("taxonomicDatabasePath", taxonomyDatabase);

this.runCommandLine(args);

final File expectedBam = getTestFile("pipeline_output.bam");
SamAssertionUtils.assertEqualBamFiles(outputBam, expectedBam, true, ValidationStringency.STRICT);

final File expectedScores = getTestFile("pipeline_output.txt");
final String expectedScoreString = FileUtils.readFileToString(expectedScores);
final String actualScoresString = FileUtils.readFileToString(outputScores);
String expectedScoreString = FileUtils.readFileToString(expectedScores);
String actualScoresString = FileUtils.readFileToString(outputScores);
PathSeqScoreIntegrationTest.compareScoreTables(expectedScoreString, actualScoresString);
}

/**
 * Runs the pipeline on pipeline_input.bam with isHostAligned set to false and compares the
 * results against pipeline_output.bam and pipeline_output.txt.
 */
@SuppressWarnings("unchecked")
@Test(groups = "spark")
public void testPathSeqPipeline() throws Exception {
    runTest(getTestFile("pipeline_input.bam"),
            getTestFile("pipeline_output.bam"),
            getTestFile("pipeline_output.txt"),
            false);
}

/**
 * Runs the pipeline on pipeline_input_aligned.bam with isHostAligned set to true and compares the
 * results against pipeline_output_aligned.bam and pipeline_output_aligned.txt.
 */
@SuppressWarnings("unchecked")
@Test(groups = "spark")
public void testPathSeqPipelineHostAlignedInput() throws Exception {
    runTest(getTestFile("pipeline_input_aligned.bam"),
            getTestFile("pipeline_output_aligned.bam"),
            getTestFile("pipeline_output_aligned.txt"),
            true);
}
}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
tax_id taxonomy type name score score_normalized reads unambiguous reference_length
561 root|cellular_organisms|Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae|Escherichia genus Escherichia 11346.590909090712 100.00000000000001 1997 1997 0
562 root|cellular_organisms|Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae|Escherichia|Escherichia_coli species Escherichia_coli 11346.590909090712 100.00000000000001 1997 1997 0
1 root root root 11346.590909090712 100.00000000000001 1997 1997 0
2 root|cellular_organisms|Bacteria superkingdom Bacteria 11346.590909090712 100.00000000000001 1997 1997 0
1236 root|cellular_organisms|Bacteria|Proteobacteria|Gammaproteobacteria class Gammaproteobacteria 11346.590909090712 100.00000000000001 1997 1997 0
131567 root|cellular_organisms no_rank cellular_organisms 11346.590909090712 100.00000000000001 1997 1997 0
1224 root|cellular_organisms|Bacteria|Proteobacteria phylum Proteobacteria 11346.590909090712 100.00000000000001 1997 1997 0
91347 root|cellular_organisms|Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacterales order Enterobacterales 11346.590909090712 100.00000000000001 1997 1997 0
543 root|cellular_organisms|Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae family Enterobacteriaceae 11346.590909090712 100.00000000000001 1997 1997 0
83333 root|cellular_organisms|Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae|Escherichia|Escherichia_coli|Escherichia_coli_K-12 no_rank Escherichia_coli_K-12 11346.590909090712 100.00000000000001 1997 1997 0
511145 root|cellular_organisms|Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae|Escherichia|Escherichia_coli|Escherichia_coli_K-12|Escherichia_coli_str._K-12_substr._MG1655 no_rank Escherichia_coli_str._K-12_substr._MG1655 11346.590909090712 100.00000000000001 1997 1997 176000