Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add splitting for allele-specific annotations and ADs to VariantsToTable #5697

Merged
merged 1 commit into from
Mar 25, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLineCount;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.barclay.argparser.Advanced;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import picard.cmdline.programgroups.VariantEvaluationProgramGroup;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.ReadsContext;
Expand All @@ -27,6 +29,8 @@
import java.util.*;
import java.util.function.Function;

import static org.broadinstitute.hellbender.utils.Utils.split;

/**
* Extract fields from a VCF file to a tab-delimited table
*
Expand Down Expand Up @@ -55,12 +59,21 @@
* <li> NCALLED (number of called samples) </li>
* <li> MULTI-ALLELIC (is this variant multi-allelic? true/false) </li>
* </ul>
* <p>
* Use the `-ASF` argument to extract allele-specific/per allele INFO fields and split them appropriately when
* splitting multi-allelic variants.
* </p>
*
* <h4>FORMAT/sample-level fields</h4>
* <p>
* Use the `-GF` argument to extract FORMAT/sample-level fields. The tool will create a new column per sample
* with the name "SAMPLE_NAME.FORMAT_FIELD_NAME" e.g. NA12877.GQ, NA12878.GQ.
* </p>
* <p>
* Use the `-ASGF` argument to extract allele-specific/per allele FORMAT fields and split them appropriately
* when splitting multi-allelic variants. If AD is specified as an allele-specific genotype field the ref and alt
* counts will be given for each alt.
* </p>
*
* <h3>Inputs</h3>
* <ul>
Expand Down Expand Up @@ -93,6 +106,7 @@
* <li> It is common for certain annotations to be absent for some variants. By default, this tool will emit an NA for a missing annotation. If you prefer that the tool fail upon encountering a missing annotation, use the --error-if-missing-data flag. </li>
* <li> If multiple samples are present in the VCF, the genotype fields will be ordered alphabetically by sample name. </li>
* <li> Filtered sites are ignored by default. To include them in the output, use the --show-filtered flag. </li>
* <li> Allele-specific filtering is not yet supported. For PASS sites, all alleles will be given, regardless of their AS_FilterStatus.</li>
* </ul>
*/
@CommandLineProgramProperties(
Expand All @@ -102,6 +116,8 @@
)
@DocumentedFeature
public final class VariantsToTable extends VariantWalker {
public final static String SPLIT_MULTI_ALLELIC_LONG_NAME = "split-multi-allelic";
public final static String SPLIT_MULTI_ALLELIC_SHORT_NAME = "SMA";

static final Logger logger = LogManager.getLogger(VariantsToTable.class);

Expand All @@ -118,7 +134,7 @@ public final class VariantsToTable extends VariantWalker {
@Argument(fullName="fields",
shortName="F",
doc="The name of a standard VCF field or an INFO field to include in the output table", optional=true)
private List<String> fieldsToTake = new ArrayList<>();
protected List<String> fieldsToTake = new ArrayList<>();

/**
* Any annotation name in the FORMAT field (e.g., GQ, PL) to include in the output table.
Expand All @@ -129,6 +145,12 @@ public final class VariantsToTable extends VariantWalker {
doc="The name of a genotype field to include in the output table", optional=true)
private List<String> genotypeFieldsToTake = new ArrayList<>();

@Argument(shortName="ASF", doc="The name of an allele-specific INFO field to be split if present", optional=true)
private List<String> asFieldsToTake = new ArrayList<>();

@Argument(shortName="ASGF", doc="The name of an allele-specific FORMAT field to be split if present", optional=true)
private List<String> asGenotypeFieldsToTake = new ArrayList<>();

/**
* By default this tool only emits values for records where the FILTER field is either PASS or . (unfiltered).
* Turn on this flag to emit values regardless of the value of the FILTER field.
Expand All @@ -144,10 +166,10 @@ public final class VariantsToTable extends VariantWalker {
* (e.g. allele depth) separated by commas. This may cause difficulty when the table is loaded by an R script, for example.
* Use this flag to write multi-allelic records on separate lines of output. Fields that are not allele-specific will be duplicated.
*/
@Argument(fullName="split-multi-allelic",
shortName="SMA",
@Argument(fullName=SPLIT_MULTI_ALLELIC_LONG_NAME,
shortName=SPLIT_MULTI_ALLELIC_SHORT_NAME,
doc="Split multi-allelic records into multiple lines", optional=true)
private boolean splitMultiAllelic = false;
protected boolean splitMultiAllelic = false;

/**
* Use this flag to emit each field within a variant on a separate line. The resulting table will have
Expand Down Expand Up @@ -183,12 +205,14 @@ public final class VariantsToTable extends VariantWalker {
private SortedSet<String> samples;
private long nRecords = 0L;
private PrintStream outputStream = null;
private VCFHeader inputHeader;

@Override
public void onTraversalStart() {
inputHeader = getHeaderForVariants();
outputStream = createPrintStream();

if (genotypeFieldsToTake.isEmpty()) {
if (genotypeFieldsToTake.isEmpty() && asGenotypeFieldsToTake.isEmpty()) {
samples = Collections.emptySortedSet();
} else {
final Map<String, VCFHeader> vcfHeaders = Collections.singletonMap(getDrivingVariantsFeatureInput().getName(), getHeaderForVariants());
Expand All @@ -197,21 +221,30 @@ public void onTraversalStart() {
// if there are no samples, we don't have to worry about any genotype fields
if (samples.isEmpty()) {
genotypeFieldsToTake.clear();
asGenotypeFieldsToTake.clear();
logger.warn("There are no samples - the genotype fields will be ignored");
if (fieldsToTake.isEmpty()){
if (fieldsToTake.isEmpty() && asFieldsToTake.isEmpty()){
throw new UserException("There are no samples and no fields - no output will be produced");
}
}
}

if (asGenotypeFieldsToTake.isEmpty() && asFieldsToTake.isEmpty() && !splitMultiAllelic) {
logger.warn("Allele-specific fields will only be split if splitting multi-allelic variants is specified (`--" + SPLIT_MULTI_ALLELIC_LONG_NAME + "` or `-" + SPLIT_MULTI_ALLELIC_SHORT_NAME + "`");
}

// print out the header
if ( moltenizeOutput ) {
outputStream.println("RecordID\tSample\tVariable\tValue");
} else {
final String baseHeader = Utils.join("\t", fieldsToTake);
final String genotypeHeader = createGenotypeHeader();
final String separator = (!baseHeader.isEmpty() && !genotypeHeader.isEmpty()) ? "\t" : "";
outputStream.println(baseHeader + separator + genotypeHeader);
final List<String> fields = new ArrayList<>();
fields.addAll(fieldsToTake);
fields.addAll(asFieldsToTake);
final String header = new StringBuilder(Utils.join("\t", fields))
.append("\t")
.append(createGenotypeHeader())
.toString();
outputStream.println(header);
}
}

Expand Down Expand Up @@ -242,10 +275,12 @@ private static boolean isWildCard(final String s) {

private String createGenotypeHeader() {
boolean firstEntry = true;
final List<String> allGenotypeFieldsToTake = new ArrayList<>(genotypeFieldsToTake);
allGenotypeFieldsToTake.addAll(asGenotypeFieldsToTake);

final StringBuilder sb = new StringBuilder();
for ( final String sample : samples ) {
for ( final String gf : genotypeFieldsToTake ) {
for ( final String gf : allGenotypeFieldsToTake ) {
if ( firstEntry ) {
firstEntry = false;
} else {
Expand Down Expand Up @@ -278,21 +313,13 @@ private void emitMoltenizedOutput(final List<String> record) {
* @param vc the VariantContext whose field values we want to capture
* @return List of lists of field values
*/
private List<List<String>> extractFields(final VariantContext vc) {
protected List<List<String>> extractFields(final VariantContext vc) {

final int numRecordsToProduce = splitMultiAllelic ? vc.getAlternateAlleles().size() : 1;
final List<List<String>> records = new ArrayList<>(numRecordsToProduce);

final int numFields;
final boolean addGenotypeFields = genotypeFieldsToTake != null && !genotypeFieldsToTake.isEmpty();
if ( addGenotypeFields ) {
numFields = fieldsToTake.size() + genotypeFieldsToTake.size() * samples.size();
} else {
numFields = fieldsToTake.size();
}

for ( int i = 0; i < numRecordsToProduce; i++ ) {
records.add(new ArrayList<>(numFields));
records.add(new ArrayList<>());
}

for ( final String field : fieldsToTake ) {
Expand All @@ -318,7 +345,20 @@ private List<List<String>> extractFields(final VariantContext vc) {
}
}

if ( addGenotypeFields ) {
for ( final String field : asFieldsToTake) {
if (vc.hasAttribute(field)) {
if (splitMultiAllelic) {
addAlleleSpecificFieldValue(Arrays.asList(vc.getAttributeAsString(field, ".").replace("[", "").replace("]", "").split(",")), records, inputHeader.getInfoHeaderLine(field).getCountType());

} else {
addFieldValue(vc.getAttributeAsString(field, ".").replace("[","").replace("]",""), records);
}
} else {
handleMissingData(errorIfMissingData, field, records, vc);
}
}

if ( !genotypeFieldsToTake.isEmpty() || !asGenotypeFieldsToTake.isEmpty() ) {
addGenotypeFieldsToRecords(vc, records, errorIfMissingData);
}

Expand Down Expand Up @@ -346,6 +386,33 @@ private void addGenotypeFieldsToRecords(final VariantContext vc, final List<List
handleMissingData(errorIfMissingData, gf, records, vc);
}
}

for ( final String field : asGenotypeFieldsToTake) {
if ( vc.hasGenotype(sample) && vc.getGenotype(sample).hasAnyAttribute(field) ) {
if (splitMultiAllelic) {
if (VCFConstants.GENOTYPE_ALLELE_DEPTHS.equals(field)) {
List<String> altDepths = new ArrayList<>();
int[] allDepths = vc.getGenotype(sample).getAD();
for (int i = 1; i < allDepths.length; i++) {
altDepths.add(allDepths[0] + "," + allDepths[i]);
}
addFieldValue(altDepths, records);
} else {
addAlleleSpecificFieldValue(split(vc.getGenotype(sample).getExtendedAttribute(field).toString(), ','),
records, inputHeader.getFormatHeaderLine(field).getCountType());
}
} else {
final String value = vc.getGenotype(sample).getAnyAttribute(field).toString();
if (field.equals(VCFConstants.GENOTYPE_ALLELE_DEPTHS)) {
addFieldValue(value.replace("[","").replace("]","").replaceAll("\\s",""),records);
} else {
addFieldValue(value, records);
}
}
} else {
handleMissingData(errorIfMissingData, field, records, vc);
}
}
}
}

Expand Down Expand Up @@ -380,6 +447,22 @@ else if ( (val instanceof List) && ((List)val).size() == numResultRecords ) {
}
}

/**
* Handle per-allele/allele-specific annotations as described in the header
* @param val the annotation value
* @param result the cumulative output
* @param alleleCount scalar, R-type or A-type values
*/
private static void addAlleleSpecificFieldValue(final Object val, final List<List<String>> result, final VCFHeaderLineCount alleleCount) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you have bandwidth/will power, it would be great if you could revamp this legacy code that uses instanceof and write separate methods that take parameters of different types.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do not. (You'll note that the date on the unit test file was June of last year.) Ordinarily I loathe instanceof, but after vacillating for a while, I decided to let it ride because performance isn't really an issue here.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Haha sounds good

if (val instanceof List && alleleCount.equals(VCFHeaderLineCount.R)) {
final List<?> myList = (List<?>) val;
addFieldValue(new ArrayList<>(myList.subList(1, myList.size())), result);
}
else {
addFieldValue(val, result);
}
}

private static String prettyPrintObject(final Object val) {
if ( val == null ) {
return "";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,35 @@ public void testGenotypeFieldsWithInline() throws IOException {
spec.executeTest("testGenotypeFieldsWithInline", this);
}

/**
 * Runs three command lines that request allele-specific INFO (-ASF) and FORMAT (-ASGF) fields,
 * with and without -SMA, and compares each output against its expected file in the tool's
 * test data directory.
 *
 * Each spec is executed under a name derived from this test method so that a failure is
 * reported against this test rather than against testGenotypeFieldsWithInline.
 */
@Test
public void testSplitMultiallelicFields() throws IOException {
    //missing AS INFO and FORMAT fields are handled
    //R-type FORMAT annotations work (MMQ)
    //A-type FORMAT annotations work (TLOD)
    final IntegrationTestSpec spec = new IntegrationTestSpec(
            " --variant " + getToolTestDataDir() + "../../GenotypeGVCFs/threeSamples.2alts.vcf" +
                    " -SMA -F CHROM -F POS -F REF -F ALT -F FOO -ASF TLOD -ASGF TLOD -ASGF AD -ASGF MMQ -ASGF BAR -raw" +
                    " -O %s",
            Arrays.asList(getToolTestDataDir() + "expected.threeSamples.2alts.MT.txt"));
    spec.executeTest("testSplitMultiallelicFields", this);

    //asking for allele-specific fields without splitting produces reasonable output
    final IntegrationTestSpec spec2 = new IntegrationTestSpec(
            " --variant " + getToolTestDataDir() + "../../GenotypeGVCFs/threeSamples.2alts.vcf" +
                    " -F CHROM -F POS -F REF -F ALT -ASGF TLOD -ASGF AD -ASGF MMQ -raw" +
                    " -O %s",
            Arrays.asList(getToolTestDataDir() + "expected.threeSamples.2alts.MT.noSplit.txt"));
    spec2.executeTest("testSplitMultiallelicFieldsNoSplit", this);

    //A-type INFO annotations work
    final IntegrationTestSpec spec3 = new IntegrationTestSpec(
            " --variant " + getToolTestDataDir() + "../../../VQSR/expected/applyIndelAlleleSpecificResult.vcf" +
                    " -SMA -F CHROM -F POS -F REF -F ALT -ASF AS_BaseQRankSum -ASGF AD -raw -ASF AS_FilterStatus" +
                    " -O %s",
            Arrays.asList(getToolTestDataDir() + "expected.ASindelVQSR.txt"));
    spec3.executeTest("testSplitMultiallelicFieldsInfoAType", this);
}

@Test
public void testListFields() throws IOException {
final IntegrationTestSpec spec = new IntegrationTestSpec(
Expand Down
Binary file not shown.
Loading