From 20e34e87414cb75ba44500d5ca35be5b796a5a14 Mon Sep 17 00:00:00 2001 From: Louis Bergelson Date: Thu, 19 Jan 2023 17:33:17 -0500 Subject: [PATCH 1/3] Add a --numeric-gt option to VariantsToTable * add a new option to VariantsToTable to allow outputting VCF-style numeric GT fields; previously it always output the actual bases of the Allele in the GT spot * resolves https://github.com/broadinstitute/gatk/issues/8160 * updates htsjdk to 3.0.5 --- .../walkers/variantutils/VariantsToTable.java | 32 +++++++++++++------ .../VariantsToTableIntegrationTest.java | 16 ++++++++++ 2 files changed, 38 insertions(+), 10 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable.java index db5d4e2e63c..0eaa843dcc4 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable.java @@ -1,6 +1,7 @@ package org.broadinstitute.hellbender.tools.walkers.variantutils; import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.*; import org.apache.logging.log4j.LogManager; @@ -122,6 +123,7 @@ public final class VariantsToTable extends VariantWalker { public final static String SPLIT_MULTI_ALLELIC_LONG_NAME = "split-multi-allelic"; public final static String SPLIT_MULTI_ALLELIC_SHORT_NAME = "SMA"; + public static final String NUMERIC_GT_FULLNAME = "numeric-gt"; static final Logger logger = LogManager.getLogger(VariantsToTable.class); @@ -204,6 +206,11 @@ public final class VariantsToTable extends VariantWalker { doc="Fail on missing data", optional=true) public boolean errorIfMissingData = false; + @Argument(fullName = NUMERIC_GT_FULLNAME, + doc = "write the GT field the way it appears in a VCF ( ex. 
0/1 instead of A/T )", + optional = true) + public boolean useNumericGT = false; + private static final String MISSING_DATA = "NA"; private SortedSet samples; @@ -341,7 +348,7 @@ private void emitMoltenizedOutput(final List record) { * @param vc the VariantContext whose field values we can to capture * @return List of lists of field values */ - protected List> extractFields(final VariantContext vc) { + private List> extractFields(final VariantContext vc) { final int numRecordsToProduce = splitMultiAllelic ? vc.getAlternateAlleles().size() : 1; final List> records = new ArrayList<>(numRecordsToProduce); @@ -395,18 +402,23 @@ protected List> extractFields(final VariantContext vc) { private void addGenotypeFieldsToRecords(final VariantContext vc, final List> records, final boolean errorIfMissingData) { for ( final String sample : samples ) { + final Genotype genotype = vc.getGenotype(sample); for ( final String gf : genotypeFieldsToTake ) { - if ( vc.hasGenotype(sample) && vc.getGenotype(sample).hasAnyAttribute(gf) ) { + if ( vc.hasGenotype(sample) && genotype.hasAnyAttribute(gf) ) { if (VCFConstants.GENOTYPE_KEY.equals(gf)) { - addFieldValue(vc.getGenotype(sample).getGenotypeString(true), records); + if(useNumericGT) { + addFieldValue(VCFEncoder.encodeGtField(vc, genotype), records); + } else { + addFieldValue(genotype.getGenotypeString(true), records); + } } else { /** * TODO - If gf == "FT" and the GT record is not filtered, Genotype.getAnyAttribute == null. Genotype.hasAnyAttribute should be changed so it * returns false for this condition. Presently, it always returns true. Once this is fixed, then only the "addFieldValue" statement will * remain in the following logic block. 
*/ - if (vc.getGenotype(sample).getAnyAttribute(gf) != null) { - addFieldValue(vc.getGenotype(sample).getAnyAttribute(gf), records); + if (genotype.getAnyAttribute(gf) != null) { + addFieldValue(genotype.getAnyAttribute(gf), records); } else { handleMissingData(errorIfMissingData, gf, records, vc); } } @@ -416,21 +428,21 @@ private void addGenotypeFieldsToRecords(final VariantContext vc, final List altDepths = new ArrayList<>(); - int[] allDepths = vc.getGenotype(sample).getAD(); + final List altDepths = new ArrayList<>(); + int[] allDepths = genotype.getAD(); for (int i = 1; i < allDepths.length; i++) { altDepths.add(allDepths[0] + "," + allDepths[i]); } addFieldValue(altDepths, records); } else { - addAlleleSpecificFieldValue(split(vc.getGenotype(sample).getExtendedAttribute(field).toString(), ','), + addAlleleSpecificFieldValue(split(genotype.getExtendedAttribute(field).toString(), ','), records, inputHeader.getFormatHeaderLine(field).getCountType()); } } else { - final String value = vc.getGenotype(sample).getAnyAttribute(field).toString(); + final String value = genotype.getAnyAttribute(field).toString(); if (field.equals(VCFConstants.GENOTYPE_ALLELE_DEPTHS)) { addFieldValue(value.replace("[","").replace("]","").replaceAll("\\s",""),records); } else { diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTableIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTableIntegrationTest.java index 10c922a267a..b1e4b96a084 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTableIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTableIntegrationTest.java @@ -2,6 +2,7 @@ import org.broadinstitute.hellbender.CommandLineProgramTest; import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; import 
org.broadinstitute.hellbender.testutils.IntegrationTestSpec; import org.testng.annotations.Test; @@ -276,5 +277,20 @@ public void testNoFieldsSpecifiedFormatFieldInHeaderNoSamples() throws IOExcepti IntegrationTestSpec.assertEqualTextFiles(outputFile, expectedFile); } + + @Test + public void testNumericGTFlag() throws IOException { + final File inputFile = new File(getToolTestDataDir(), "VCFWithGenotypes_1000G.phase3.snippet.vcf"); + final File outputFile = createTempFile("numericGT", ".table"); + final File expectedFile = new File(getToolTestDataDir(), "expected.numericGT.table"); + + final ArgumentsBuilder args = new ArgumentsBuilder(); + args.addVCF(inputFile) + .addOutput(outputFile) + .addFlag(VariantsToTable.NUMERIC_GT_FULLNAME); + runCommandLine(args); + + IntegrationTestSpec.assertEqualTextFiles(outputFile, expectedFile); + } } From 157d19bf1b3c4a62c5e46d2dc1a466e368537a75 Mon Sep 17 00:00:00 2001 From: Louis Bergelson Date: Wed, 22 Feb 2023 16:49:05 -0500 Subject: [PATCH 2/3] add test data --- .../variantutils/VariantsToTable/expected.numericGT.table | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable/expected.numericGT.table diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable/expected.numericGT.table b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable/expected.numericGT.table new file mode 100644 index 00000000000..9b2ff757971 --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable/expected.numericGT.table @@ -0,0 +1,4 @@ +CHROM POS ID REF ALT QUAL FILTER AC AF AN BaseQRankSum ClippingRankSum DP DS END FS HaplotypeScore InbreedingCoeff MLEAC MLEAF MQ MQ0 MQRankSum NEGATIVE_TRAIN_SITE POSITIVE_TRAIN_SITE QD ReadPosRankSum VQSLOD culprit HG00096.GT HG00096.AD HG00096.DP HG00096.GQ HG00096.PL HG00097.GT 
HG00097.AD HG00097.DP HG00097.GQ HG00097.PL HG00099.GT HG00099.AD HG00099.DP HG00099.GQ HG00099.PL +20 10000054 . CTTTG C 504.42 PASS 0 0.00 6 -0.975 -2.925 22 NA NA 1.899 NA 0.0592 NA NA 59.27 0 -3.212 NA NA 2.43 -0.264 5.10 FS 0/0 2,0 2 6 0,6,119 0/0 10,0 10 29 0,29,592 0/0 10,0 10 30 0,30,598 +20 10000107 . T C 263.95 PASS 0 0.00 6 -0.444 -3.132 25 NA NA 0.948 NA -0.0102 NA NA 59.19 0 2.292 NA true 10.56 0.055 7.76 FS 0/0 5,0 5 15 0,15,387 0/0 13,0 13 42 0,42,786 0/0 7,0 7 24 0,24,548 +20 10000117 . C T 329458.1699999999 PASS 1 0.167 6 10.505 -20.658 28 NA NA 8.305 NA 0.1727 NA NA 59.17 0 2.689 NA true 25.46 -4.688 3.19 ReadPosRankSum 0/0 5,0 5 15 0,15,189 0/1 8,8 16 99 254,0,231 0/0 7,0 7 21 0,21,271 From f98c2c99cf2c853cfe82a76d3a3f9c30324ad9e9 Mon Sep 17 00:00:00 2001 From: Louis Bergelson Date: Mon, 19 Aug 2024 13:15:52 -0400 Subject: [PATCH 3/3] responding to comments --- .../walkers/variantutils/VariantsToTable.java | 33 +++++++++---------- 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable.java index 0eaa843dcc4..775bee1d2ca 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantutils/VariantsToTable.java @@ -123,7 +123,7 @@ public final class VariantsToTable extends VariantWalker { public final static String SPLIT_MULTI_ALLELIC_LONG_NAME = "split-multi-allelic"; public final static String SPLIT_MULTI_ALLELIC_SHORT_NAME = "SMA"; - public static final String NUMERIC_GT_FULLNAME = "numeric-gt"; + public static final String NUMERIC_GT_FULLNAME = "use-numeric-gt"; static final Logger logger = LogManager.getLogger(VariantsToTable.class); @@ -404,15 +404,13 @@ private void addGenotypeFieldsToRecords(final VariantContext vc, final List altDepths = new 
ArrayList<>(); - int[] allDepths = genotype.getAD(); + final int[] allDepths = genotype.getAD(); for (int i = 1; i < allDepths.length; i++) { altDepths.add(allDepths[0] + "," + allDepths[i]); } @@ -472,8 +471,7 @@ private static void addFieldValue(final Object val, final List> res result.get(0).add(prettyPrintObject(val)); } // if this field is a list of the proper size, add the appropriate entry to each record - else if ( (val instanceof List) && ((List)val).size() == numResultRecords ) { - final List list = (List)val; + else if ( (val instanceof List list) && list.size() == numResultRecords ) { for ( int i = 0; i < numResultRecords; i++ ) { result.get(i).add(list.get(i).toString()); } @@ -494,9 +492,8 @@ else if ( (val instanceof List) && ((List)val).size() == numResultRecords ) { * @param alleleCount scalar, R-type or A-type values */ private static void addAlleleSpecificFieldValue(final Object val, final List> result, final VCFHeaderLineCount alleleCount) { - if (val instanceof List && alleleCount.equals(VCFHeaderLineCount.R)) { - final List myList = (List) val; - addFieldValue(new ArrayList<>(myList.subList(1, myList.size())), result); + if (val instanceof final List list && alleleCount.equals(VCFHeaderLineCount.R)) { + addFieldValue(new ArrayList<>(list.subList(1, list.size())), result); } else { addFieldValue(val, result); @@ -508,8 +505,8 @@ private static String prettyPrintObject(final Object val) { return ""; } - if ( val instanceof List ) { - return prettyPrintObject(((List) val).toArray()); + if ( val instanceof List list) { + return prettyPrintObject(list.toArray()); } if ( !val.getClass().isArray() ) {