From 6f5230ff638db6522ddfc34482e4390bb9c2a7a7 Mon Sep 17 00:00:00 2001 From: Louis Bergelson Date: Fri, 27 Jan 2023 14:19:52 -0500 Subject: [PATCH] Expose the ability to encode a Genotype into a GT field * Expose two public methods in VCFEncoder writeGtField and encodeGtField * Supports https://github.com/broadinstitute/gatk/issues/8160 but seems like a useful thing to be able to do in general * minor breaking change in VCFEncoder, made methods formatVCFField and buildAlleleStrings static It is unlikely anyone overrides either of these methods so it should not be a problem. --- .../java/htsjdk/variant/vcf/VCFEncoder.java | 50 ++++++++++++++++--- .../htsjdk/variant/vcf/VCFEncoderTest.java | 15 ++++++ 2 files changed, 57 insertions(+), 8 deletions(-) diff --git a/src/main/java/htsjdk/variant/vcf/VCFEncoder.java b/src/main/java/htsjdk/variant/vcf/VCFEncoder.java index 9cffb45837..c674665fa0 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFEncoder.java +++ b/src/main/java/htsjdk/variant/vcf/VCFEncoder.java @@ -219,7 +219,7 @@ private void fieldIsMissingFromHeaderError(final VariantContext vc, final String } @SuppressWarnings("rawtypes") - String formatVCFField(final Object val) { + static String formatVCFField(final Object val) { final String result; if (val == null) { result = VCFConstants.MISSING_VALUE_v4; @@ -327,11 +327,7 @@ public void addGenotypeData(final VariantContext vc, final Map a throw new IllegalStateException("GTs cannot be missing for some samples if they are available for others in the record"); } - writeAllele(g.getAllele(0), alleleMap, vcfoutput); - for (int i = 1; i < g.getPloidy(); i++) { - vcfoutput.append(g.isPhased() ? 
VCFConstants.PHASED : VCFConstants.UNPHASED); - writeAllele(g.getAllele(i), alleleMap, vcfoutput); - } + writeGtField(alleleMap, vcfoutput, g); continue; } else { @@ -387,6 +383,21 @@ public void addGenotypeData(final VariantContext vc, final Map a } } + /** + * write the encoded GT field for a Genotype + * @param alleleMap a mapping of Allele -> GT allele value (from {@link #buildAlleleStrings(VariantContext)}) + * @param vcfoutput the appendable to write to, to avoid inefficiency due to string copying + * @param g the genotype to encode + * @throws IOException if appending fails with an IOException + */ + public static void writeGtField(final Map alleleMap, final Appendable vcfoutput, final Genotype g) throws IOException { + writeAllele(g.getAllele(0), alleleMap, vcfoutput); + for (int i = 1; i < g.getPloidy(); i++) { + vcfoutput.append(g.isPhased() ? VCFConstants.PHASED : VCFConstants.UNPHASED); + writeAllele(g.getAllele(i), alleleMap, vcfoutput); + } + } + /* * Create the info string; assumes that no values are null */ @@ -416,8 +427,31 @@ private void writeInfoString(final Map infoFields, final Appenda } } - public Map buildAlleleStrings(final VariantContext vc) { - final Map alleleMap = new HashMap(vc.getAlleles().size() + 1); + /** + * Easy way to generate the GT field for a Genotype. 
This will be less efficient than using + * {@link #writeGtField(Map, Appendable, Genotype)} because of redundant Map initializations + * @param vc a VariantContext which must contain g or the results are likely to be incorrect + * @param g a Genotype in vc + * @return a String containing the encoding of the GT field of g + */ + public static String encodeGtField(VariantContext vc, Genotype g) { + final StringBuilder builder = new StringBuilder(); + try { + writeGtField(VCFEncoder.buildAlleleStrings(vc), builder, g); + } catch (final IOException e) { + throw new RuntimeException("Somehow we failed to append to a StringBuilder, this shouldn't happen.", e); + } + return builder.toString(); + } + + /** + * return a Map containing Allele -> String(allele position) for all Alleles in VC + * (as well as NO_CALL) + * ex: A,T,TC -> { A:0, T:1, TC:2, NO_CALL:EMPTY_ALLELE} + * This may be efficient when looking up values for many genotypes per VC + */ + public static Map buildAlleleStrings(final VariantContext vc) { + final Map alleleMap = new HashMap<>(vc.getAlleles().size() + 1); alleleMap.put(Allele.NO_CALL, VCFConstants.EMPTY_ALLELE); // convenience for lookup final List alleles = vc.getAlleles(); diff --git a/src/test/java/htsjdk/variant/vcf/VCFEncoderTest.java b/src/test/java/htsjdk/variant/vcf/VCFEncoderTest.java index 547549aa81..b0c88cd02a 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFEncoderTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFEncoderTest.java @@ -3,6 +3,7 @@ import htsjdk.HtsjdkTest; import htsjdk.tribble.util.ParsingUtils; import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.GenotypeBuilder; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; @@ -135,6 +136,20 @@ public Object[][] makeMissingFormatTestData() { return tests.toArray(new Object[][]{}); } + @Test + public void testEncodeGT(){ + final 
VariantContextBuilder vcb = new VariantContextBuilder("test", + "chr?", 100, 100, + Arrays.asList(Allele.REF_A, Allele.ALT_T, Allele.create("TC"))); + final Genotype g1 = new GenotypeBuilder("s1", Arrays.asList(Allele.REF_A, Allele.REF_A)).make(); + final Genotype g2 = new GenotypeBuilder("s2", Arrays.asList(Allele.ALT_T, Allele.create("TC"))).make(); + vcb.genotypes(g1, g2); + final VariantContext vc = vcb.make(); + + Assert.assertEquals(VCFEncoder.encodeGtField(vc, g1), "0/0"); + Assert.assertEquals(VCFEncoder.encodeGtField(vc, g2), "1/2"); + } + @Test(dataProvider = "MissingFormatTestData") public void testMissingFormatFields(final VCFEncoder encoder, final VariantContext vc, final String expectedLastColumn, final Map alleleMap, final List genotypeFormatKeys) { final StringBuilder sb = new StringBuilder();