Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a --numeric-gt option to VariantsToTable #8219

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package org.broadinstitute.hellbender.tools.walkers.variantutils;

import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.*;
import org.apache.logging.log4j.LogManager;
Expand Down Expand Up @@ -122,6 +123,7 @@
public final class VariantsToTable extends VariantWalker {
public final static String SPLIT_MULTI_ALLELIC_LONG_NAME = "split-multi-allelic";
public final static String SPLIT_MULTI_ALLELIC_SHORT_NAME = "SMA";
public static final String NUMERIC_GT_FULLNAME = "numeric-gt";
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

perhaps "use-numeric-gt" to make it verbed appropriately?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done


static final Logger logger = LogManager.getLogger(VariantsToTable.class);

Expand Down Expand Up @@ -204,6 +206,11 @@ public final class VariantsToTable extends VariantWalker {
doc="Fail on missing data", optional=true)
public boolean errorIfMissingData = false;

/**
 * Optional flag, off by default, that selects how the GT genotype field is written to the table.
 * When set, the GT value is produced by {@code VCFEncoder.encodeGtField(vc, genotype)};
 * otherwise it comes from {@code genotype.getGenotypeString(true)}.
 */
@Argument(fullName = NUMERIC_GT_FULLNAME,
doc = "write the GT field the way it appears in a VCF ( ex. 0/1 instead of A/T )",
optional = true)
public boolean useNumericGT = false;

private static final String MISSING_DATA = "NA";

private SortedSet<String> samples;
Expand Down Expand Up @@ -341,7 +348,7 @@ private void emitMoltenizedOutput(final List<String> record) {
* @param vc the VariantContext whose field values we want to capture
* @return List of lists of field values
*/
protected List<List<String>> extractFields(final VariantContext vc) {
private List<List<String>> extractFields(final VariantContext vc) {

final int numRecordsToProduce = splitMultiAllelic ? vc.getAlternateAlleles().size() : 1;
final List<List<String>> records = new ArrayList<>(numRecordsToProduce);
Expand Down Expand Up @@ -395,18 +402,23 @@ protected List<List<String>> extractFields(final VariantContext vc) {

private void addGenotypeFieldsToRecords(final VariantContext vc, final List<List<String>> records, final boolean errorIfMissingData) {
for ( final String sample : samples ) {
final Genotype genotype = vc.getGenotype(sample);
for ( final String gf : genotypeFieldsToTake ) {
if ( vc.hasGenotype(sample) && vc.getGenotype(sample).hasAnyAttribute(gf) ) {
if ( vc.hasGenotype(sample) && genotype.hasAnyAttribute(gf) ) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pulling the genotype into a variable above before doing the hasGenotype check seems strange to me.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

that's.... a good point...

if (VCFConstants.GENOTYPE_KEY.equals(gf)) {
addFieldValue(vc.getGenotype(sample).getGenotypeString(true), records);
if(useNumericGT) {
addFieldValue(VCFEncoder.encodeGtField(vc, genotype), records);
} else {
addFieldValue(genotype.getGenotypeString(true), records);
}
} else {
/**
* TODO - If gf == "FT" and the GT record is not filtered, Genotype.getAnyAttribute == null. Genotype.hasAnyAttribute should be changed so it
* returns false for this condition. Presently, it always returns true. Once this is fixed, then only the "addFieldValue" statement will
* remain in the following logic block.
*/
if (vc.getGenotype(sample).getAnyAttribute(gf) != null) {
addFieldValue(vc.getGenotype(sample).getAnyAttribute(gf), records);
if (genotype.getAnyAttribute(gf) != null) {
addFieldValue(genotype.getAnyAttribute(gf), records);
} else {
handleMissingData(errorIfMissingData, gf, records, vc);
} }
Expand All @@ -416,21 +428,21 @@ private void addGenotypeFieldsToRecords(final VariantContext vc, final List<List
}

for ( final String field : asGenotypeFieldsToTake) {
if ( vc.hasGenotype(sample) && vc.getGenotype(sample).hasAnyAttribute(field) ) {
if ( vc.hasGenotype(sample) && genotype.hasAnyAttribute(field) ) {
if (splitMultiAllelic) {
if (VCFConstants.GENOTYPE_ALLELE_DEPTHS.equals(field)) {
List<String> altDepths = new ArrayList<>();
int[] allDepths = vc.getGenotype(sample).getAD();
final List<String> altDepths = new ArrayList<>();
int[] allDepths = genotype.getAD();
for (int i = 1; i < allDepths.length; i++) {
altDepths.add(allDepths[0] + "," + allDepths[i]);
}
addFieldValue(altDepths, records);
} else {
addAlleleSpecificFieldValue(split(vc.getGenotype(sample).getExtendedAttribute(field).toString(), ','),
addAlleleSpecificFieldValue(split(genotype.getExtendedAttribute(field).toString(), ','),
records, inputHeader.getFormatHeaderLine(field).getCountType());
}
} else {
final String value = vc.getGenotype(sample).getAnyAttribute(field).toString();
final String value = genotype.getAnyAttribute(field).toString();
if (field.equals(VCFConstants.GENOTYPE_ALLELE_DEPTHS)) {
addFieldValue(value.replace("[","").replace("]","").replaceAll("\\s",""),records);
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import org.broadinstitute.hellbender.CommandLineProgramTest;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.testutils.ArgumentsBuilder;
import org.broadinstitute.hellbender.testutils.IntegrationTestSpec;
import org.testng.annotations.Test;

Expand Down Expand Up @@ -276,5 +277,20 @@ public void testNoFieldsSpecifiedFormatFieldInHeaderNoSamples() throws IOExcepti

IntegrationTestSpec.assertEqualTextFiles(outputFile, expectedFile);
}

/**
 * Runs the tool on the 1000G phase 3 snippet VCF with the numeric-GT flag switched on and
 * checks that the emitted table matches the stored expected table.
 *
 * @throws IOException if the produced and expected files cannot be read for comparison
 */
@Test
public void testNumericGTFlag() throws IOException {
    // Input VCF, scratch output location, and the checked-in expected result.
    final File snippetVcf = new File(getToolTestDataDir(), "VCFWithGenotypes_1000G.phase3.snippet.vcf");
    final File actualTable = createTempFile("numericGT", ".table");
    final File expectedTable = new File(getToolTestDataDir(), "expected.numericGT.table");

    // Assemble the command line one argument at a time.
    final ArgumentsBuilder numericGtArgs = new ArgumentsBuilder();
    numericGtArgs.addVCF(snippetVcf);
    numericGtArgs.addOutput(actualTable);
    numericGtArgs.addFlag(VariantsToTable.NUMERIC_GT_FULLNAME);

    runCommandLine(numericGtArgs);

    IntegrationTestSpec.assertEqualTextFiles(actualTable, expectedTable);
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
CHROM POS ID REF ALT QUAL FILTER AC AF AN BaseQRankSum ClippingRankSum DP DS END FS HaplotypeScore InbreedingCoeff MLEAC MLEAF MQ MQ0 MQRankSum NEGATIVE_TRAIN_SITE POSITIVE_TRAIN_SITE QD ReadPosRankSum VQSLOD culprit HG00096.GT HG00096.AD HG00096.DP HG00096.GQ HG00096.PL HG00097.GT HG00097.AD HG00097.DP HG00097.GQ HG00097.PL HG00099.GT HG00099.AD HG00099.DP HG00099.GQ HG00099.PL
20 10000054 . CTTTG C 504.42 PASS 0 0.00 6 -0.975 -2.925 22 NA NA 1.899 NA 0.0592 NA NA 59.27 0 -3.212 NA NA 2.43 -0.264 5.10 FS 0/0 2,0 2 6 0,6,119 0/0 10,0 10 29 0,29,592 0/0 10,0 10 30 0,30,598
20 10000107 . T C 263.95 PASS 0 0.00 6 -0.444 -3.132 25 NA NA 0.948 NA -0.0102 NA NA 59.19 0 2.292 NA true 10.56 0.055 7.76 FS 0/0 5,0 5 15 0,15,387 0/0 13,0 13 42 0,42,786 0/0 7,0 7 24 0,24,548
20 10000117 . C T 329458.1699999999 PASS 1 0.167 6 10.505 -20.658 28 NA NA 8.305 NA 0.1727 NA NA 59.17 0 2.689 NA true 25.46 -4.688 3.19 ReadPosRankSum 0/0 5,0 5 15 0,15,189 0/1 8,8 16 99 254,0,231 0/0 7,0 7 21 0,21,271
Loading