From f18a80d90165c7cfcefdbf73dc32f48385c0f161 Mon Sep 17 00:00:00 2001 From: Andrea Haessly Date: Tue, 15 Sep 2020 13:42:04 -0400 Subject: [PATCH 1/4] change GTs to single character, drop hom ref, add sample metrics to sample metadata tsv --- .../tools/variantdb/CommonCode.java | 38 +++++-- .../tools/variantdb/IngestConstants.java | 3 +- .../arrays/ArrayExtractCohortEngine.java | 9 +- .../arrays/ArrayMetadataTsvCreator.java | 74 ------------- .../arrays/ArraySampleFieldEnum.java | 100 ++++++++++++++++++ .../arrays/ArraySampleTsvCreator.java | 98 +++++++++++++++++ .../arrays/CreateArrayIngestFiles.java | 24 +++-- .../variantdb/arrays/RawArrayFieldEnum.java | 79 +++++--------- .../variantdb/arrays/RawArrayTsvCreator.java | 33 +++--- .../variantdb/nextgen/ExomeFieldEnum.java | 14 +-- .../variantdb/nextgen/MetadataTsvCreator.java | 2 +- 11 files changed, 293 insertions(+), 181 deletions(-) delete mode 100644 src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArrayMetadataTsvCreator.java create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArraySampleFieldEnum.java create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArraySampleTsvCreator.java diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/CommonCode.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/CommonCode.java index a4f4a0aaa34..37c54234a26 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/CommonCode.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/CommonCode.java @@ -1,29 +1,51 @@ package org.broadinstitute.hellbender.tools.variantdb; import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.*; +import org.apache.commons.lang.StringUtils; +import org.broadinstitute.hellbender.tools.variantdb.arrays.RawArrayTsvCreator; +import org.broadinstitute.hellbender.utils.genotyper.IndexedAlleleList; import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines; +import java.util.ArrayList; import java.util.HashSet; +import java.util.List; import java.util.Set; //TODO rename this or get rid of it. a place holder for now public class CommonCode { - public static final String NORMX = "NORMX"; - public static final String NORMY = "NORMY"; - public static final String BAF = "BAF"; - public static final String LRR = "LRR"; + public static String getGTString(final VariantContext variant) { + List allele_indices = getGTAlleleIndexes(variant); + if (allele_indices.size() != 2){ + throw new IllegalArgumentException("GT doesnt have two alleles"); + } + String separator = variant.getGenotype(0).isPhased() ? VCFConstants.PHASED : VCFConstants.UNPHASED; + return StringUtils.join(allele_indices, separator); + } + + public static List getGTAlleleIndexes(final VariantContext variant) { + IndexedAlleleList alleleList = new IndexedAlleleList<>(variant.getAlleles()); + ArrayList allele_indices = new ArrayList(); + + for (Allele allele : variant.getGenotype(0).getAlleles()) { + allele_indices.add(alleleList.indexOfAllele(allele)); + } + return allele_indices; + } + public static VCFHeader generateRawArrayVcfHeader(Set sampleNames, final SAMSequenceDictionary sequenceDictionary) { final Set lines = new HashSet<>(); lines.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY)); - lines.add(new VCFFormatHeaderLine(NORMX, 1, VCFHeaderLineType.Float, "Normalized X intensity")); - lines.add(new VCFFormatHeaderLine(NORMY, 1, VCFHeaderLineType.Float, "Normalized Y intensity")); - lines.add(new VCFFormatHeaderLine(BAF, 1, VCFHeaderLineType.Float, "B Allele Frequency")); - lines.add(new VCFFormatHeaderLine(LRR, 1, VCFHeaderLineType.Float, "Log R Ratio")); + lines.add(new VCFFormatHeaderLine(RawArrayTsvCreator.NORMX, 1, VCFHeaderLineType.Float, "Normalized X intensity")); + lines.add(new VCFFormatHeaderLine(RawArrayTsvCreator.NORMY, 1, VCFHeaderLineType.Float, "Normalized Y intensity")); + lines.add(new VCFFormatHeaderLine(RawArrayTsvCreator.BAF, 1, VCFHeaderLineType.Float, "B Allele Frequency")); + lines.add(new VCFFormatHeaderLine(RawArrayTsvCreator.LRR, 1, VCFHeaderLineType.Float, "Log R Ratio")); final VCFHeader header = new VCFHeader(lines, sampleNames); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/IngestConstants.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/IngestConstants.java index fa5128b0f2b..806a8ebc52e 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/IngestConstants.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/IngestConstants.java @@ -4,8 +4,7 @@ public class IngestConstants { public static final char SEPARATOR = '\t'; public static final String FILETYPE = ".tsv"; - public static final String metadataFilePrefix = "metadata_"; - public static final String metadataDirectoryName = "metadata"; // TODO remove + public static final String sampleMetadataFilePrefix = "sample_"; public static final int partitionPerTable = 4000; } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArrayExtractCohortEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArrayExtractCohortEngine.java index d4b398cc378..86bbf6de10d 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArrayExtractCohortEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArrayExtractCohortEngine.java @@ -12,7 +12,6 @@ import org.broadinstitute.hellbender.engine.ProgressMeter; import org.broadinstitute.hellbender.engine.ReferenceDataSource; import org.broadinstitute.hellbender.tools.variantdb.arrays.BasicArrayData.ArrayGenotype; -import org.broadinstitute.hellbender.tools.variantdb.CommonCode; import org.broadinstitute.hellbender.tools.variantdb.SchemaUtils; import org.broadinstitute.hellbender.tools.variantdb.arrays.tables.ProbeInfo; import org.broadinstitute.hellbender.tools.walkers.ReferenceConfidenceVariantContextMerger; @@ -375,10 +374,10 @@ private VariantContext createVariantContextFromSampleRecord(final ProbeInfo prob } genotypeBuilder.alleles(genotypeAlleles); - genotypeBuilder.attribute(CommonCode.NORMX, formatFloatForVcf(normx)); - genotypeBuilder.attribute(CommonCode.NORMY, formatFloatForVcf(normy)); - genotypeBuilder.attribute(CommonCode.BAF, formatFloatForVcf(baf)); - genotypeBuilder.attribute(CommonCode.LRR, formatFloatForVcf(lrr)); + genotypeBuilder.attribute(RawArrayTsvCreator.NORMX, formatFloatForVcf(normx)); + genotypeBuilder.attribute(RawArrayTsvCreator.NORMY, formatFloatForVcf(normy)); + genotypeBuilder.attribute(RawArrayTsvCreator.BAF, formatFloatForVcf(baf)); + genotypeBuilder.attribute(RawArrayTsvCreator.LRR, formatFloatForVcf(lrr)); genotypeBuilder.name(sample); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArrayMetadataTsvCreator.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArrayMetadataTsvCreator.java deleted file mode 100644 index 4bc3cf1eb72..00000000000 --- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArrayMetadataTsvCreator.java +++ /dev/null @@ -1,74 +0,0 @@ -package org.broadinstitute.hellbender.tools.variantdb.arrays; - -import org.apache.commons.lang3.StringUtils; -import org.broadinstitute.hellbender.exceptions.UserException; -import org.broadinstitute.hellbender.tools.variantdb.IngestConstants; -import org.broadinstitute.hellbender.tools.variantdb.nextgen.PetTsvCreator; -import org.broadinstitute.hellbender.utils.SimpleInterval; -import org.broadinstitute.hellbender.utils.Utils; -import org.broadinstitute.hellbender.utils.tsv.SimpleXSVWriter; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Paths; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.stream.Collectors; - -public class ArrayMetadataTsvCreator { - - private SimpleXSVWriter sampleMetadataWriter = null; - - /** - * Expected headers for the Sample List Table - */ - public enum HeaderFieldEnum { - sample_name, - sample_id - } - - public static List getHeaders() { - return Arrays.stream(ArrayMetadataTsvCreator.HeaderFieldEnum.values()).map(String::valueOf).collect(Collectors.toList()); - } - - public void createRow(String sampleName, String sampleId, String tableNumberPrefix, File outputDirectory) { - // if the metadata tsvs don't exist yet -- create them - try { - // Create a metadata file to go into the metadata dir for _this_ sample - // TODO--this should just be one file per sample set? - final File sampleMetadataName = new File (outputDirectory, IngestConstants.metadataFilePrefix + tableNumberPrefix + sampleName + IngestConstants.FILETYPE); - // write header to it - List sampleListHeader = ArrayMetadataTsvCreator.getHeaders(); - sampleMetadataWriter = new SimpleXSVWriter(sampleMetadataName.toPath(), IngestConstants.SEPARATOR); - sampleMetadataWriter.setHeaderLine(sampleListHeader); - - final List TSVLineToCreateSampleMetadata = createSampleListRow( - sampleName, - sampleId); - sampleMetadataWriter.getNewLineBuilder().setRow(TSVLineToCreateSampleMetadata).write(); - - } catch (final IOException e) { - throw new UserException("Could not create sample metadata outputs", e); - } - - } - - private List createSampleListRow(String sampleName, String sampleId) { - List row = new ArrayList<>(); - row.add(sampleName); - row.add(sampleId); - return row; - } - - public void closeTool() { - if (sampleMetadataWriter != null) { - try { - sampleMetadataWriter.close(); - } catch (final Exception e) { - throw new IllegalArgumentException("Couldn't close VET writer", e); - } - } - - } -} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArraySampleFieldEnum.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArraySampleFieldEnum.java new file mode 100644 index 00000000000..99d7e54d0d3 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArraySampleFieldEnum.java @@ -0,0 +1,100 @@ +package org.broadinstitute.hellbender.tools.variantdb.arrays; + +import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.hellbender.tools.variantdb.CommonCode; +import org.broadinstitute.hellbender.tools.variantdb.arrays.tables.ProbeInfo; + +import java.util.*; + + + +public enum ArraySampleFieldEnum { + sample_id, + sample_name, + + + // This where the validation step (required vs not) lives -- fail if there is missing data for a required field + // and just leave it empty if not required + + NUM_ASSAYS { + public String getColumnValue(final Map metricsMap) { + return metricsMap.get(this.name()); + } + }, + NUM_NON_FILTERED_ASSAYS{ + public String getColumnValue(final Map metricsMap) { + return metricsMap.get(this.name()); + } + }, + NUM_FILTERED_ASSAYS{ + public String getColumnValue(final Map metricsMap) { + return metricsMap.get(this.name()); + } + }, + NUM_ZEROED_OUT_ASSAYS{ + public String getColumnValue(final Map metricsMap) { + return metricsMap.get(this.name()); + } + }, + NUM_SNPS{ + public String getColumnValue(final Map metricsMap) { + return metricsMap.get(this.name()); + } + }, + NUM_INDELS{ + public String getColumnValue(final Map metricsMap) { + return metricsMap.get(this.name()); + } + }, + NUM_CALLS{ + public String getColumnValue(final Map metricsMap) { + return metricsMap.get(this.name()); + } + }, + NUM_AUTOCALL_CALLS{ + public String getColumnValue(final Map metricsMap) { + return metricsMap.get(this.name()); + } + }, + NUM_NO_CALLS{ + public String getColumnValue(final Map metricsMap) { + return metricsMap.get(this.name()); + } + }, + NUM_IN_DB_SNP{ + public String getColumnValue(final Map metricsMap) { + return metricsMap.get(this.name()); + } + }, + NOVEL_SNPS{ + public String getColumnValue(final Map metricsMap) { + return metricsMap.get(this.name()); + } + }, + + PCT_DBSNP{ + public String getColumnValue(final Map metricsMap) { + return metricsMap.get(this.name()); + } + }, + CALL_RATE{ + public String getColumnValue(final Map metricsMap) { + return metricsMap.get(this.name()); + } + }, + AUTOCALL_CALL_RATE{ + public String getColumnValue(final Map metricsMap) { + return metricsMap.get(this.name()); + } + }, + NUM_SINGLETONS{ + public String getColumnValue(final Map metricsMap) { + return metricsMap.get(this.name()); + } + }; + + public String getColumnValue(final Map metricsMap) { + throw new IllegalArgumentException("Not implemented"); + } + +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArraySampleTsvCreator.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArraySampleTsvCreator.java new file mode 100644 index 00000000000..9de9d9a36c9 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArraySampleTsvCreator.java @@ -0,0 +1,98 @@ +package org.broadinstitute.hellbender.tools.variantdb.arrays; + +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.tools.variantdb.IngestConstants; +import org.broadinstitute.hellbender.utils.tsv.SimpleXSVWriter; + +import java.io.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +public class ArraySampleTsvCreator { + + private SimpleXSVWriter sampleMetadataWriter = null; + private Map metricsMap; + + + public ArraySampleTsvCreator(String metricsFilepath) { + BufferedReader reader = null; + try { + String columns = null; + String values = null; + reader = new BufferedReader(new FileReader(metricsFilepath)); + String line = reader.readLine(); + while (line != null) { + if (!line.startsWith("#") && !line.trim().isEmpty()) { + if (columns == null) { + columns = line; + } else { + values = line; + break; + } + } + line = reader.readLine(); + } + + List colList = Arrays.asList(columns.split("\t")); + List valList = Arrays.asList(values.split("\t")); + metricsMap = IntStream.range(0, colList.size()).boxed().collect(Collectors.toMap(colList::get, valList::get)); + + } catch (IOException e) { + throw new RuntimeException("could not read metrics file", e); + } + } + + public static List getHeaders() { + return Arrays.stream(ArraySampleFieldEnum.values()).map(String::valueOf).collect(Collectors.toList()); + } + + public void createRow(String sampleName, String sampleId, String tableNumberPrefix, File outputDirectory) { + // if the metadata tsvs don't exist yet -- create them + try { + // Create a metadata file to go into the metadata dir for _this_ sample + // TODO--this should just be one file per sample set? + final File sampleMetadataFileName = new File (outputDirectory, IngestConstants.sampleMetadataFilePrefix + tableNumberPrefix + sampleName + IngestConstants.FILETYPE); + // write header to it + List sampleListHeader = ArraySampleTsvCreator.getHeaders(); + sampleMetadataWriter = new SimpleXSVWriter(sampleMetadataFileName.toPath(), IngestConstants.SEPARATOR); + sampleMetadataWriter.setHeaderLine(sampleListHeader); + + final List TSVLineToCreateSampleMetadata = createSampleListRow( + sampleName, + sampleId); + sampleMetadataWriter.getNewLineBuilder().setRow(TSVLineToCreateSampleMetadata).write(); + + } catch (final IOException e) { + throw new UserException("Could not create sample outputs", e); + } + + } + + private List createSampleListRow(String sampleName, String sampleId) { + List row = new ArrayList<>(); + row.add(sampleName); + row.add(sampleId); + + for (final ArraySampleFieldEnum fieldEnum : ArraySampleFieldEnum.values()) { + if (fieldEnum != ArraySampleFieldEnum.sample_id && fieldEnum != ArraySampleFieldEnum.sample_name) { + row.add(fieldEnum.getColumnValue(metricsMap)); + } + } + return row; + } + + public void closeTool() { + if (sampleMetadataWriter != null) { + try { + sampleMetadataWriter.close(); + } catch (final Exception e) { + throw new IllegalArgumentException("Couldn't close array sample writer", e); + } + } + + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/CreateArrayIngestFiles.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/CreateArrayIngestFiles.java index 71791fabd57..ce972f4a58e 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/CreateArrayIngestFiles.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/CreateArrayIngestFiles.java @@ -29,12 +29,19 @@ public final class CreateArrayIngestFiles extends VariantWalker { static final Logger logger = LogManager.getLogger(CreateArrayIngestFiles.class); - private ArrayMetadataTsvCreator metadataTsvCreator; + private ArraySampleTsvCreator sampleTsvCreator; private RawArrayTsvCreator tsvCreator; private String sampleName; private String sampleId; + @Argument( + fullName = "metrics-file", + shortName = "QCF", + doc = "Filepath to picard metrics file", + optional = true) // TODO change this to false for release + private String metricsFilePath = null; + @Argument(fullName = "sample-name-mapping", shortName = "SNM", doc = "Sample name to sample id mapping", @@ -60,11 +67,6 @@ public final class CreateArrayIngestFiles extends VariantWalker { optional = true) private String probeCsvFile = null; - @Argument( - fullName = "use-compressed-data", - doc = "If true, use bit-packed fields for data", - optional = true) - private boolean useCompressedData = false; @Argument( fullName = "ref-version", @@ -102,8 +104,8 @@ public void onTraversalStart() { int sampleTableNumber = IngestUtils.getTableNumber(sampleId, IngestConstants.partitionPerTable); String tableNumberPrefix = String.format("%03d_", sampleTableNumber); - metadataTsvCreator = new ArrayMetadataTsvCreator(); - metadataTsvCreator.createRow(sampleName, sampleId, tableNumberPrefix, outputDir); + sampleTsvCreator = new ArraySampleTsvCreator(metricsFilePath); + sampleTsvCreator.createRow(sampleName, sampleId, tableNumberPrefix, outputDir); Map probeNameMap; if (probeCsvFile == null) { @@ -115,7 +117,7 @@ public void onTraversalStart() { // Set reference version ChromosomeEnum.setRefVersion(refVersion); - tsvCreator = new RawArrayTsvCreator(sampleName, sampleId, tableNumberPrefix, probeNameMap, useCompressedData, outputDir); + tsvCreator = new RawArrayTsvCreator(sampleName, sampleId, tableNumberPrefix, probeNameMap, outputDir); } @@ -137,8 +139,8 @@ public void closeTool() { if (tsvCreator != null) { tsvCreator.closeTool(); } - if (metadataTsvCreator != null) { - metadataTsvCreator.closeTool(); + if (sampleTsvCreator != null) { + sampleTsvCreator.closeTool(); } } } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/RawArrayFieldEnum.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/RawArrayFieldEnum.java index 0c8bc0133eb..f67a99d625c 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/RawArrayFieldEnum.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/RawArrayFieldEnum.java @@ -3,8 +3,14 @@ import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.hellbender.tools.variantdb.CommonCode; import org.broadinstitute.hellbender.tools.variantdb.arrays.tables.ProbeInfo; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + /** * Expected headers for the uncompressed array table @@ -30,43 +36,6 @@ public String getColumnValue(VariantContext variant, ProbeInfo probeInfo, String // This where the validation step (required vs not) lives -- fail if there is missing data for a required field // and just leave it empty if not required - basic_array_data { - public String getColumnValue(final VariantContext variant, ProbeInfo probeInfo, String sampleId) { - String gt = GT_encoded.getColumnValue(variant, probeInfo, sampleId); - BasicArrayData.ArrayGenotype agt; - if (".".equals(gt)) { - agt = BasicArrayData.ArrayGenotype.NO_CALL; - } else { - agt = BasicArrayData.ArrayGenotype.valueOf(gt); - } - BasicArrayData d = new BasicArrayData(Integer.parseInt(sampleId), (int) probeInfo.probeId, agt); - return String.valueOf(d.encode()); - } - }, - - raw_array_data { - private Float convert(String s) { - if (s == null || "".equals(s) || "null".equals(s) ) { - return null; - } else { - return Float.parseFloat(s); - } - } - - public String getColumnValue(final VariantContext variant, ProbeInfo probeInfo, String sampleId) { - String normx = NORMX.getColumnValue(variant, probeInfo, sampleId); - String normy = NORMY.getColumnValue(variant, probeInfo, sampleId); - String baf = BAF.getColumnValue(variant, probeInfo, sampleId); - String lrr = LRR.getColumnValue(variant, probeInfo, sampleId); - - RawArrayData d = new RawArrayData(convert(normx), - convert(normy), - convert(baf), - convert(lrr) - ); - return String.valueOf(d.encode()); - } - }, probe_id { // Required public String getColumnValue(final VariantContext variant, ProbeInfo probeInfo, String sampleId) { @@ -74,23 +43,32 @@ public String getColumnValue(final VariantContext variant, ProbeInfo probeInfo, } }, - GT_encoded { + GT_encoded { // Required public String getColumnValue(final VariantContext variant, ProbeInfo probeInfo, String sampleId) { - Genotype g = variant.getGenotype(0); + List alleleIndexes = CommonCode.getGTAlleleIndexes(variant); + RawArrayTsvCreator.GT_encoding gt = RawArrayTsvCreator.GT_encoding.MISSING; - if (g.isHomRef() || g.isHomVar()) { - Allele allele = g.getAllele(0); - if (allele.basesMatch(probeInfo.alleleA)) { - gt = RawArrayTsvCreator.GT_encoding.AA; - } else if (allele.basesMatch(probeInfo.alleleB)) { - gt = RawArrayTsvCreator.GT_encoding.BB; + if (alleleIndexes.size() == 2) { + Set uniqueAlleleIndexes = new HashSet<>(alleleIndexes); + + if (uniqueAlleleIndexes.size() == 1) { + // we know it's HOM something + if (uniqueAlleleIndexes.contains(0)) { + gt = RawArrayTsvCreator.GT_encoding.HOM_REF; + } else if (uniqueAlleleIndexes.contains(1)) { + gt = RawArrayTsvCreator.GT_encoding.HOM_VAR; + } else if (uniqueAlleleIndexes.contains(2)) { + gt = RawArrayTsvCreator.GT_encoding.HOM_ALT2; + } } else { - throw new IllegalStateException("allele: " + allele + " must match either A: " + probeInfo.alleleA + " or B: " + probeInfo.alleleB); + // we know its het + if (uniqueAlleleIndexes.containsAll(new HashSet<>(Arrays.asList(0, 1)))) { + gt = RawArrayTsvCreator.GT_encoding.HET0_1; + } else if (uniqueAlleleIndexes.containsAll(new HashSet<>(Arrays.asList(1, 2)))) + gt = RawArrayTsvCreator.GT_encoding.HET1_2; } - } else if (g.isHet()) { - gt = RawArrayTsvCreator.GT_encoding.AB; } - return gt.getValue(); + return gt == RawArrayTsvCreator.value_to_drop ? "null" : gt.getValue(); } }, @@ -122,7 +100,4 @@ public String getColumnValue(final VariantContext variant, ProbeInfo probeInfo, public static RawArrayFieldEnum[] getUncompressedRawArrayFieldEnums() { return new RawArrayFieldEnum[] { sample_id, probe_id, GT_encoded, NORMX, NORMY, BAF, LRR }; } - public static RawArrayFieldEnum[] getCompressedRawArrayFieldEnums() { - return new RawArrayFieldEnum[] { basic_array_data, raw_array_data }; - } } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/RawArrayTsvCreator.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/RawArrayTsvCreator.java index 2d13d2d7c7c..f864dcf830f 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/RawArrayTsvCreator.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/RawArrayTsvCreator.java @@ -23,16 +23,23 @@ public final class RawArrayTsvCreator { static final Logger logger = LogManager.getLogger(RawArrayTsvCreator.class); + public static final String NORMX = "NORMX"; + public static final String NORMY = "NORMY"; + public static final String BAF = "BAF"; + public static final String LRR = "LRR"; + public static final GT_encoding value_to_drop = GT_encoding.HOM_REF; + private SimpleXSVWriter rawArrayWriter = null; private final String sampleId; private final Map probeDataByName; - private final boolean useCompressedData; private static String RAW_FILETYPE_PREFIX = "raw_"; enum GT_encoding { - AA("AA"), - AB("AB"), - BB("BB"), + HOM_REF("R"), + HET0_1("X"), + HOM_VAR("A"), + HET1_2("Y"), + HOM_ALT2("B"), MISSING("."); String value; @@ -44,15 +51,14 @@ String getValue() { } } - public RawArrayTsvCreator(final String sampleName, final String sampleId, final String tableNumberPrefix, final Map probeDataByName, boolean useCompressedData, final File outputDirectory) { + public RawArrayTsvCreator(final String sampleName, final String sampleId, final String tableNumberPrefix, final Map probeDataByName, final File outputDirectory) { this.sampleId = sampleId; this.probeDataByName = probeDataByName; - this.useCompressedData = useCompressedData; try { // Create a raw file to go into the raw dir for _this_ sample final File rawOutputName = new File(outputDirectory, RAW_FILETYPE_PREFIX + tableNumberPrefix + sampleName + IngestConstants.FILETYPE); // write header to it - List rawHeader = RawArrayTsvCreator.getHeaders(useCompressedData); + List rawHeader = RawArrayTsvCreator.getHeaders(); rawArrayWriter = new SimpleXSVWriter(rawOutputName.toPath(), IngestConstants.SEPARATOR); rawArrayWriter.setHeaderLine(rawHeader); } catch (final IOException e) { @@ -70,21 +76,14 @@ public List createRow(final VariantContext variant, final String sampleI if (probeInfo == null) { logger.warn("no probe found for variant with ID: " + variant.getID() + "\t" + variant); } else { - RawArrayFieldEnum[] fields = RawArrayFieldEnum.getCompressedRawArrayFieldEnums(); - if (!useCompressedData) { - fields = RawArrayFieldEnum.getUncompressedRawArrayFieldEnums(); - } - for (final RawArrayFieldEnum fieldEnum : fields) { + for (final RawArrayFieldEnum fieldEnum : RawArrayFieldEnum.getUncompressedRawArrayFieldEnums()) { row.add(fieldEnum.getColumnValue(variant, probeInfo, sampleId)); } } return row; } - public static List getHeaders(final boolean useCompressedData) { - if (useCompressedData) { - return Arrays.stream(RawArrayFieldEnum.getCompressedRawArrayFieldEnums()).map(String::valueOf).collect(Collectors.toList()); - } + public static List getHeaders() { return Arrays.stream(RawArrayFieldEnum.getUncompressedRawArrayFieldEnums()).map(String::valueOf).collect(Collectors.toList()); } @@ -92,7 +91,7 @@ public void apply(final VariantContext variant, final ReadsContext readsContext, if (!variant.getFilters().contains("ZEROED_OUT_ASSAY")) { final List rowData = createRow(variant, sampleId); - int length = useCompressedData ? RawArrayFieldEnum.getCompressedRawArrayFieldEnums().length : RawArrayFieldEnum.getUncompressedRawArrayFieldEnums().length; + int length = RawArrayFieldEnum.getUncompressedRawArrayFieldEnums().length; // write the row to the XSV if (rowData.size() == length) { SimpleXSVWriter.LineBuilder rawLine = rawArrayWriter.getNewLineBuilder(); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/ExomeFieldEnum.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/ExomeFieldEnum.java index 0b406ccb609..addf46cd166 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/ExomeFieldEnum.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/ExomeFieldEnum.java @@ -5,6 +5,7 @@ import htsjdk.variant.vcf.VCFConstants; import org.apache.commons.lang.StringUtils; import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.tools.variantdb.CommonCode; import org.broadinstitute.hellbender.tools.variantdb.SchemaUtils; import org.broadinstitute.hellbender.utils.genotyper.IndexedAlleleList; import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; @@ -192,17 +193,8 @@ public String getColumnValue(final VariantContext variant) { call_GT { public String getColumnValue(final VariantContext variant) { - IndexedAlleleList alleleList = new IndexedAlleleList<>(variant.getAlleles()); - ArrayList allele_indices = new ArrayList(); - - for (Allele allele : variant.getGenotype(0).getAlleles()) { - allele_indices.add(alleleList.indexOfAllele(allele)); - } - if (allele_indices.size() != 2){ - throw new IllegalArgumentException("GT doesnt have two alleles"); - } - String separator = variant.getGenotype(0).isPhased() ? VCFConstants.PHASED : VCFConstants.UNPHASED; - return StringUtils.join(allele_indices, separator); + // TODO how is missing handled? + return CommonCode.getGTString(variant); } }, diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/MetadataTsvCreator.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/MetadataTsvCreator.java index 7925740b814..bf9efe38bb6 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/MetadataTsvCreator.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/MetadataTsvCreator.java @@ -62,7 +62,7 @@ public void createRow(String sampleName, String sampleId, String tableNumberPref try { // Create a metadata file to go into the metadata dir for _this_ sample // TODO--this should just be one file per sample set? - final String sampleMetadataName = IngestConstants.metadataFilePrefix + tableNumberPrefix + sampleName + IngestConstants.FILETYPE; + final String sampleMetadataName = IngestConstants.sampleMetadataFilePrefix + tableNumberPrefix + sampleName + IngestConstants.FILETYPE; // write header to it List sampleListHeader = MetadataTsvCreator.getHeaders(); sampleMetadataWriter = new SimpleXSVWriter(Paths.get(sampleMetadataName), IngestConstants.SEPARATOR); From 010c3b6ba0ee16821ba04d680fadd463024f3000 Mon Sep 17 00:00:00 2001 From: Andrea Haessly Date: Wed, 16 Sep 2020 09:26:39 -0400 Subject: [PATCH 2/4] update based on PR feedback --- .../arrays/ArraySampleTsvCreator.java | 10 +++- .../variantdb/arrays/RawArrayFieldEnum.java | 55 +++++++++---------- .../variantdb/arrays/RawArrayTsvCreator.java | 18 ++++-- 3 files changed, 47 insertions(+), 36 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArraySampleTsvCreator.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArraySampleTsvCreator.java index 9de9d9a36c9..50e63728201 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArraySampleTsvCreator.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArraySampleTsvCreator.java @@ -1,5 +1,7 @@ package org.broadinstitute.hellbender.tools.variantdb.arrays; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import org.broadinstitute.hellbender.exceptions.UserException; import org.broadinstitute.hellbender.tools.variantdb.IngestConstants; import org.broadinstitute.hellbender.utils.tsv.SimpleXSVWriter; @@ -14,6 +16,8 @@ public class ArraySampleTsvCreator { + private static final Logger logger = LogManager.getLogger(ArraySampleTsvCreator.class); + private SimpleXSVWriter sampleMetadataWriter = null; private Map metricsMap; @@ -29,9 +33,11 @@ public ArraySampleTsvCreator(String metricsFilepath) { if (!line.startsWith("#") && !line.trim().isEmpty()) { if (columns == null) { columns = line; - } else { + } else if (values == null) { values = line; - break; + } else { + // there are more lines than expected - output a warning + logger.warn("more lines than expected in metrics file: " + line); } } line = reader.readLine(); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/RawArrayFieldEnum.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/RawArrayFieldEnum.java index f67a99d625c..3143efc65fe 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/RawArrayFieldEnum.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/RawArrayFieldEnum.java @@ -14,37 +14,24 @@ /** * Expected headers for the uncompressed array table - * sample, // req - * probe_id, // req - * GT_encoded, + * sample, // required + * probe_id, // required + * GT_encoded, // required * NORMX, // intensity * NORMY, // intensity * BAF // b allele fraction --> AD proxy * LRR // Log R ratio --> intensity value instead of DP * - * Headers for the compressed array table - * basic_array_data - * raw_array_data */ public enum RawArrayFieldEnum { - sample_id { - public String getColumnValue(VariantContext variant, ProbeInfo probeInfo, String sampleId) { - return sampleId; - } - }, - - // This where the validation step (required vs not) lives -- fail if there is missing data for a required field - // and just leave it empty if not required - - probe_id { // Required - public String getColumnValue(final VariantContext variant, ProbeInfo probeInfo, String sampleId) { - return String.valueOf(probeInfo.probeId); - } - }, + // fail if there is missing data for a required field + // and return the string "null" if there is missing data for an optional field + sample_id, + probe_id, GT_encoded { // Required - public String getColumnValue(final VariantContext variant, ProbeInfo probeInfo, String sampleId) { + public String getColumnValue(final VariantContext variant) { List alleleIndexes = CommonCode.getGTAlleleIndexes(variant); RawArrayTsvCreator.GT_encoding gt = RawArrayTsvCreator.GT_encoding.MISSING; @@ -72,28 +59,36 @@ public String getColumnValue(final VariantContext variant, ProbeInfo probeInfo, } }, - NORMX { - public String getColumnValue(final VariantContext variant, ProbeInfo probeInfo, String sampleId) { - return String.valueOf(variant.getGenotype(0).getExtendedAttribute("NORMX")); + NORMX { // Required + public String getColumnValue(final VariantContext variant) { + Object value = variant.getGenotype(0).getExtendedAttribute("NORMX"); + if (value == null) { + throw new IllegalStateException("Missing required value NORMX for variant: \t" + variant); + } + return String.valueOf(value); } }, - NORMY { - public String getColumnValue(final VariantContext variant, ProbeInfo probeInfo, String sampleId) { - return String.valueOf(variant.getGenotype(0).getExtendedAttribute("NORMY")); + NORMY { // Required + public String getColumnValue(final VariantContext variant) { + Object value = variant.getGenotype(0).getExtendedAttribute("NORMY"); + if (value == null) { + throw new IllegalStateException("Missing required value NORMY for variant: \t" + variant); + } + return String.valueOf(value); } }, BAF { - public String getColumnValue(final VariantContext variant, ProbeInfo probeInfo, String sampleId) { + public String getColumnValue(final VariantContext variant) { return String.valueOf(variant.getGenotype(0).getExtendedAttribute("BAF")); } }, LRR { - public String getColumnValue(final VariantContext variant, ProbeInfo probeInfo, String sampleId) { + public String getColumnValue(final VariantContext variant) { return String.valueOf(variant.getGenotype(0).getExtendedAttribute("LRR")); } }; - public String getColumnValue(final VariantContext variant, ProbeInfo probeInfo, String sampleId) { + public String getColumnValue(final VariantContext variant) { throw new IllegalArgumentException("Not implemented"); } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/RawArrayTsvCreator.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/RawArrayTsvCreator.java index f864dcf830f..2ccb918dec1 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/RawArrayTsvCreator.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/RawArrayTsvCreator.java @@ -74,12 +74,22 @@ public List createRow(final VariantContext variant, final String sampleI } ProbeInfo probeInfo = probeDataByName.get(rsid); if (probeInfo == null) { - logger.warn("no probe found for variant with ID: " + variant.getID() + "\t" + variant); - } else { - for (final RawArrayFieldEnum fieldEnum : RawArrayFieldEnum.getUncompressedRawArrayFieldEnums()) { - row.add(fieldEnum.getColumnValue(variant, probeInfo, sampleId)); + throw new IllegalStateException("Cannot be missing required probe ID for variant " + variant.getID() + "\t" + variant); + } + + for (final RawArrayFieldEnum fieldEnum : RawArrayFieldEnum.getUncompressedRawArrayFieldEnums()) { + switch (fieldEnum) { + case sample_id: + row.add(sampleId); + break; + case probe_id: + row.add(probeInfo.toString()); + break; + default: + row.add(fieldEnum.getColumnValue(variant, probeInfo, sampleId)); } } + return row; } From 42063f72a87187a5845d7e4c041ba3ad10e0fdf3 Mon Sep 17 00:00:00 2001 From: Andrea Haessly Date: Wed, 16 Sep 2020 09:40:04 -0400 Subject: [PATCH 3/4] fix errors --- .../arrays/ArraySampleTsvCreator.java | 52 +++++++++++-------- .../variantdb/arrays/RawArrayTsvCreator.java | 4 +- 2 files changed, 31 insertions(+), 25 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArraySampleTsvCreator.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArraySampleTsvCreator.java index 50e63728201..5e11e80f538 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArraySampleTsvCreator.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArraySampleTsvCreator.java @@ -23,32 +23,34 @@ public class ArraySampleTsvCreator { public ArraySampleTsvCreator(String metricsFilepath) { - BufferedReader reader = null; - try { - String columns = null; - String values = null; - reader = new BufferedReader(new FileReader(metricsFilepath)); - String line = reader.readLine(); - while (line != null) { - if (!line.startsWith("#") && !line.trim().isEmpty()) { - if (columns == null) { - columns = line; - } else if (values == null) { - values = line; - } else { - // there are more lines than expected - output a warning - logger.warn("more lines than expected in metrics file: " + line); + if (metricsFilepath != null && !metricsFilepath.isEmpty()) { + BufferedReader reader = null; + try { + String columns = null; + String values = null; + reader = new BufferedReader(new FileReader(metricsFilepath)); + String line = reader.readLine(); + while (line != null) { + if (!line.startsWith("#") && !line.trim().isEmpty()) { + if (columns == null) { + columns = line; + } else if (values == null) { + values = line; + } else { + // there are more lines than expected - output a warning + logger.warn("more lines than expected in metrics file: " + line); + } } + line = reader.readLine(); } - line = reader.readLine(); - } - List colList = Arrays.asList(columns.split("\t")); - List valList = Arrays.asList(values.split("\t")); - metricsMap = IntStream.range(0, colList.size()).boxed().collect(Collectors.toMap(colList::get, valList::get)); + List colList = Arrays.asList(columns.split("\t")); + List valList = Arrays.asList(values.split("\t")); + metricsMap = IntStream.range(0, colList.size()).boxed().collect(Collectors.toMap(colList::get, valList::get)); - } catch (IOException e) { - throw new RuntimeException("could not read metrics file", e); + } catch (IOException e) { + throw new RuntimeException("could not read metrics file", e); + } } } @@ -85,7 +87,11 @@ private List createSampleListRow(String sampleName, String sampleId) { for (final ArraySampleFieldEnum fieldEnum : ArraySampleFieldEnum.values()) { if (fieldEnum != ArraySampleFieldEnum.sample_id && fieldEnum != ArraySampleFieldEnum.sample_name) { - row.add(fieldEnum.getColumnValue(metricsMap)); + if (metricsMap == null) { + row.add("null"); + } else { + row.add(fieldEnum.getColumnValue(metricsMap)); + } } } return row; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/RawArrayTsvCreator.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/RawArrayTsvCreator.java index 2ccb918dec1..a5d92643c32 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/RawArrayTsvCreator.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/RawArrayTsvCreator.java @@ -83,10 +83,10 @@ public List createRow(final VariantContext variant, final String sampleI row.add(sampleId); break; case probe_id: - row.add(probeInfo.toString()); + row.add(String.valueOf(probeInfo.probeId)); break; default: - row.add(fieldEnum.getColumnValue(variant, probeInfo, sampleId)); + row.add(fieldEnum.getColumnValue(variant)); } } From b36a4eaec55ac5f0c17e1b2111ac037cdc50e919 Mon Sep 17 00:00:00 2001 From: Andrea Haessly Date: Wed, 16 Sep 2020 12:00:12 -0400 Subject: [PATCH 4/4] add warning --- .../tools/variantdb/arrays/RawArrayFieldEnum.java | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/RawArrayFieldEnum.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/RawArrayFieldEnum.java index 3143efc65fe..dfb6fb72018 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/RawArrayFieldEnum.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/RawArrayFieldEnum.java @@ -3,6 +3,8 @@ import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.VariantContext; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import org.broadinstitute.hellbender.tools.variantdb.CommonCode; import org.broadinstitute.hellbender.tools.variantdb.arrays.tables.ProbeInfo; @@ -27,6 +29,7 @@ public enum RawArrayFieldEnum { // fail if there is missing data for a required field // and return the string "null" if there is missing data for an optional field + // (in the bq import, it will convert the "null" to an actual null value in the database sample_id, probe_id, @@ -54,6 +57,8 @@ public String getColumnValue(final VariantContext variant) { } else if (uniqueAlleleIndexes.containsAll(new HashSet<>(Arrays.asList(1, 2)))) gt = RawArrayTsvCreator.GT_encoding.HET1_2; } + } else { + logger.warn("Found " + alleleIndexes.size() + " alleles instead of 2. Not processing variant \t" + variant); } return gt == RawArrayTsvCreator.value_to_drop ? "null" : gt.getValue(); } @@ -88,6 +93,8 @@ public String getColumnValue(final VariantContext variant) { } }; + private static final Logger logger = LogManager.getLogger(RawArrayFieldEnum.class); + public String getColumnValue(final VariantContext variant) { throw new IllegalArgumentException("Not implemented"); }