Skip to content

Commit

Permalink
ah - use new GT encoding (#6822)
Browse files Browse the repository at this point in the history
* change GTs to single character, drop hom ref, add sample metrics to sample metadata tsv
  • Loading branch information
ahaessly authored and kcibul committed Feb 1, 2021
1 parent e8cfa77 commit 5475f85
Show file tree
Hide file tree
Showing 11 changed files with 349 additions and 213 deletions.
Original file line number Diff line number Diff line change
@@ -1,29 +1,51 @@
package org.broadinstitute.hellbender.tools.variantdb;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.*;
import org.apache.commons.lang.StringUtils;
import org.broadinstitute.hellbender.tools.variantdb.arrays.RawArrayTsvCreator;
import org.broadinstitute.hellbender.utils.genotyper.IndexedAlleleList;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

// TODO: rename this class or get rid of it; it is only a placeholder for now.
public class CommonCode {
// Identifiers of the per-genotype FORMAT fields for raw array data; each is a single Float value
// in the raw-array VCF header built in this class.
public static final String NORMX = "NORMX"; // normalized X intensity
public static final String NORMY = "NORMY"; // normalized Y intensity
public static final String BAF = "BAF"; // B allele frequency
public static final String LRR = "LRR"; // log R ratio


/**
 * Renders the genotype of the first sample of {@code variant} as allele indexes joined by the
 * VCF phasing separator ({@code VCFConstants.PHASED} when the genotype is phased,
 * {@code VCFConstants.UNPHASED} otherwise).
 *
 * @param variant the variant context whose first genotype is rendered
 * @return the two allele indexes joined by the separator
 * @throws IllegalArgumentException if the genotype does not have exactly two alleles
 */
public static String getGTString(final VariantContext variant) {
    final List<Integer> indexes = getGTAlleleIndexes(variant);
    if (indexes.size() != 2) {
        throw new IllegalArgumentException("GT doesnt have two alleles");
    }

    final String phasingSeparator;
    if (variant.getGenotype(0).isPhased()) {
        phasingSeparator = VCFConstants.PHASED;
    } else {
        phasingSeparator = VCFConstants.UNPHASED;
    }
    return StringUtils.join(indexes, phasingSeparator);
}

/**
 * Looks up, for each allele called in the first sample's genotype, its index within the
 * allele list of {@code variant}.
 *
 * @param variant the variant context whose first genotype is inspected
 * @return one index per called allele, in genotype order, as reported by
 *         {@code IndexedAlleleList.indexOfAllele}
 */
public static List<Integer> getGTAlleleIndexes(final VariantContext variant) {
    // Build the site-level index first, then read the genotype (same order as before).
    final IndexedAlleleList<Allele> siteAlleles = new IndexedAlleleList<>(variant.getAlleles());
    final List<Allele> calledAlleles = variant.getGenotype(0).getAlleles();

    final List<Integer> indexes = new ArrayList<>();
    for (final Allele called : calledAlleles) {
        indexes.add(siteAlleles.indexOfAllele(called));
    }
    return indexes;
}

public static VCFHeader generateRawArrayVcfHeader(Set<String> sampleNames, final SAMSequenceDictionary sequenceDictionary) {
final Set<VCFHeaderLine> lines = new HashSet<>();

lines.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY));
lines.add(new VCFFormatHeaderLine(NORMX, 1, VCFHeaderLineType.Float, "Normalized X intensity"));
lines.add(new VCFFormatHeaderLine(NORMY, 1, VCFHeaderLineType.Float, "Normalized Y intensity"));
lines.add(new VCFFormatHeaderLine(BAF, 1, VCFHeaderLineType.Float, "B Allele Frequency"));
lines.add(new VCFFormatHeaderLine(LRR, 1, VCFHeaderLineType.Float, "Log R Ratio"));
lines.add(new VCFFormatHeaderLine(RawArrayTsvCreator.NORMX, 1, VCFHeaderLineType.Float, "Normalized X intensity"));
lines.add(new VCFFormatHeaderLine(RawArrayTsvCreator.NORMY, 1, VCFHeaderLineType.Float, "Normalized Y intensity"));
lines.add(new VCFFormatHeaderLine(RawArrayTsvCreator.BAF, 1, VCFHeaderLineType.Float, "B Allele Frequency"));
lines.add(new VCFFormatHeaderLine(RawArrayTsvCreator.LRR, 1, VCFHeaderLineType.Float, "Log R Ratio"));


final VCFHeader header = new VCFHeader(lines, sampleNames);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@ public class IngestConstants {

// Field delimiter (a tab) handed to the TSV writer.
public static final char SEPARATOR = '\t';
// File-name suffix appended to generated TSV files.
public static final String FILETYPE = ".tsv";
public static final String metadataFilePrefix = "metadata_";
public static final String metadataDirectoryName = "metadata"; // TODO remove
// File-name prefix of the per-sample metadata TSV.
public static final String sampleMetadataFilePrefix = "sample_";
// NOTE(review): presumably the number of samples grouped into one table -- confirm against the callers.
public static final int partitionPerTable = 4000;

}
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
import org.broadinstitute.hellbender.engine.ProgressMeter;
import org.broadinstitute.hellbender.engine.ReferenceDataSource;
import org.broadinstitute.hellbender.tools.variantdb.arrays.BasicArrayData.ArrayGenotype;
import org.broadinstitute.hellbender.tools.variantdb.CommonCode;
import org.broadinstitute.hellbender.tools.variantdb.SchemaUtils;
import org.broadinstitute.hellbender.tools.variantdb.arrays.tables.ProbeInfo;
import org.broadinstitute.hellbender.tools.walkers.ReferenceConfidenceVariantContextMerger;
Expand Down Expand Up @@ -375,10 +374,10 @@ private VariantContext createVariantContextFromSampleRecord(final ProbeInfo prob
}
genotypeBuilder.alleles(genotypeAlleles);

genotypeBuilder.attribute(CommonCode.NORMX, formatFloatForVcf(normx));
genotypeBuilder.attribute(CommonCode.NORMY, formatFloatForVcf(normy));
genotypeBuilder.attribute(CommonCode.BAF, formatFloatForVcf(baf));
genotypeBuilder.attribute(CommonCode.LRR, formatFloatForVcf(lrr));
genotypeBuilder.attribute(RawArrayTsvCreator.NORMX, formatFloatForVcf(normx));
genotypeBuilder.attribute(RawArrayTsvCreator.NORMY, formatFloatForVcf(normy));
genotypeBuilder.attribute(RawArrayTsvCreator.BAF, formatFloatForVcf(baf));
genotypeBuilder.attribute(RawArrayTsvCreator.LRR, formatFloatForVcf(lrr));

genotypeBuilder.name(sample);

Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
package org.broadinstitute.hellbender.tools.variantdb.arrays;

import htsjdk.variant.variantcontext.VariantContext;
import org.broadinstitute.hellbender.tools.variantdb.CommonCode;
import org.broadinstitute.hellbender.tools.variantdb.arrays.tables.ProbeInfo;

import java.util.*;



/**
 * Columns of the array sample metadata TSV. The declaration order of the constants is the
 * column order of the header line.
 *
 * <p>{@code sample_id} and {@code sample_name} are supplied directly by the caller; every other
 * constant is a metric whose value is looked up in the metrics map under the constant's own name.
 */
public enum ArraySampleFieldEnum {
    sample_id(false),
    sample_name(false),

    // Metric columns. No required-vs-optional validation happens here: a metric that is absent
    // from the map simply yields null.
    NUM_ASSAYS(true),
    NUM_NON_FILTERED_ASSAYS(true),
    NUM_FILTERED_ASSAYS(true),
    NUM_ZEROED_OUT_ASSAYS(true),
    NUM_SNPS(true),
    NUM_INDELS(true),
    NUM_CALLS(true),
    NUM_AUTOCALL_CALLS(true),
    NUM_NO_CALLS(true),
    NUM_IN_DB_SNP(true),
    NOVEL_SNPS(true),

    PCT_DBSNP(true),
    CALL_RATE(true),
    AUTOCALL_CALL_RATE(true),
    NUM_SINGLETONS(true);

    /** True when the column value is read from the metrics map. */
    private final boolean readFromMetrics;

    ArraySampleFieldEnum(final boolean readFromMetrics) {
        this.readFromMetrics = readFromMetrics;
    }

    /**
     * Returns the value of this column from the given metrics.
     *
     * @param metricsMap metric name to metric value, keyed by the enum constant names
     * @return the value stored under this constant's name, or {@code null} if the map has none
     * @throws IllegalArgumentException for {@code sample_id} and {@code sample_name}, which are
     *         not metrics
     */
    public String getColumnValue(final Map<String, String> metricsMap) {
        if (!readFromMetrics) {
            throw new IllegalArgumentException("Not implemented");
        }
        return metricsMap.get(this.name());
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
package org.broadinstitute.hellbender.tools.variantdb.arrays;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.variantdb.IngestConstants;
import org.broadinstitute.hellbender.utils.tsv.SimpleXSVWriter;

import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

/**
 * Writes the sample metadata TSV for one array sample: a header line made of the
 * {@link ArraySampleFieldEnum} names followed by a single row holding the sample id, the sample
 * name and, when a metrics file was supplied, the metric values read from it.
 */
public class ArraySampleTsvCreator {

    private static final Logger logger = LogManager.getLogger(ArraySampleTsvCreator.class);

    private SimpleXSVWriter sampleMetadataWriter = null;
    /** Metric name to value; {@code null} when no metrics file was given. */
    private final Map<String, String> metricsMap;


    /**
     * @param metricsFilepath path of a tab-separated metrics file, or {@code null}/empty to write
     *                        the literal string "null" for every metric column
     * @throws RuntimeException if the metrics file cannot be read
     * @throws UserException if the metrics file lacks a header or values line, or has fewer
     *                       values than columns
     */
    public ArraySampleTsvCreator(String metricsFilepath) {
        if (metricsFilepath != null && !metricsFilepath.isEmpty()) {
            metricsMap = readMetrics(metricsFilepath);
        } else {
            metricsMap = null;
        }
    }

    /**
     * Parses a metrics file: lines starting with '#' and blank lines are skipped, the first
     * remaining line is the tab-separated column names and the second the matching values.
     */
    private static Map<String, String> readMetrics(final String metricsFilepath) {
        String columns = null;
        String values = null;
        // try-with-resources so the reader is closed on every path
        try (BufferedReader reader = new BufferedReader(new FileReader(metricsFilepath))) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.startsWith("#") || line.trim().isEmpty()) {
                    continue;
                }
                if (columns == null) {
                    columns = line;
                } else if (values == null) {
                    values = line;
                } else {
                    // there are more lines than expected - output a warning
                    logger.warn("more lines than expected in metrics file: " + line);
                }
            }
        } catch (IOException e) {
            throw new RuntimeException("could not read metrics file", e);
        }

        if (columns == null || values == null) {
            throw new UserException("Metrics file " + metricsFilepath
                    + " must contain a header line and a values line");
        }

        final List<String> colList = Arrays.asList(columns.split("\t"));
        // Limit -1 keeps trailing empty fields, so a row that ends in empty metrics still lines
        // up with its header instead of coming out short.
        final List<String> valList = Arrays.asList(values.split("\t", -1));
        if (valList.size() < colList.size()) {
            throw new UserException("Metrics file " + metricsFilepath + " has " + colList.size()
                    + " columns but only " + valList.size() + " values");
        }
        return IntStream.range(0, colList.size()).boxed().collect(Collectors.toMap(colList::get, valList::get));
    }

    /**
     * @return the TSV header: the names of all {@link ArraySampleFieldEnum} constants in
     *         declaration order
     */
    public static List<String> getHeaders() {
        return Arrays.stream(ArraySampleFieldEnum.values()).map(String::valueOf).collect(Collectors.toList());
    }

    /**
     * Creates the sample metadata TSV for one sample in {@code outputDirectory} and writes its
     * header and single data row. The writer stays open until {@link #closeTool()} is called.
     *
     * @param sampleName        sample name; also part of the output file name
     * @param sampleId          sample id
     * @param tableNumberPrefix inserted into the file name between the prefix and the sample name
     * @param outputDirectory   directory in which the file is created
     * @throws UserException if the file cannot be created or written
     */
    public void createRow(String sampleName, String sampleId, String tableNumberPrefix, File outputDirectory) {
        // Do not leak a writer left open by an earlier call.
        closeTool();
        try {
            // Create a metadata file to go into the metadata dir for _this_ sample
            // TODO--this should just be one file per sample set?
            final File sampleMetadataFile = new File(outputDirectory,
                    IngestConstants.sampleMetadataFilePrefix + tableNumberPrefix + sampleName + IngestConstants.FILETYPE);
            sampleMetadataWriter = new SimpleXSVWriter(sampleMetadataFile.toPath(), IngestConstants.SEPARATOR);
            sampleMetadataWriter.setHeaderLine(ArraySampleTsvCreator.getHeaders());

            final List<String> sampleMetadataRow = createSampleListRow(sampleName, sampleId);
            sampleMetadataWriter.getNewLineBuilder().setRow(sampleMetadataRow).write();

        } catch (final IOException e) {
            throw new UserException("Could not create sample outputs", e);
        }

    }

    /**
     * Builds the data row in exactly the column order of {@link #getHeaders()}, so every value
     * sits under its own header regardless of how the enum is ordered.
     */
    private List<String> createSampleListRow(String sampleName, String sampleId) {
        final List<String> row = new ArrayList<>();
        for (final ArraySampleFieldEnum fieldEnum : ArraySampleFieldEnum.values()) {
            if (fieldEnum == ArraySampleFieldEnum.sample_id) {
                row.add(sampleId);
            } else if (fieldEnum == ArraySampleFieldEnum.sample_name) {
                row.add(sampleName);
            } else if (metricsMap == null) {
                // no metrics file was supplied
                row.add("null");
            } else {
                row.add(fieldEnum.getColumnValue(metricsMap));
            }
        }
        return row;
    }

    /**
     * Closes the sample metadata writer if one is open; does nothing otherwise.
     *
     * @throws IllegalArgumentException if closing the writer fails
     */
    public void closeTool() {
        if (sampleMetadataWriter != null) {
            try {
                sampleMetadataWriter.close();
            } catch (final Exception e) {
                throw new IllegalArgumentException("Couldn't close array sample writer", e);
            } finally {
                // forget the writer so it is never closed twice
                sampleMetadataWriter = null;
            }
        }

    }
}
Loading

0 comments on commit 5475f85

Please sign in to comment.