Skip to content

Commit

Permalink
Add filtering to ExtractCohort (#6971)
Browse files Browse the repository at this point in the history
* adding filtering to extractCohort

* addressing comments
  • Loading branch information
meganshand authored and kcibul committed Mar 9, 2021
1 parent 9c73c39 commit 7b5dfd4
Show file tree
Hide file tree
Showing 7 changed files with 164 additions and 82 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -157,14 +157,7 @@ public static Set<VCFHeaderLine> getEvoquerVcfHeaderLines() {
headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.SB_TABLE_KEY));

headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.AS_VQS_LOD_KEY));

// TODO: Temporary. We don't really want these as FORMAT fields,
// headerLines.add(
// new VCFInfoHeaderLine(AS_VQS_LOD_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.String, "For each alt allele, the log odds of being a true variant versus being false under the trained gaussian mixture model")
// );
// headerLines.add(
// new VCFInfoHeaderLine(AS_YNG_STATUS_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.String, "For each alt allele, status of the YNG filter")
// );
headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.AS_YNG_STATUS_KEY));

headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.AS_VARIANT_DEPTH_KEY));
headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.VARIANT_DEPTH_KEY));
Expand All @@ -181,13 +174,9 @@ public static Set<VCFHeaderLine> getEvoquerVcfHeaderLines() {

headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.SB_TABLE_KEY));

// TODO: There must be a more appropriate constant to use for these
// headerLines.add(new VCFFilterHeaderLine(PASSES_FILTERS_v4, "PASSING"));

// TODO: fix these
// headerLines.add(new VCFFilterHeaderLine("NAY", "Site is Nay in the YNG table"));
// headerLines.add(new VCFFilterHeaderLine("VQSRTrancheSNP", "Site fails to exceed the SNP tranch threshold"));
headerLines.add(new VCFFilterHeaderLine("VQSRTrancheINDEL", "Site fails to exceel the INDEL tranch threshold"));
headerLines.add(GATKVCFHeaderLines.getFilterLine(GATKVCFConstants.VQSR_TRANCHE_SNP));
headerLines.add(GATKVCFHeaderLines.getFilterLine(GATKVCFConstants.VQSR_TRANCHE_INDEL));
headerLines.add(GATKVCFHeaderLines.getFilterLine(GATKVCFConstants.NAY_FROM_YNG));

return headerLines;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,17 @@ public class SchemaUtils {
public static final String CALL_PID = GENOTYPE_FIELD_PREFIX + "PID";
public static final String CALL_PL = GENOTYPE_FIELD_PREFIX + "PL";

// Field names of the filtering table
public static final String FILTER_SET_NAME = "filter_set_name";
public static final String TYPE = "type";
public static final String VQSLOD = "vqslod";
public static final String CULPRIT = "culprit";
public static final String TRAINING_LABEL = "training_label";
public static final String YNG_STATUS = "yng_status";

public static final List<String> SAMPLE_FIELDS = Arrays.asList(SchemaUtils.SAMPLE_NAME_FIELD_NAME, SchemaUtils.SAMPLE_ID_FIELD_NAME);
public static final List<String> YNG_FIELDS = Arrays.asList(LOCATION_FIELD_NAME, REF_ALLELE_FIELD_NAME, ALT_ALLELE_FIELD_NAME);
public static final List<String> YNG_FIELDS = Arrays.asList(FILTER_SET_NAME, LOCATION_FIELD_NAME, REF_ALLELE_FIELD_NAME, ALT_ALLELE_FIELD_NAME, VQSLOD, YNG_STATUS);


public static final List<String> PET_FIELDS = Arrays.asList(LOCATION_FIELD_NAME, SAMPLE_ID_FIELD_NAME, STATE_FIELD_NAME);
public static final List<String> VET_FIELDS = Arrays.asList(SAMPLE_ID_FIELD_NAME, LOCATION_FIELD_NAME, REF_ALLELE_FIELD_NAME, ALT_ALLELE_FIELD_NAME, AS_RAW_MQ,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,11 @@
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.programgroups.ShortVariantDiscoveryProgramGroup;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.variantdb.CommonCode;
import org.broadinstitute.hellbender.tools.variantdb.SampleList;
import org.broadinstitute.hellbender.tools.variantdb.SchemaUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;

import java.util.*;

Expand Down Expand Up @@ -37,6 +40,12 @@ public class ExtractCohort extends ExtractTool {
)
private String cohortTable = null;

@Argument(
fullName = "filter-set-name",
doc = "Name in filter_set_name column of filtering table to use. Which training set should be applied in extract."
)
private String filterSetName = null;

@Override
protected void onStartup() {
super.onStartup();
Expand All @@ -46,6 +55,17 @@ protected void onStartup() {

VCFHeader header = CommonCode.generateVcfHeader(sampleNames, reference.getSequenceDictionary());

if (minLocation == null && maxLocation == null && hasUserSuppliedIntervals()) {
final SimpleInterval firstInterval = getTraversalIntervals().get(0);
final SimpleInterval lastInterval = getTraversalIntervals().get(getTraversalIntervals().size() - 1);

minLocation = SchemaUtils.encodeLocation(firstInterval.getContig(), firstInterval.getStart());
maxLocation = SchemaUtils.encodeLocation(lastInterval.getContig(), lastInterval.getEnd());
} else if ((minLocation != null || maxLocation != null) && hasUserSuppliedIntervals()) {
throw new UserException("min-location and max-location should not be used together with intervals (-L).");
}


engine = new ExtractCohortEngine(
projectID,
vcfWriter,
Expand All @@ -63,7 +83,8 @@ protected void onStartup() {
vqsLodSNPThreshold,
vqsLodINDELThreshold,
progressMeter,
queryMode);
queryMode,
filterSetName);
vcfWriter.writeHeader(header);
}

Expand Down
Loading

0 comments on commit 7b5dfd4

Please sign in to comment.