Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GL-548 - Update CreateVat code to handle samples that do not contain all population groups. #7965

Merged
merged 4 commits into from
Aug 2, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .dockstore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ workflows:
branches:
- master
- ah_var_store
- gg_VS-548_HandleSubsetsOfSubpopulations
- name: GvsCreateVATAnnotations
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsCreateVATAnnotations.wdl
Expand Down Expand Up @@ -167,6 +168,7 @@ workflows:
branches:
- master
- ah_var_store
- gg_VS-548_HandleSubsetsOfSubpopulations
- name: GvsExtractCohortFromSampleNames
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsExtractCohortFromSampleNames.wdl
Expand Down
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsAssignIds.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ task CreateCostObservabilityTable {
fi
>>>
runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_2022_07_14"
docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_2022_08_01"
}
output {
Boolean done = true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ task Add_AS_MAX_VQSLOD_ToVcf {
File input_vcf
String output_basename

String docker = "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_2022_07_14"
String docker = "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_2022_08_01"
Int cpu = 1
Int memory_mb = 3500
Int disk_size_gb = ceil(2*size(input_vcf, "GiB")) + 50
Expand Down
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsCallsetCost.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ task WorkflowComputeCosts {
>>>

runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_2022_07_14"
docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_2022_08_01"
}

output {
Expand Down
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsCreateAltAllele.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ task PopulateAltAlleleTable {
$SERVICE_ACCOUNT_STANZA
>>>
runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_2022_07_14"
docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_2022_08_01"
memory: "3 GB"
disks: "local-disk 10 HDD"
cpu: 1
Expand Down
12 changes: 7 additions & 5 deletions scripts/variantstore/wdl/GvsCreateVAT.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ workflow GvsCreateVAT {
Array[String] contig_array = ["chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", "chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20", "chr21", "chr22", "chrX", "chrY", "chrM"]
File reference = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta"
File nirvana_data_directory = "gs://broad-dsp-spec-ops/scratch/rcremer/Nirvana/NirvanaData.tar.gz"
File AnAcAf_annotations_template = "gs://broad-dsp-spec-ops/scratch/rcremer/Nirvana/vat/custom_annotations_template.tsv"

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice!

call MakeSubpopulationFiles {
input:
Expand All @@ -46,7 +45,7 @@ workflow GvsCreateVAT {
nirvana_data_directory = nirvana_data_directory,
output_path = output_path,
service_account_json_path = service_account_json_path,
custom_annotations_template = AnAcAf_annotations_template,
custom_annotations_template = MakeSubpopulationFiles.custom_annotations_template_file,
ref = reference
}
}
Expand Down Expand Up @@ -128,6 +127,7 @@ task MakeSubpopulationFiles {
}
}
String output_ancestry_filename = "ancestry_mapping.tsv"
String custom_annotations_template_filename = "custom_annotations_template.tsv"
String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'
String updated_input_ancestry_file = basename(input_ancestry_file)
String updated_input_vcfs_file = basename(inputFileofFileNames)
Expand All @@ -149,13 +149,14 @@ task MakeSubpopulationFiles {
## the ancestry file is processed down to a simple mapping from sample to subpopulation
python3 /app/extract_subpop.py \
--input_path ~{updated_input_ancestry_file} \
--output_path ~{output_ancestry_filename}
--output_path ~{output_ancestry_filename} \
--custom_annotations_template_path ~{custom_annotations_template_filename}
>>>

# ------------------------------------------------
# Runtime settings:
runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_2022_07_14"
docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_2022_08_01"
memory: "1 GB"
preemptible: 3
cpu: "1"
Expand All @@ -164,7 +165,8 @@ task MakeSubpopulationFiles {
# ------------------------------------------------
# Outputs:
output {
File ancestry_mapping_list = "~{output_ancestry_filename}"
File ancestry_mapping_list = output_ancestry_filename
File custom_annotations_template_file = custom_annotations_template_filename
Array[File] input_vcfs = read_lines(updated_input_vcfs_file)
Array[File] input_vcf_indices = read_lines(updated_input_indices_file)
}
Expand Down
24 changes: 9 additions & 15 deletions scripts/variantstore/wdl/GvsCreateVATAnnotations.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -107,16 +107,6 @@ task ExtractAnAcAfFromVCF {
gsutil cp ~{input_vcf_index} ~{local_input_vcf_index}
gsutil cp ~{ref} Homo_sapiens_assembly38.fasta

# expected_subpopulations = [
# "afr",
# "amr",
# "eas",
# "eur",
# "mid",
# "oth",
# "sas"
#]

echo_date "VAT: Convert input to BCF format"
bcftools convert --threads 4 -O b -o original.bcf ~{local_input_vcf}
rm ~{local_input_vcf}
Expand Down Expand Up @@ -166,11 +156,15 @@ task ExtractAnAcAfFromVCF {
cat duplicates.tsv >> track_dropped.tsv
rm duplicates.tsv ## clean up unneeded file

echo_date "VAT: calculate annotations for all subpopulations"
SUBPOPS=$(cut -f 2 ~{subpopulation_sample_list} | sort | uniq | tr '\n' ' ')
echo_date "VAT: calculate annotations for the following subpopulations '$SUBPOPS'"

VCF_FIELDS=$(grep ^#CHROM ~{custom_annotations_template} | sed s/^#/%/g | awk '{gsub("\t","\\t%",$0); print;}')
echo_date "VAT: Here are the VCF fields to pull with bcftools: $VCF_FIELDS"

## AC_het,AC_hom and AC_Hemi are used to calculate the participant count
bcftools plugin fill-tags --threads 4 -- deduplicated.bcf -S ~{subpopulation_sample_list} -t AC,AF,AN,AC_het,AC_hom,AC_Hemi | bcftools query -f \
'%CHROM\t%POS\t%REF\t%ALT\t%AC\t%AN\t%AF\t%AC_Hom\t%AC_Het\t%AC_Hemi\t%AC_afr\t%AN_afr\t%AF_afr\t%AC_Hom_afr\t%AC_Het_afr\t%AC_Hemi_afr\t%AC_amr\t%AN_amr\t%AF_amr\t%AC_Hom_amr\t%AC_Het_amr\t%AC_Hemi_amr\t%AC_eas\t%AN_eas\t%AF_eas\t%AC_Hom_eas\t%AC_Het_eas\t%AC_Hemi_eas\t%AC_eur\t%AN_eur\t%AF_eur\t%AC_Hom_eur\t%AC_Het_eur\t%AC_Hemi_eur\t%AC_mid\t%AN_mid\t%AF_mid\t%AC_Hom_mid\t%AC_Het_mid\t%AC_Hemi_mid\t%AC_oth\t%AN_oth\t%AF_oth\t%AC_Hom_oth\t%AC_Het_oth\t%AC_Hemi_oth\t%AC_sas\t%AN_sas\t%AF_sas\t%AC_Hom_sas\t%AC_Het_sas\t%AC_Hemi_sas\n' \
>> ~{custom_annotations_file_name}
"$VCF_FIELDS\n" >> ~{custom_annotations_file_name}

## for validation of the pipeline
wc -l ~{custom_annotations_file_name} | awk '{print $1 -7}' > count.txt
Expand All @@ -186,7 +180,7 @@ task ExtractAnAcAfFromVCF {
# ------------------------------------------------
# Runtime settings:
runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_2022_07_14"
docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_2022_08_01"
maxRetries: 3
memory: "16 GB"
preemptible: 3
Expand Down Expand Up @@ -317,7 +311,7 @@ task PrepAnnotationJson {
# ------------------------------------------------
# Runtime settings:
runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_2022_07_14"
docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_2022_08_01"
memory: "8 GB"
preemptible: 5
cpu: "1"
Expand Down
4 changes: 2 additions & 2 deletions scripts/variantstore/wdl/GvsCreateVATFromAnnotations.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ task GetAnnotations {
# ------------------------------------------------
# Runtime settings:
runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_2022_07_14"
docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_2022_08_01"
memory: "1 GB"
preemptible: 3
cpu: "1"
Expand Down Expand Up @@ -151,7 +151,7 @@ task PrepAnnotationJson {
# ------------------------------------------------
# Runtime settings:
runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_2022_07_14"
docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_2022_08_01"
memory: "8 GB"
preemptible: 5
cpu: "1"
Expand Down
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsImportGenomes.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -426,7 +426,7 @@ task CurateInputLists {
--output_files True
>>>
runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_2022_07_14"
docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_2022_08_01"
memory: "3 GB"
disks: "local-disk 100 HDD"
bootDiskSizeGb: 15
Expand Down
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsPrepareRangesCallset.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ task PrepareRangesCallsetTask {
}

runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_2022_07_14"
docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_2022_08_01"
memory: "3 GB"
disks: "local-disk 100 HDD"
bootDiskSizeGb: 15
Expand Down
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsUtils.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -374,7 +374,7 @@ task ScaleXYBedValues {
}

runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_2022_07_14"
docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_2022_08_01"
maxRetries: 3
memory: "7 GB"
preemptible: 3
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -172,13 +172,13 @@ def get_subpopulation_calculations(subpop_annotations):
max_af = None
max_sc = None
max_subpop = ""
for gvs_subpop in gvs_subpopulations: # note these will break if a subpopulation is missing
subpop_ac_val = subpop_annotations.get("_".join(["AC", gvs_subpop]))
subpop_an_val = subpop_annotations.get("_".join(["AN", gvs_subpop]))
subpop_af_val = subpop_annotations.get("_".join(["AF", gvs_subpop]))
for gvs_subpop in gvs_subpopulations:
subpop_ac_val = subpop_annotations.get("_".join(["AC", gvs_subpop]), 0)
subpop_an_val = subpop_annotations.get("_".join(["AN", gvs_subpop]), 0)
subpop_af_val = subpop_annotations.get("_".join(["AF", gvs_subpop]), None)
# note: we assume AC_Hom is even, because by its nature each homozygous sample contributes two alleles, but an upstream error could violate this
subpop_sc_val = int(subpop_annotations.get("_".join(["AC_Hom", gvs_subpop])) / 2 ) + subpop_annotations.get("_".join(["AC_Het", gvs_subpop])) + subpop_annotations.get("_".join(["AC_Hemi", gvs_subpop]))
# here we set the subpopulation ac/an/af values
subpop_sc_val = int(subpop_annotations.get("_".join(["AC_Hom", gvs_subpop]), 0) / 2 ) + subpop_annotations.get("_".join(["AC_Het", gvs_subpop]), 0) + subpop_annotations.get("_".join(["AC_Hemi", gvs_subpop]), 0)

row["_".join(["gvs", gvs_subpop, "ac"])] = subpop_ac_val
row["_".join(["gvs", gvs_subpop, "an"])] = subpop_an_val
row["_".join(["gvs", gvs_subpop, "af"])] = subpop_af_val
Expand Down Expand Up @@ -377,9 +377,9 @@ def make_annotation_jsons(annotated_json, output_json, output_genes_json):

if __name__ == '__main__':
parser = argparse.ArgumentParser(allow_abbrev=False, description='Create BQ load friendly jsons for VAT creation')
parser.add_argument('--annotated_json',type=str, help='nirvana created annotation json', required=True)
parser.add_argument('--output_vt_json',type=str, help='name of the vt json', required=True)
parser.add_argument('--output_genes_json',type=str, help='name of the genes json', required=True)
parser.add_argument('--annotated_json', type=str, help='nirvana created annotation json', required=True)
parser.add_argument('--output_vt_json', type=str, help='name of the vt json', required=True)
parser.add_argument('--output_genes_json', type=str, help='name of the genes json', required=True)

args = parser.parse_args()

Expand Down
46 changes: 40 additions & 6 deletions scripts/variantstore/wdl/extract/extract_subpop.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import csv
import argparse

expected_subpopulations = [
valid_subpopulations = [
"afr",
"amr",
"eas",
Expand All @@ -14,19 +14,53 @@
def extract_subpopulation(input_path, output_path):
    """Reduce a callset ancestry TSV to a two-column sample/subpopulation TSV.

    The first row of ``input_path`` is treated as a header and skipped. For
    every remaining row, column 0 and column 4 (the subpopulation label) are
    written to ``output_path`` as one tab-separated, newline-terminated line.

    Args:
        input_path: path of the tab-separated file to read.
        output_path: path of the tab-separated file to write (overwritten).

    Returns:
        The set of distinct subpopulation labels (column 4 values) observed.

    Raises:
        ValueError: if a row's subpopulation label is not listed in the
            module-level ``valid_subpopulations``.
    """
    with open(input_path, newline='') as tsvin, open(output_path, 'w', newline='') as csvout:
        reader = csv.reader(tsvin, delimiter='\t')
        writer = csv.writer(csvout, delimiter='\t', lineterminator="\n")
        next(reader)  # Skip header row

        observed_subpopulations = set()
        for row in reader:
            subpopulation = row[4]
            if subpopulation not in valid_subpopulations:
                # Use the function's own parameter in the message: the global
                # `args` only exists when this file is run as a script, so
                # referencing it here would raise NameError for importers.
                raise ValueError(f"Unrecognized subpopulation: {subpopulation} in {input_path}")
            observed_subpopulations.add(subpopulation)
            writer.writerow([row[0], subpopulation])

    return observed_subpopulations

def write_custom_annotations_files(observed_subpopulations, custom_annotations_template_path):
    """Write the custom-annotations template TSV for the given subpopulations.

    The file starts with three fixed single-column rows (title, assembly,
    matchVariantsBy) followed by four tab-separated rows: ``#CHROM``,
    ``#categories``, ``#descriptions`` and ``#type``. Each of those four rows
    holds the overall AC/AN/AF/AC_Hom/AC_Het/AC_Hemi columns and then the same
    six columns, suffixed with ``_<subpopulation>``, for every observed
    subpopulation in sorted order.

    Args:
        observed_subpopulations: iterable of subpopulation labels to include.
        custom_annotations_template_path: path of the file to write (overwritten).
    """
    annotation_names = ["AC", "AN", "AF", "AC_Hom", "AC_Het", "AC_Hemi"]
    annotation_categories = ["AlleleCount", "AlleleNumber", "AlleleFrequency",
                             "AlleleCount", "AlleleCount", "AlleleCount"]
    columns_per_group = len(annotation_names)

    with open(custom_annotations_template_path, 'w', newline='') as handle:
        writer = csv.writer(handle, delimiter='\t', lineterminator="\n")

        # Fixed preamble rows.
        for preamble in ("#title=gvsAnnotations", "#assembly=GRCh38", "#matchVariantsBy=allele"):
            writer.writerow([preamble])

        # Columns for the whole callset (no subpopulation suffix).
        header_row = ["#CHROM", "POS", "REF", "ALT"] + annotation_names
        category_row = ["#categories", ".", ".", "."] + annotation_categories
        description_row = ["#descriptions", ".", ".", "."] + ["."] * columns_per_group
        type_row = ["#type", ".", ".", "."] + ["number"] * columns_per_group

        # One group of six columns per observed subpopulation, in sorted order.
        for group in sorted(observed_subpopulations):
            header_row.extend(f"{name}_{group}" for name in annotation_names)
            category_row.extend(annotation_categories)
            description_row.extend(["."] * columns_per_group)
            type_row.extend(["number"] * columns_per_group)

        writer.writerows([header_row, category_row, description_row, type_row])

if __name__ == '__main__':
    # Command-line entry point: build the sample-to-subpopulation mapping TSV,
    # then write a custom-annotations template covering only the
    # subpopulations that were actually observed in the input.
    parser = argparse.ArgumentParser(allow_abbrev=False, description='Extract subpopulation per sample data out of a callset TSV')
    # Each option is registered exactly once; argparse rejects a second
    # registration of the same option string.
    parser.add_argument('--input_path', type=str, metavar='path', help='path to the original callset TSV', required=True)
    parser.add_argument('--output_path', type=str, metavar='path', help='path for the output TSV', required=True)
    parser.add_argument('--custom_annotations_template_path', type=str, metavar='path', help='path for the custom annotations template file', required=True)

    args = parser.parse_args()

    # Run the extraction a single time and feed its result to the template writer.
    observed_subpopulations = extract_subpopulation(args.input_path,
                                                    args.output_path)
    write_custom_annotations_files(observed_subpopulations,
                                   args.custom_annotations_template_path)