diff --git a/.dockstore.yml b/.dockstore.yml index 88fe91bc..5395b64b 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -1,50 +1,10 @@ -# The first line refers to the version 1.2 of the .dockstore.yml schema version: 1.2 -# An array of workflows. Each element corresponds to a workflow on Dockstore. workflows: - - # The optional workflow name for a workflow, which may only consist of alphanumerics - # and internal underscores and hyphens, but no spaces or other characters. Names may not exceed 256 characters. - # If using a .dockstore.yml with multiple workflows, this field is required - # to uniquely identify workflows in the repository. - # - # It should be noted that having the name come first is an arbitrary decision. - # You could use subclass instead, for instance. Provided arrays are not broken - # up, the order of fields within a .dockstore.yml is not important. - name: HiFi-human-WGS-WDL - - # The descriptor language used for the workflow. CWL, WDL, NFL (Nextflow), or GALAXY. - # This cannot be changed once the workflow is registered. subclass: WDL - - # Workflow-wide setting that will affect ALL branches/tags; only set this as needed in a main branch. - # Set to true to publish an unpublished workflow, or false to unpublish a published workflow. - # Omitting the publish setting leaves the publish-state unchanged (recommended for all non-primary branches). - # publish: - - # The absolute path to the primary descriptor file in the Git repository. - # - For CWL, the primary descriptor is a .cwl file. - # - For WDL, the primary descriptor is a .wdl file. - # - For Galaxy, the primary descriptor is a .ga file. - # - Nextflow differs from these as the primary descriptor is a nextflow.config file. primaryDescriptorPath: /workflows/main.wdl - - # An optional array of absolute paths to test parameter files in the Git repository. - # For example... - # testParameterFiles: - # - /null-model/null-model.json - # - /null-model/null-model-binary.json - # testParameterFiles: - - # An optional path to a workflow-specific readme in the Git repository. If not provided, Dockstore will show - # the readme.md present at the root of the Git repository if it is present. - # If you have multiple workflows in a single Git repository, it is recommend to give each one a readme. readMePath: /README.md - - # An optional array of authorship information. - # Note that if orcid is present, then all other fields will be ignored, as information will be taken from orcid. - # If orcid is not present, make sure to at a minimum include the name field for each author. authors: - orcid: 0000-0001-5921-2022 # Juniper Lake - orcid: 0000-0001-7628-5645 # Gregory Concepcion @@ -52,18 +12,36 @@ workflows: - orcid: 0000-0002-7422-1194 # William Rowell - orcid: 0000-0002-5507-0896 # Heather Ward - orcid: 0009-0001-0205-4614 # Karen Fang - - # A boolean that will change the default version to be displayed on Dockstore. Default: False. - # A value of true will automatically display the latest tag updated as default. - # A value of false will retain the default version that has been specified via the Dockstore UI. latestTagAsDefault: False - - # The optional filters section allow specifying sets of Git branches and tags to include for the workflow. - # If no filters are given, all branches and tags are included. - # Branches and tags are arrays of pattern-strings. 
- # Pattern-strings use Unix-style Glob syntax by default (Ex: `develop`, `myworkflow/**`) - # https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/nio/file/FileSystem.html#getPathMatcher(java.lang.String) - # or RegEx when the string is surrounded by / (Ex: `/develop/`, `/myworkflow\/.*/`). + filters: + tags: [ /v1\..*dockstore/ ] + - name: HiFi-human-WGS-WDL-singleton + subclass: WDL + primaryDescriptorPath: /workflows/singleton.wdl + readMePath: /docs/singleton.md + authors: + - orcid: 0000-0001-5921-2022 # Juniper Lake + - orcid: 0000-0001-7628-5645 # Gregory Concepcion + - orcid: 0000-0003-1183-0432 # Aaron Wenger + - orcid: 0000-0002-7422-1194 # William Rowell + - orcid: 0000-0002-5507-0896 # Heather Ward + - orcid: 0009-0001-0205-4614 # Karen Fang + latestTagAsDefault: False + filters: + branches: [ /.*dockstore/ ] + tags: [ /v.*dockstore/ ] + - name: HiFi-human-WGS-WDL-family + subclass: WDL + primaryDescriptorPath: /workflows/family.wdl + readMePath: /docs/family.md + authors: + - orcid: 0000-0001-5921-2022 # Juniper Lake + - orcid: 0000-0001-7628-5645 # Gregory Concepcion + - orcid: 0000-0003-1183-0432 # Aaron Wenger + - orcid: 0000-0002-7422-1194 # William Rowell + - orcid: 0000-0002-5507-0896 # Heather Ward + - orcid: 0009-0001-0205-4614 # Karen Fang + latestTagAsDefault: False filters: branches: [ /.*dockstore/ ] - tags: [ /v.*/ ] + tags: [ /v.*dockstore/ ] diff --git a/.gitmodules b/.gitmodules index 8c759522..6bc7de6b 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "workflows/wdl-common"] path = workflows/wdl-common - url = https://github.com/PacificBiosciences/wdl-common + url = ../wdl-common.git diff --git a/GRCh38.ref_map.v2p0p0.template.tsv b/GRCh38.ref_map.v2p0p0.template.tsv new file mode 100644 index 00000000..3a095aaf --- /dev/null +++ b/GRCh38.ref_map.v2p0p0.template.tsv @@ -0,0 +1,12 @@ +name GRCh38 +fasta /hifi-wdl-resources-v2.0.0/GRCh38/human_GRCh38_no_alt_analysis_set.fasta +fasta_index /hifi-wdl-resources-v2.0.0/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai +pbsv_splits /hifi-wdl-resources-v2.0.0/GRCh38/human_GRCh38_no_alt_analysis_set.pbsv_splits.json +pbsv_tandem_repeat_bed /hifi-wdl-resources-v2.0.0/GRCh38/human_GRCh38_no_alt_analysis_set.trf.bed +trgt_tandem_repeat_bed /hifi-wdl-resources-v2.0.0/GRCh38/trgt/human_GRCh38_no_alt_analysis_set.trgt.v0.3.4.bed +hificnv_exclude_bed /hifi-wdl-resources-v2.0.0/GRCh38/hificnv/cnv.excluded_regions.common_50.hg38.bed.gz +hificnv_exclude_bed_index /hifi-wdl-resources-v2.0.0/GRCh38/hificnv/cnv.excluded_regions.common_50.hg38.bed.gz.tbi +hificnv_expected_bed_male /hifi-wdl-resources-v2.0.0/GRCh38/hificnv/expected_cn.hg38.XY.bed +hificnv_expected_bed_female /hifi-wdl-resources-v2.0.0/GRCh38/hificnv/expected_cn.hg38.XX.bed +pharmcat_positions_vcf /hifi-wdl-resources-v2.0.0/GRCh38/pharmcat/pharmcat_positions_2.15.4.vcf.bgz +pharmcat_positions_vcf_index /hifi-wdl-resources-v2.0.0/GRCh38/pharmcat/pharmcat_positions_2.15.4.vcf.bgz.csi diff --git a/GRCh38.tertiary_map.v2p0p0.template.tsv b/GRCh38.tertiary_map.v2p0p0.template.tsv new file mode 100644 index 00000000..86b7b3b6 --- /dev/null +++ b/GRCh38.tertiary_map.v2p0p0.template.tsv @@ -0,0 +1,12 @@ +slivar_js /hifi-wdl-resources-v2.0.0/slivar/slivar-functions.v0.2.8.js +ensembl_gff /hifi-wdl-resources-v2.0.0/GRCh38/ensembl.GRCh38.101.reformatted.gff3.gz +lof_lookup /hifi-wdl-resources-v2.0.0/slivar/lof_lookup.v2.1.1.txt +clinvar_lookup /hifi-wdl-resources-v2.0.0/slivar/clinvar_gene_desc.20240624T165443.txt +slivar_gnotate_files 
/hifi-wdl-resources-v2.0.0/GRCh38/slivar_gnotate/gnomad.hg38.v4.1.custom.v1.zip,/hifi-wdl-resources-v2.0.0/GRCh38/slivar_gnotate/CoLoRSdb.GRCh38.v1.1.0.deepvariant.glnexus.zip +slivar_gnotate_prefixes gnomad,colors +slivar_max_af 0.03 +slivar_max_nhomalt 4 +slivar_max_ac 4 +slivar_min_gq 5 +svpack_pop_vcfs /hifi-wdl-resources-v2.0.0/GRCh38/sv_pop_vcfs/gnomad.v4.1.sv.sites.pass.vcf.gz,/hifi-wdl-resources-v2.0.0/GRCh38/sv_pop_vcfs/CoLoRSdb.GRCh38.v1.1.0.pbsv.jasmine.vcf.gz +svpack_pop_vcf_indices /hifi-wdl-resources-v2.0.0/GRCh38/sv_pop_vcfs/gnomad.v4.1.sv.sites.pass.vcf.gz.tbi,/hifi-wdl-resources-v2.0.0/GRCh38/sv_pop_vcfs/CoLoRSdb.GRCh38.v1.1.0.pbsv.jasmine.vcf.gz.tbi \ No newline at end of file diff --git a/LICENSE b/LICENSE index aaea0c14..9d176165 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2023, Pacific Biosciences of California, Inc. +Copyright (c) 2024, Pacific Biosciences of California, Inc. All rights reserved. diff --git a/README.md b/README.md index 4a91b386..e681468e 100644 --- a/README.md +++ b/README.md @@ -4,273 +4,137 @@ Workflow for analyzing human PacBio whole genome sequencing (WGS) data using [Workflow Description Language (WDL)](https://openwdl.org/). -- Docker images used by this workflow are defined in [the wdl-dockerfiles repo](https://github.com/PacificBiosciences/wdl-dockerfiles). Images are hosted in PacBio's [quay.io](https://quay.io/organization/pacbio). -- Common tasks that may be reused within or between workflows are defined in [the wdl-common repo](https://github.com/PacificBiosciences/wdl-common). +- Docker images used by this workflow are defined in [the wdl-dockerfiles repo](../../wdl-dockerfiles). Images are hosted in PacBio's [quay.io](https://quay.io/organization/pacbio). +- Common tasks that may be reused within or between workflows are defined in [the wdl-common repo](../../wdl-common). -# Workflow +## Workflow -**Workflow entrypoint**: [workflows/main.wdl](workflows/main.wdl) +Starting in v2, this repo contains two related workflows. The `singleton` workflow is designed to analyze a single sample, while the `family` workflow is designed to analyze a family of related samples. With the exception of the joint calling tasks in the `family` workflow, both workflows make use of the same tasks, although the input and output structure differ. -PacBio WGS Variant Pipeline performs read alignment, variant calling, and phasing. Joint-calling of small variants and structural variants for cohorts and optional variant filtering and annotation is also available for HiFi human WGS. The workflow can run using Azure, AWS, GCP, and HPC backends. +The `singleton` workflow will be best for most use cases, including analysis of rare disease probands without data from relatives, or non-rare disease applications. The input and output structures are relatively flat, which should improve compatibility with platforms like Terra. -![PacBio WGS Variant Pipeline diagram](https://github.com/PacificBiosciences/HiFi-human-WGS-WDL/raw/main/images/main.graphviz.svg "PacBio WGS Variant Pipeline diagram") +The `family` workflow adds joint-calling for small variants and structural variants and is best suited for rare disease applications where data is available from multiple samples in a family. The input and output structures are more complex and may require some additional work to integrate with other platforms. + +Both workflows are designed to analyze human PacBio whole genome sequencing (WGS) data. 
The workflows are designed to be run on Azure, AWS HealthOmics, GCP, or HPC backends. + +**Workflow entrypoint**: + +- [workflows/singleton.wdl](../../blob/main/workflows/singleton.wdl) +- [workflows/family.wdl](../../blob/main/workflows/family.wdl) ## Setup -We recommend cloning the repo rather than downloading the release package. Some tasks and workflows are pulled in from other repositories. Ensure you have initialized submodules following cloning by running `git submodule update --init --recursive`. +This is an actively developed workflow with multiple versioned releases, and we make use of git submodules for common tasks that are shared by multiple workflows. There are two ways to ensure you are using a supported release of the workflow and ensure that the submodules are correctly initialized: + +1) Download the release zips directly from a [supported release](../releases/tag/v2.0.0-rc6): + + ```bash + wget https://github.com/PacificBiosciences/HiFi-human-WGS-WDL/releases/download/v2.0.0-rc6/hifi-human-wgs-singleton.zip + wget https://github.com/PacificBiosciences/HiFi-human-WGS-WDL/releases/download/v2.0.0-rc6/hifi-human-wgs-family.zip + ``` + +2) Clone the repository and initialize the submodules: + + ```bash + git clone \ + --depth 1 --branch v2.0.0-rc6 \ + --recursive \ + https://github.com/PacificBiosciences/HiFi-human-WGS-WDL.git + ``` ## Resource requirements -The workflow requires at minimum 64 cores and 256 GB of RAM. Ensure that the backend environment you're using has enough quota to run the workflow. +The most resource-heavy step in the workflow requires 64 cores and 256 GB of RAM. Ensure that the backend environment you're using has enough quota to run the workflow. + +On some backends, you may be able to make use of a GPU to accelerate the DeepVariant step. The GPU is not required, but it can significantly speed up the workflow. If you have access to a GPU, you can set the `gpu` parameter to `true` in the inputs JSON file. ## Reference datasets and associated workflow files -Reference datasets are hosted publicly for use in the pipeline. For data locations, see the [backend-specific documentation](backends/) and template inputs files for each backend with paths to publicly hosted reference files filled out. +Reference datasets are hosted publicly for use in the pipeline. For data locations, see the [backend-specific documentation](./backends) and template inputs files for each backend with paths to publicly hosted reference files filled out. -# Running the workflow +## Setting up and executing the workflow 1. [Select a backend environment](#selecting-a-backend) 2. [Configure a workflow execution engine in the chosen environment](#configuring-a-workflow-engine) 3. [Fill out the inputs JSON file for your cohort](#filling-out-the-inputs-json) 4. [Run the workflow](#running-the-workflow-1) -## Selecting a backend +### Selecting a backend The workflow can be run on Azure, AWS, GCP, or HPC. Your choice of backend will largely be determined by the location of your data. For backend-specific configuration, see the relevant documentation: -- [Azure](backends/azure) -- [AWS](backends/aws) -- [GCP](backends/gcp) -- [HPC](backends/hpc) +- [Azure](../backend-azure) +- [AWS](../backend-aws-healthomics) +- [GCP](../backend-gcp) +- [HPC](../backend-hpc) -## Configuring a workflow engine and container runtime +### Configuring a workflow engine and container runtime An execution engine is required to run workflows. 
Two popular engines for running WDL-based workflows are [`miniwdl`](https://miniwdl.readthedocs.io/en/latest/getting_started.html) and [`Cromwell`](https://cromwell.readthedocs.io/en/stable/tutorials/FiveMinuteIntro/). Because workflow dependencies are containerized, a container runtime is required. This workflow has been tested with [Docker](https://docs.docker.com/get-docker/) and [Singularity](https://docs.sylabs.io/guides/3.10/user-guide/) container runtimes. -See the [backend-specific documentation](backends) for details on setting up an engine. +See the [backend-specific documentation](../backends) for details on setting up an engine. | Engine | Azure | AWS | GCP | HPC | | :- | :- | :- | :- | :- | -| [**miniwdl**](https://github.com/chanzuckerberg/miniwdl#scaling-up) | _Unsupported_ | Supported via the [Amazon Genomics CLI](https://aws.amazon.com/genomics-cli/) | _Unsupported_ | (SLURM only) Supported via the [`miniwdl-slurm`](https://github.com/miniwdl-ext/miniwdl-slurm) plugin | -| [**Cromwell**](https://cromwell.readthedocs.io/en/stable/backends/Backends/) | Supported via [Cromwell on Azure](https://github.com/microsoft/CromwellOnAzure) | Supported via the [Amazon Genomics CLI](https://aws.amazon.com/genomics-cli/) | Supported via Google's [Pipelines API](https://cromwell.readthedocs.io/en/stable/backends/Google/) | Supported - [Configuration varies depending on HPC infrastructure](https://cromwell.readthedocs.io/en/stable/tutorials/HPCIntro/) | +| [**miniwdl**](https://github.com/chanzuckerberg/miniwdl#scaling-up) | _Unsupported_ | Supported via [AWS HealthOmics](https://aws.amazon.com/healthomics/) | _Unsupported_ | (SLURM only) Supported via the [`miniwdl-slurm`](https://github.com/miniwdl-ext/miniwdl-slurm) plugin | +| [**Cromwell**](https://cromwell.readthedocs.io/en/stable/backends/Backends/) | Supported via [Cromwell on Azure](https://github.com/microsoft/CromwellOnAzure) | _Unsupported_ | Supported via Google's [Pipelines API](https://cromwell.readthedocs.io/en/stable/backends/Google/) | Supported - [Configuration varies depending on HPC infrastructure](https://cromwell.readthedocs.io/en/stable/tutorials/HPCIntro/) | -## Filling out the inputs JSON +### Filling out the inputs JSON The input to a workflow run is defined in JSON format. Template input files with reference dataset information filled out are available for each backend: -- [Azure](backends/azure/inputs.azure.json) -- [AWS](backends/aws/inputs.aws.json) -- [GCP](backends/gcp/inputs.gcp.json) -- [HPC](backends/hpc/inputs.hpc.json) +- [Azure](../../blob/main/backends/azure/singleton.azure.inputs.json) +- [AWS](../../blob/main/backends/aws-healthomics/singleton.healthomics.inputs.json) +- [GCP](../../blob/main/backends/gcp/singleton.gcp.inputs.json) +- [HPC](../../blob/main/backends/hpc/singleton.hpc.inputs.json) Using the appropriate inputs template file, fill in the cohort and sample information (see [Workflow Inputs](#workflow-inputs) for more information on the input structure). -If using an HPC backend, you will need to download the reference bundle and replace the `` in the input template file with the local path to the reference datasets on your HPC. +If using an HPC backend, you will need to download the reference bundle and replace the `` in the input template file with the local path to the reference datasets on your HPC. If using Amazon HealthOmics, you will need to download the reference bundle, upload it to your S3 bucket, and adjust paths accordingly. 
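+For example, a minimal sketch of this path substitution for an HPC backend is shown below; `REF_DIR` and the `.local.tsv` output names are illustrative placeholders, and the snippet assumes the resource bundle has been extracted so that `${REF_DIR}/hifi-wdl-resources-v2.0.0/` exists:
+
+```bash
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Placeholder: local directory holding the extracted resource bundle,
+# i.e. ${REF_DIR}/hifi-wdl-resources-v2.0.0/ should exist.
+# (The '~' sed delimiter assumes REF_DIR itself contains no '~'.)
+REF_DIR=/path/to/references
+
+# Rewrite the resource prefix in the template map files; the .local.tsv
+# output names are illustrative only.
+for template in GRCh38.ref_map.v2p0p0.template.tsv GRCh38.tertiary_map.v2p0p0.template.tsv; do
+  sed "s~/hifi-wdl-resources-v2.0.0~${REF_DIR}/hifi-wdl-resources-v2.0.0~g" \
+    "${template}" > "${template%.template.tsv}.local.tsv"
+done
+
+# For AWS HealthOmics, the same substitution can be applied with an s3://
+# prefix after uploading the extracted bundle (e.g. with `aws s3 sync`).
+```
+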
-## Running the workflow +### Running the workflow Run the workflow using the engine and backend that you have configured ([miniwdl](#run-directly-using-miniwdl), [Cromwell](#run-directly-using-cromwell)). Note that the calls to `miniwdl` and `Cromwell` assume you are accessing the engine directly on the machine on which it has been deployed. Depending on the backend you have configured, you may be able to submit workflows using different methods (e.g. using trigger files in Azure, or using the Amazon Genomics CLI in AWS). -### Run directly using miniwdl +#### Run directly using miniwdl -`miniwdl run workflows/main.wdl -i ` +`miniwdl run workflows/singleton.wdl -i ` -### Run directly using Cromwell +#### Run directly using Cromwell -`java -jar run workflows/main.wdl -i ` +`java -jar run workflows/singleton.wdl -i ` If Cromwell is running in server mode, the workflow can be submitted using cURL. Fill in the values of CROMWELL_URL and INPUTS_JSON below, then from the root of the repository, run: -```bash -# The base URL (and port, if applicable) of your Cromwell server -CROMWELL_URL= -# The path to your inputs JSON file -INPUTS_JSON= - -(cd workflows && zip -r dependencies.zip humanwgs_structs.wdl cohort_analysis/ sample_analysis/ tertiary_analysis/ wdl-common/) -curl -X "POST" \ - "${CROMWELL_URL}/api/workflows/v1" \ - -H "accept: application/json" \ - -H "Content-Type: multipart/form-data" \ - -F "workflowSource=@workflows/main.wdl" \ - -F "workflowInputs=@${INPUTS_JSON};type=application/json" \ - -F "workflowDependencies=@workflows/dependencies.zip;type=application/zip" -``` - -To specify [workflow options](https://cromwell.readthedocs.io/en/latest/wf_options/Overview/), add the following to the request (assuming your options file is a file called `options.json` located in the `pwd`): `-F "workflowOptions=@options.json;type=application/json"`. - -# Workflow inputs - -This section describes the inputs required for a run of the workflow. Typically, only the `humanwgs.cohort` and potentially [run/backend-specific sections](#other-inputs) will be filled out by the user for each run of the workflow. Input templates with reference file locations filled out are provided [for each backend](backends). - -## [Cohort](workflows/humanwgs_structs.wdl) - -A cohort can include one or more samples. Samples need not be related, but if you plan to run tertiary analysis, it is best to think of a cohort as a family of related samples. We have tested cohorts of up to 5 samples with 30x coverage. Larger cohorts may encounter memory issues during joint calling. - -| Type | Name | Description | Notes | -| :- | :- | :- | :- | -| String | cohort_id | A unique name for the cohort; used to name outputs | | -| Array[[Sample](#sample)] | samples | The set of samples for the cohort. At least one sample must be defined. | | -| Array[String] | phenotypes | [Human Phenotype Ontology (HPO) phenotypes](https://hpo.jax.org/app/) associated with the cohort. If no particular phenotypes are desired, the root HPO term, `"HP:0000001"`, can be used. | | - -### [Sample](workflows/humanwgs_structs.wdl) - -Sample information for each sample in the workflow run. - -| Type | Name | Description | Notes | -| :- | :- | :- | :- | -| String | sample_id | A unique name for the sample; used to name outputs | | -| Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)] | movie_bams | The set of unaligned movie BAMs associated with this sample | | -| String? | sex | Sample sex | ["MALE", "FEMALE", `null`]. 
If the sex field is missing or `null`, sex will be set to unknown. Used to set the expected sex chromosome karyotype for TRGT and HiFiCNV. | -| Boolean | affected | Is this sample affected by the phenotype? | \[`true`, `false`\] | -| String? | father_id | Paternal `sample_id` | | -| String? | mother_id | Maternal `sample_id` | | - -## [ReferenceData](workflows/humanwgs_structs.wdl) - -Files associated with the reference genome. - -These files are hosted publicly in each of the cloud backends; see `backends/${backend}/inputs.${backend}.json`. - -| Type | Name | Description | Notes | -| :- | :- | :- | :- | -| String | name | Reference name; used to name outputs (e.g., "GRCh38") | Note: The workflow currently only supports GRCh38 and provides GCA_000001405.15_GRCh38_no_alt_analysis_set. | -| [IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl) | fasta | Reference genome and index | | -| File | tandem_repeat_bed | Tandem repeat locations used by [pbsv](https://github.com/PacificBiosciences/pbsv) to normalize SV representation | | -| File | trgt_tandem_repeat_bed | Tandem repeat sites to be genotyped by [TRGT](https://github.com/PacificBiosciences/trgt) | | -| [IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl) | hificnv_exclude_bed | Compressed BED and index of regions to exclude from calling by [HiFiCNV](https://github.com/PacificBiosciences/HiFiCNV). We recommend [cnv.excluded_regions.common_50.hg38.bed.gz](https://github.com/PacificBiosciences/HiFiCNV/blob/main/docs/aux_data.md). | | -| File | hificnv_expected_bed_male | BED of expected copy number for male karyotype for HiFiCNV | | -| File | hificnv_expected_bed_female | BED of expected copy number for female karyotype for HiFiCNV | | -| File? | gnomad_af | [gnomAD](https://gnomad.broadinstitute.org/) v3.1 allele frequences in [`slivar gnotate`](https://github.com/brentp/slivar/wiki/gnotate) format | required if `run_tertiary_analysis` is set to `true` | -| File? | hprc_af | Allele frequences in ~100 [Human Pangenome Reference Consortium (HPRC)](https://humanpangenome.org/) samples in `slivar gnotate` format | required if `run_tertiary_analysis` is set to `true` | -| File? | gff | [Ensembl](https://useast.ensembl.org/index.html) GFF3 reference annotation | required if `run_tertiary_analysis` is set to `true` | -| Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)?] | population_vcfs | An array of structural variant population VCFs | required if `run_tertiary_analysis` is set to `true` | - -## [SlivarData](workflows/humanwgs_structs.wdl) - -Files associated with `slivar` annotation. These are required if `run_tertiary_analysis` is set to `true`. - -These files are hosted publicly in each of the cloud backends; see `backends/${backend}/inputs.${backend}.json`. - -| Type | Name | Description | Notes | -| :- | :- | :- | :- | -| File | slivar_js | Additional javascript functions for slivar | | -| File | hpo_terms | [HPO](https://hpo.jax.org/app/) annotation lookups | | -| File | hpo_dag | HPO annotation lookups | | -| File | hpo_annotations | HPO annotation lookups | | -| File | ensembl_to_hgnc | Ensembl to HGNC gene mapping | | -| File | lof_lookup | Loss-of-function scores per gene | | -| File | clinvar_lookup | ClinVar annotations per gene | | - -## Other inputs - -| Type | Name | Description | Notes | -| :- | :- | :- | :- | -| String? 
| deepvariant_version | Version of deepvariant to use \["1.5.0"\] | | -| [DeepVariantModel](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)? | deepvariant_model | Optional alternate DeepVariant model file to use | | -| Int? | pbsv_call_mem_gb | Optionally set RAM (GB) for pbsv_call during cohort analysis | | -| Int? | glnexus_mem_gb | Optionally set RAM (GB) for GLnexus during cohort analysis | | -| Boolean? | run_tertiary_analysis | Run the optional tertiary analysis steps \[`false`\] | \[`true`, `false`\] | -| String | backend | Backend where the workflow will be executed | \["Azure", "AWS", "GCP", "HPC"\] | -| String? | zones | Zones where compute will take place; required if backend is set to 'AWS' or 'GCP'. |
  • [Determining available zones in AWS](backends/aws/README.md#determining-available-zones)
  • [Determining available zones in GCP](backends/gcp/README.md#determining-available-zones)
| -| String? | aws_spot_queue_arn | Queue ARN for the spot batch queue; required if backend is set to 'AWS' and `preemptible` is set to `true` | [Determining the AWS queue ARN](backends/aws/README.md#determining-the-aws-batch-queue-arn) | -| String? | aws_on_demand_queue_arn | Queue ARN for the on demand batch queue; required if backend is set to 'AWS' and `preemptible` is set to `false` | [Determining the AWS queue ARN](backends/aws/README.md#determining-the-aws-batch-queue-arn) | -| String? | container_registry | Container registry where workflow images are hosted. If left blank, [PacBio's public Quay.io registry](https://quay.io/organization/pacbio) will be used. | | -| Boolean | preemptible | If set to `true`, run tasks preemptibly where possible. On-demand VMs will be used only for tasks that run for >24 hours if the backend is set to GCP. If set to `false`, on-demand VMs will be used for every task. Ignored if backend is set to HPC. | \[`true`, `false`\] | - -# Workflow outputs - -## Sample analysis - -These files will be output for each sample defined in the cohort. - -| Type | Name | Description | Notes | -| :- | :- | :- | :- | -| Array[Array[File]] | bam_stats | TSV of length and quality for each read (per input BAM) | | -| Array[Array[File]] | read_length_summary | For each input BAM, read length distribution (per input BAM) | | -| Array[Array[File]] | read_quality_summary | For each input BAM, read quality distribution (per input BAM) | | -| Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)] | small_variant_gvcfs | Small variants (SNPs and INDELs < 50bp) gVCFs called by [DeepVariant](https://github.com/google/deepvariant) (with index) | | -| Array[File] | small_variant_vcf_stats | [`bcftools stats`](https://samtools.github.io/bcftools/bcftools.html#stats) summary statistics for small variants | | -| Array[File] | small_variant_roh_out | Output of [`bcftools roh`](https://samtools.github.io/bcftools/howtos/roh-calling.html) using `--AF-dflt 0.4` | | -| Array[File] | small_variant_roh_bed | Regions of homozygosity determiend by [`bcftools roh`](https://samtools.github.io/bcftools/howtos/roh-calling.html) using `--AF-dflt 0.4` | | -| Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)] | sample_phased_small_variant_vcfs | Small variants called by DeepVariant and phased by [HiPhase](https://github.com/PacificBiosciences/HiPhase) (with index) | | -| Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)] | sample_phased_sv_vcfs | Structural variants called by [pbsv](https://github.com/PacificBiosciences/pbsv) and phased by HiPhase (with index) | | -| Array[File] | sample_hiphase_stats | Phase block summary statistics written by [HiPhase](https://github.com/PacificBiosciences/HiPhase/blob/main/docs/user_guide.md#chromosome-summary-file---summary-file) | | -| Array[File] | sample_hiphase_blocks | Phase block list written by [HiPhase](https://github.com/PacificBiosciences/HiPhase/blob/main/docs/user_guide.md#phase-block-file---blocks-file) | | -| Array[File] | sample_hiphase_haplotags | Per-read haplotag information, written by [HiPhase](https://github.com/PacificBiosciences/HiPhase/blob/main/docs/user_guide.md#haplotag-file---haplotag-file) | | -| Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)] | merged_haplotagged_bam | Aligned (by [pbmm2](https://github.com/PacificBiosciences/pbmm2)), haplotagged (by 
[HiPhase](https://github.com/PacificBiosciences/HiPhase/blob/main/docs/user_guide.md#haplotagged-bam-files)) reads (with index) | | -| Array[File] | haplotagged_bam_mosdepth_summary | [mosdepth](https://github.com/brentp/mosdepth) summary of median depths per chromosome | | -| Array[File] | haplotagged_bam_mosdepth_region_bed | mosdepthhttps://github.com/brentp/mosdepth BED of median coverage depth per 500 bp window | | -| Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)] | trgt_repeat_vcf | Tandem repeat genotypes from [TRGT](https://github.com/PacificBiosciences/trgt/blob/main/docs/vcf_files.md) (with index) | | -| Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)] | trgt_spanning_reads | Fragments of HiFi reads spanning loci genotyped by TRGT (with index) | | -| Array[File] | trgt_dropouts | Regions with insufficient coverage to genotype by TRGT | | -| Array[Array[File]] | cpg_pileup_beds | 5mCpG site methylation probability pileups generated by [pb-CpG-tools](https://github.com/PacificBiosciences/pb-CpG-tools#output-files) | | -| Array[Array[File]] | cpg_pileup_bigwigs | 5mCpG site methylation probability pileups generated by pb-CpG-tools | | -| Array[File] | paraphase_output | Output generated by [Paraphase](https://github.com/PacificBiosciences/paraphase) | | -| Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)] | paraphase_realigned_bam | Realigned BAM for selected medically relevant genes in segmental duplications (with index), generated by Paraphase | | -| Array[Array[File]] | paraphase_vcfs | Phased Variant calls for selected medically relevant genes in segmental duplications, generated by Paraphase | | -| Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)] | hificnv_vcfs | VCF output containing copy number variant calls for the sample from [HiFiCNV](https://github.com/PacificBiosciences/HiFiCNV) | | -| Array[File] | hificnv_copynum_bedgraphs | Copy number values calculated for each region | | -| Array[File] | hificnv_depth_bws | Bigwig file containing the depth measurements from HiFiCNV | | -| Array[File] | hificnv_maf_bws | Bigwig file containing the minor allele frequency measurements from DeepVariant, generated by HiFiCNV | | - -## Cohort analysis - -These files will be output if the cohort includes more than one sample. - -| Type | Name | Description | Notes | -| :- | :- | :- | :- | -| [IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)? | cohort_small_variant_vcf | Small variants called by [DeepVariant](https://github.com/google/deepvariant), joint-called by [GLnexus](https://github.com/dnanexus-rnd/GLnexus), and phased by [HiPhase](https://github.com/PacificBiosciences/HiPhase) (with index) | | -| [IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)? | cohort_sv_vcf | Structural variants joint-called by [pbsv](https://github.com/PacificBiosciences/pbsv) and phased by HiPhase (with index) | | -| File? | cohort_hiphase_stats | Phase block summary statistics written by [HiPhase](https://github.com/PacificBiosciences/HiPhase/blob/main/docs/user_guide.md#chromosome-summary-file---summary-file) | | -| File? 
| cohort_hiphase_blocks | Phase block list written by [HiPhase](https://github.com/PacificBiosciences/HiPhase/blob/main/docs/user_guide.md#phase-block-file---blocks-file) | | - -## Tertiary analysis - -These files will be output for each run of the workflow if `run_tertiary_analysis` is set to `true`. The files that are being annotated will depend on whether the number of samples is equal to or greater than one: -- If the number of samples is equal to one, the files being annotated in this step are the sample small variant VCF and SV VCF. -- If the number of samples is greater than one, the files being annotated in this step are the phased, joint-called small variant VCF and the cohort SV VCF. - -| Type | Name | Description | Notes | -| :- | :- | :- | :- | -| [IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)? | filtered_small_variant_vcf | Small variant calls that are filtered based on population frequency and annotated with cohort information, population frequency, gene, functional impact, etc., using [slivar](https://github.com/brentp/slivar) and [`bcftools csq`](https://samtools.github.io/bcftools/howtos/csq-calling.html) | | -| [IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)? | compound_het_small_variant_vcf | Compound heterozygotes annotated with cohort information, population frequency, gene, functional impact, etc., using slivar and `bcftools csq` | | -| File? | filtered_small_variant_tsv | Filtered VCFs are reformatted as a human-readable TSV by [`slivar tsv`](https://github.com/brentp/slivar/wiki/tsv:-creating-a-spreadsheet-from-a-filtered-VCF) | | -| File? | compound_het_small_variant_tsv | Filtered VCFs are reformatted as a human-readable TSV by `slivar tsv` | | -| [IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)? | filtered_svpack_vcf | Structural variant calls that are filtered based on population frequency and annotated with cohort information, population frequency, gene, functional impact, etc., using [svpack](https://github.com/PacificBiosciences/svpack) | | -| File? | filtered_svpack_tsv | Filtered VCFs are reformatted as a human-readable TSV by `slivar tsv` | | +## Workflow inputs + +This section describes the inputs required for a run of the workflow. Typically, only the `humanwgs.cohort` and potentially run/backend-specific sections will be filled out by the user for each run of the workflow. Input templates with reference file locations filled out are provided [for each backend](../../blob/main/backends). + +Workflow inputs for each entrypoint are described in [singleton](../singleton) and [family](../family) documentation. + +At a high level, we have two types of inputs files: + +- _maps_ are TSV files describing inputs that will be used for every execution of the workflow, like reference genome FASTA files and genome interval BED files. +- _inputs.json_ files are JSON files that describe the samples to be analyzed and the paths to the input files for each sample. + +The resource bundle containing the GRCh38 reference and other files used in this workflow can be downloaded from Zenodo: + +![https://doi.org/10.5281/zenodo.14027047](https://zenodo.org/badge/DOI/10.5281/zenodo.14027047.svg) # Tool versions and Docker images -Docker images definitions used by this workflow can be found in [the wdl-dockerfiles repository](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/987efde4d614a292fbfe9f3cf146b63005ad6a8a). 
Images are hosted in PacBio's [quay.io](https://quay.io/organization/pacbio). Docker images used in the workflow are pegged to specific versions by referring to their digests rather than tags. +Docker images definitions used by this workflow can be found in [the wdl-dockerfiles repository](https://github.com/PacificBiosciences/wdl-dockerfiles/). Images are hosted in PacBio's [quay.io](https://quay.io/organization/pacbio). Docker images used in the workflow are pegged to specific versions by referring to their digests rather than tags. The Docker image used by a particular step of the workflow can be identified by looking at the `docker` key in the `runtime` block for the given task. Images can be referenced in the following table by looking for the name after the final `/` character and before the `@sha256:...`. For example, the image referred to here is "align_hifiasm": > ~{runtime_attributes.container_registry}/**align_hifiasm**@sha256:3968cb<...>b01f80fe -| Image | Major tool versions | Links | -| :- | :- | :- | -| bcftools |
  • [bcftools 1.14](https://github.com/samtools/bcftools/releases/tag/1.14)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/bcftools) | -| deepvariant | User-defined; default is version [1.5.0](https://github.com/google/deepvariant/releases/tag/v1.5.0) | [DeepVariant GitHub](https://github.com/google/deepvariant) | -| glnexus |
  • [glnexus v1.4.3](https://github.com/dnanexus-rnd/GLnexus/releases/tag/v1.4.3)
| [GLnexus GitHub](https://github.com/dnanexus-rnd/GLnexus) | -| hificnv |
  • [HiFiCNV v0.1.7](https://github.com/PacificBiosciences/HiFiCNV/releases/tag/v0.1.7)
  • [bcftools 1.16](https://github.com/samtools/bcftools/releases/tag/1.16)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/0b0fbe939648087e9fdea4497ae08dc76538ebf0/docker/hificnv) | -| hiphase |
  • [HiPhase 1.0.0](https://github.com/PacificBiosciences/HiPhase/releases/tag/v1.0.0)
  • [samtools 1.18](https://github.com/samtools/samtools/releases/tag/1.18)
  • [bcftools 1.18](https://github.com/samtools/bcftools/releases/tag/1.18)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/d26db6204409dfeff56e169cdba0cc14bc272f15/docker/hiphase) | -| htslib |
  • [htslib 1.14](https://github.com/samtools/htslib/releases/tag/1.14)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/htslib) | -| mosdepth |
  • [mosdepth 0.2.9](https://github.com/brentp/mosdepth/releases/tag/v0.2.9)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/mosdepth) | -| paraphase |
  • [minimap2 2.17](https://github.com/lh3/minimap2/releases/tag/v2.17)
  • [samtools 1.14](https://github.com/samtools/samtools/releases/tag/1.14)
  • [paraphase 2.2.3](https://github.com/PacificBiosciences/paraphase/releases/tag/v2.2.3)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/paraphase) | -| pb-cpg-tools |
  • [pb-CpG-tools v2.3.2](https://github.com/PacificBiosciences/pb-CpG-tools/releases/tag/v2.3.2)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/7481837d3b0f539adf4f64209a65cf28eebf3dba/docker/pb-cpg-tools) | -| pbmm2 |
  • [pbmm2 1.10.0](https://github.com/PacificBiosciences/pbmm2/releases/tag/v1.10.0)
  • [datamash 1.1.0](https://ftp.gnu.org/gnu/datamash/)
  • [pysam 0.16.0.1](https://github.com/pysam-developers/pysam/releases/tag/v0.16.0.1)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/pbmm2) | -| pbsv |
  • [pbsv 2.9.0](https://github.com/PacificBiosciences/pbsv/releases/tag/v2.9.0)
  • [htslib 1.14](https://github.com/samtools/htslib/releases/tag/1.14)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/f9e33a757e6d8cb15696ac930a2efd0fd7a885d8/docker/pbsv) | -| wgs_tertiary image | | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/6b13cc246dd44e41903d17a660bb5432cdd18dbe/docker/wgs_tertiary) | -| samtools |
  • [samtools 1.14](https://github.com/samtools/samtools/releases/tag/1.14)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/samtools) | -| slivar |
  • [slivar 0.2.2](https://github.com/brentp/slivar/releases/tag/v0.2.2)
  • [bcftools 1.14](https://github.com/samtools/bcftools/releases/tag/1.14)
  • [vcfpy 0.13.3](https://github.com/bihealth/vcfpy/releases/tag/v0.13.3)
  • [pysam 0.19.1](https://github.com/pysam-developers/pysam/releases/tag/v0.19.1)
| [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/3560fcc5a84e044067cea9c9a7669cfc2659178e/docker/slivar) | -| svpack |
  • [svpack 9dddda6](https://github.com/PacificBiosciences/svpack/tree/9dddda61e7c8e81124d35478600ad13e4db8d612)
  • [htslib 1.20](https://github.com/samtools/htslib/releases/tag/1.20)
  • [pysam 0.22.1](https://github.com/pysam-developers/pysam/releases/tag/v0.22.1)
  • | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/6b13cc246dd44e41903d17a660bb5432cdd18dbe/docker/svpack) | -| trgt |
    • [trgt 0.5.0](https://github.com/PacificBiosciences/trgt/releases/tag/v0.5.0)
    • [samtools 1.18](https://github.com/samtools/samtools/releases/tag/1.18)
    • [bcftools 1.18](https://github.com/samtools/bcftools/releases/tag/1.18)
    • [pysam 0.21.0](https://github.com/pysam-developers/pysam/releases/tag/v0.21.0)
    | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/d2a45e0213ac3fa631a51a48757c442d3ed550b6/docker/trgt) | +Tool versions and Docker images used in these workflows can be found in the [tools_containers](../tools_containers) documentation. --- diff --git a/backends/aws-healthomics/GRCh38.ref_map.v2p0p0.aws.tsv b/backends/aws-healthomics/GRCh38.ref_map.v2p0p0.aws.tsv new file mode 100644 index 00000000..832473dd --- /dev/null +++ b/backends/aws-healthomics/GRCh38.ref_map.v2p0p0.aws.tsv @@ -0,0 +1,12 @@ +name GRCh38 +fasta s3://pacbio-hifi-human-wgs-reference/dataset/hifi-wdl-resources-v2.0.0/GRCh38/human_GRCh38_no_alt_analysis_set.fasta +fasta_index s3://pacbio-hifi-human-wgs-reference/dataset/hifi-wdl-resources-v2.0.0/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai +pbsv_splits s3://pacbio-hifi-human-wgs-reference/dataset/hifi-wdl-resources-v2.0.0/GRCh38/human_GRCh38_no_alt_analysis_set.pbsv_splits.json +pbsv_tandem_repeat_bed s3://pacbio-hifi-human-wgs-reference/dataset/hifi-wdl-resources-v2.0.0/GRCh38/human_GRCh38_no_alt_analysis_set.trf.bed +trgt_tandem_repeat_bed s3://pacbio-hifi-human-wgs-reference/dataset/hifi-wdl-resources-v2.0.0/GRCh38/trgt/human_GRCh38_no_alt_analysis_set.trgt.v0.3.4.bed +hificnv_exclude_bed s3://pacbio-hifi-human-wgs-reference/dataset/hifi-wdl-resources-v2.0.0/GRCh38/hificnv/cnv.excluded_regions.common_50.hg38.bed.gz +hificnv_exclude_bed_index s3://pacbio-hifi-human-wgs-reference/dataset/hifi-wdl-resources-v2.0.0/GRCh38/hificnv/cnv.excluded_regions.common_50.hg38.bed.gz.tbi +hificnv_expected_bed_male s3://pacbio-hifi-human-wgs-reference/dataset/hifi-wdl-resources-v2.0.0/GRCh38/hificnv/expected_cn.hg38.XY.bed +hificnv_expected_bed_female s3://pacbio-hifi-human-wgs-reference/dataset/hifi-wdl-resources-v2.0.0/GRCh38/hificnv/expected_cn.hg38.XX.bed +pharmcat_positions_vcf s3://pacbio-hifi-human-wgs-reference/dataset/hifi-wdl-resources-v2.0.0/GRCh38/pharmcat/pharmcat_positions_2.15.4.vcf.bgz +pharmcat_positions_vcf_index s3://pacbio-hifi-human-wgs-reference/dataset/hifi-wdl-resources-v2.0.0/GRCh38/pharmcat/pharmcat_positions_2.15.4.vcf.bgz.csi diff --git a/backends/aws-healthomics/GRCh38.tertiary_map.v2p0p0.aws.tsv b/backends/aws-healthomics/GRCh38.tertiary_map.v2p0p0.aws.tsv new file mode 100644 index 00000000..d17ae629 --- /dev/null +++ b/backends/aws-healthomics/GRCh38.tertiary_map.v2p0p0.aws.tsv @@ -0,0 +1,12 @@ +slivar_js s3://pacbio-hifi-human-wgs-reference/dataset/hifi-wdl-resources-v2.0.0/slivar/slivar-functions.v0.2.8.js +ensembl_gff s3://pacbio-hifi-human-wgs-reference/dataset/hifi-wdl-resources-v2.0.0/GRCh38/ensembl.GRCh38.101.reformatted.gff3.gz +lof_lookup s3://pacbio-hifi-human-wgs-reference/dataset/hifi-wdl-resources-v2.0.0/slivar/lof_lookup.v2.1.1.txt +clinvar_lookup s3://pacbio-hifi-human-wgs-reference/dataset/hifi-wdl-resources-v2.0.0/slivar/clinvar_gene_desc.20240624T165443.txt +slivar_gnotate_files s3://pacbio-hifi-human-wgs-reference/dataset/hifi-wdl-resources-v2.0.0/GRCh38/slivar_gnotate/gnomad.hg38.v4.1.custom.v1.zip,s3://pacbio-hifi-human-wgs-reference/dataset/hifi-wdl-resources-v2.0.0/GRCh38/slivar_gnotate/CoLoRSdb.GRCh38.v1.1.0.deepvariant.glnexus.zip +slivar_gnotate_prefixes gnomad,colors +slivar_max_af 0.03 +slivar_max_nhomalt 4 +slivar_max_ac 4 +slivar_min_gq 5 +svpack_pop_vcfs 
s3://pacbio-hifi-human-wgs-reference/dataset/hifi-wdl-resources-v2.0.0/GRCh38/sv_pop_vcfs/gnomad.v4.1.sv.sites.pass.vcf.gz,s3://pacbio-hifi-human-wgs-reference/dataset/hifi-wdl-resources-v2.0.0/GRCh38/sv_pop_vcfs/CoLoRSdb.GRCh38.v1.1.0.pbsv.jasmine.vcf.gz +svpack_pop_vcf_indices s3://pacbio-hifi-human-wgs-reference/dataset/hifi-wdl-resources-v2.0.0/GRCh38/sv_pop_vcfs/gnomad.v4.1.sv.sites.pass.vcf.gz.tbi,s3://pacbio-hifi-human-wgs-reference/dataset/hifi-wdl-resources-v2.0.0/GRCh38/sv_pop_vcfs/CoLoRSdb.GRCh38.v1.1.0.pbsv.jasmine.vcf.gz.tbi diff --git a/backends/aws-healthomics/aws_ecr_push_tag_images.sh b/backends/aws-healthomics/aws_ecr_push_tag_images.sh new file mode 100644 index 00000000..939508a5 --- /dev/null +++ b/backends/aws-healthomics/aws_ecr_push_tag_images.sh @@ -0,0 +1,118 @@ +#!/bin/bash + +# Author: Heather Ward +# License: GPLv2 +# +# This script is licensed under the GNU General Public License v2.0. +# See https://www.gnu.org/licenses/old-licenses/gpl-2.0.html for details. + +set -euo pipefail + +usage() { +cat << EOF + + Usage: $0 -d docker_manifest -r aws_region -a aws_account [OPTIONS] + Ensure your AWS_PROFILE is set to the correct AWS account (the one you're pushing images to) + + OPTIONS + -h Display this message and exit + -d Docker manifest; images to pull, tag, and push to the specified ECR, one per line + -r AWS region to host images in + -a AWS account where images are hosted + +EOF +} + +log() { + MESSAGE=$1 + TYPE=${2:-info} + + if [[ "$TYPE" == "err" ]]; then + >&2 echo "${MESSAGE}" + elif [[ "$TYPE" == "info" ]]; then + echo "${MESSAGE}" + fi +} + +push_aws() { + TARGET_IMAGE=$1 + POLICY_JSON=$2 + AWS_REGION=$3 + + repository_name=$(echo "${TARGET_IMAGE}" | cut -d / -f 2- | cut -d ":" -f 1 | cut -d "@" -f 1) + # Create the repositroy if it does not exist + if ! aws ecr describe-repositories --repository-names "${repository_name}" --region "${AWS_REGION}" 2> /dev/null; then + log "Creating repository ${repository_name}" + aws ecr create-repository \ + --repository-name "${repository_name}" \ + --region "${AWS_REGION}" \ + 2> /dev/null + fi + log "Setting ECR cross-account policy on ${repository_name}" + aws ecr set-repository-policy \ + --repository-name "${repository_name}" \ + --policy-text "${POLICY_JSON}" \ + --region "${AWS_REGION}" \ + > /dev/null + + log "Pushing image ${TARGET_IMAGE}" + docker push "${TARGET_IMAGE}" +} + +while getopts "hd:r:a:" OPTION; do + case "${OPTION}" in + h) usage; exit;; + d) DOCKER_MANIFEST="${OPTARG}";; + r) AWS_REGION="${OPTARG}";; + a) AWS_ACCOUNT="${OPTARG}";; + \?) 
usage; exit;; + esac +done + +DOCKER_MANIFEST=${DOCKER_MANIFEST:-} +AWS_REGION=${AWS_REGION:-} +AWS_ACCOUNT=${AWS_ACCOUNT:-} +POLICY_JSON='{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "OmicsAccessPrincipal", + "Effect": "Allow", + "Principal": { + "Service": "omics.amazonaws.com" + }, + "Action": [ + "ecr:BatchCheckLayerAvailability", + "ecr:BatchGetImage", + "ecr:GetDownloadUrlForLayer" + ] + } + ] +}' + +if [[ -z "${DOCKER_MANIFEST}" ]]; then + usage + log "Must set -d docker_manifest" err + exit 1 +fi + +if [[ -z "${AWS_REGION}" ]]; then + usage + log "Must set -r aws_region" err + exit 1 +fi + +# Pull, rename, push, and apply policy to all images in the manifest +AWS_CONTAINER_REGISTRY="${AWS_ACCOUNT}.dkr.ecr.${AWS_REGION}.amazonaws.com" +aws ecr get-login-password --region "${AWS_REGION}" | docker login --username AWS --password-stdin "${AWS_CONTAINER_REGISTRY}" + +while read -r image || [[ -n "${image}" ]]; do + log "Pulling image ${image}" + docker pull "${image}" + # If the image is a @sha256 rather than a tag, tag the image with the sha hash (removing @sha256) + target_image=${AWS_CONTAINER_REGISTRY}/$(echo "${image}" | awk -F "/" '{print $NF}' | sed "s~@sha256~~") + log "${target_image}" + log "Tagging image ${image} as ${target_image}" + docker tag "${image}" "${target_image}" + push_aws "${target_image}" "${POLICY_JSON}" "${AWS_REGION}" +done < "${DOCKER_MANIFEST}" diff --git a/backends/aws-healthomics/family.healthomics.inputs.json b/backends/aws-healthomics/family.healthomics.inputs.json new file mode 100644 index 00000000..0a3bff61 --- /dev/null +++ b/backends/aws-healthomics/family.healthomics.inputs.json @@ -0,0 +1,23 @@ +{ + "humanwgs_family.family": { + "family_id": "String", + "samples": [ + { + "sample_id": "String", + "hifi_reads": [ + "File" + ], + "affected": "Boolean", + "sex": "String? (optional, ['MALE', 'FEMALE', null])", + "father_id": "String? (optional)", + "mother_id": "String? (optional)" + } + ] + }, + "humanwgs_family.phenotypes": "String? (optional)", + "humanwgs_family.ref_map_file": "s3:///GRCh38.ref_map.v2p0p0-rc6.hpc.tsv", + "humanwgs_family.tertiary_map_file": "s3:///GRCh38.tertiary_map.v2p0p0-rc6.hpc.tsv", + "humanwgs_family.backend": "AWS-HealthOmics", + "humanwgs_family.container_registry": "String", + "humanwgs_family.preemptible": true +} \ No newline at end of file diff --git a/backends/aws-healthomics/singleton.healthomics.inputs.json b/backends/aws-healthomics/singleton.healthomics.inputs.json new file mode 100644 index 00000000..83b6d3b4 --- /dev/null +++ b/backends/aws-healthomics/singleton.healthomics.inputs.json @@ -0,0 +1,13 @@ +{ + "humanwgs_singleton.sample_id": "String", + "humanwgs_singleton.sex": "String? (optional, ['MALE', 'FEMALE', null])", + "humanwgs_singleton.hifi_reads": [ + "File" + ], + "humanwgs_singleton.phenotypes": "String? 
(optional)", + "humanwgs_singleton.ref_map_file": "s3:///GRCh38.ref_map.v2p0p0-rc6.hpc.tsv", + "humanwgs_singleton.tertiary_map_file": "s3:///GRCh38.tertiary_map.v2p0p0-rc6.hpc.tsv", + "humanwgs_singleton.backend": "AWS-HealthOmics", + "humanwgs_singleton.container_registry": "String", + "humanwgs_singleton.preemptible": true +} \ No newline at end of file diff --git a/backends/aws/.gitignore b/backends/aws/.gitignore deleted file mode 100644 index 10ef663f..00000000 --- a/backends/aws/.gitignore +++ /dev/null @@ -1 +0,0 @@ -agc-project.yaml diff --git a/backends/aws/README.md b/backends/aws/README.md deleted file mode 100644 index 9dc11104..00000000 --- a/backends/aws/README.md +++ /dev/null @@ -1,123 +0,0 @@ -# Configuring the Amazon Genomics CLI - -The Amazon Genomics CLI (`agc`) allows users to orchestrate workflow execution using AWS Batch. See the [Workbench documentation](https://docs.dnastack.com/docs/cromwell-on-aws-amazon-genomics-cli) for information on installing and using the `agc` to configure and run workflows. The following section provides additional information on deploying a project using the `agc`. - -## Deploying a context with `agc` - -Once you have installed and authenticated with the `agc`, you can deploy a context using an agc project YAML file. This file must be named `agc-project.yaml`. - -An [example agc-project.yaml file](agc-project.template.yaml) that has the workflow, reference data source, and both on-demand and spot contexts configured using Cromwell as the engine is provided here. This will create an agc project named `humanwgsAGC`, with either (or both) a `spotContext` or an `onDemandContext`. The `spotContext` will allow you to run worklfows using [AWS spot instances](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-spot-instances.html), which can result in substantial cost savings relative to using [on-demand instances](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-on-demand-instances.html). - -Note that deploying a context **will incur costs** even if you are not actively running workflows; ensure that [contexts that are not in use are destroyed](https://aws.github.io/amazon-genomics-cli/docs/reference/agc_context_destroy/) to avoid incurring ongoing costs. - -To deploy the agc project using the template file, first copy the template file to a file named `agc-project.yaml` (`cp agc-project.template.yaml agc-project.yaml`). - -In the `data` section of the `agc-project.yaml` file, add any additional s3 buckets that the workflow will require access to, for example the bucket containing sample input data. Make sure that you do not remove the section granting access to the s3://dnastack-resources bucket; this is where [reference datasets are hosted](#reference-data-hosted-in-aws). - -``` -data: - - location: s3://dnastack-resources - readOnly: true - - location: s3:// - readOnly: true -``` - -Then from the directory containing the `agc-project.yaml` file, run: - -```bash -agc context deploy --context ${context} -``` - -Where `${context}` is either `spotContext` or `onDemandContext`. - -If you want both spot and on-demand contexts, all contexts can be deployed at once by running: - -``` -agc context deploy --all -``` - -Note that the `miniwdl` engine run via AWS is currently not supported for this workflow. - -# Checking and requesting quota in AWS - -See [resources requirements](../../README.md#resource-requirements) for information on the minimum requirements for running the workflow. 
Typically in a new AWS environment, additional vCPU quota will be required. - -## Checking current quota - -1. Navigate to [the AWS console](https://console.aws.amazon.com/). -2. In the top right corner, select the region where your `agc` deployment is located. -3. Navigate to EC2. -4. In the menu on the left, select 'Limits'. -5. Filter the limits by searching for "Standard". The current limit field indicates the number of vCPUs that you currently have access to. -- Spot instance limit: `All Standard (A, C, D, H, I, M, R, T, Z) Spot Instance Requests` -- On-demand instance limit: `Running On-Demand All Standard (A, C, D, H, I, M, R, T, Z) instances` - -If the number of vCPUs in the context you plan to run the workflow in is less than the limites specified in [the resources requirements](../../README.md#resource-requirements) section, you will need to request additional quota before you can run the workflow. - -## Requesting additional quota - -5. Continuing from the steps outlined in [checking the current quota](#checking-current-quota), select the service you want to request an increase for. -6. In the top right corner, select 'Request limit increase'. -7. Fill out the appropriate fields in the request form, ensuring that the region you select is the region where you have deployed your `agc` and where your data is located. 256 vCPUs are recommended for running trio data. - -Low quota increase requests are typically fulfilled within a 1-2 hours. - -# Configuring and running the workflow - -## Filling out workflow inputs - -Fill out any information missing in [the inputs file](inputs.aws.json). Ensure that all data files used by the workflow are at locations that have been configured in the agc-project.yaml file; see the [granting access to other data files](#granting-access-to-other-data-files) for more information. - -See [the inputs section of the main README](../../README.md#workflow-inputs) for more information on the structure of the inputs.json file. - -Note that you only need to fill out the queueArn corresponding to the context you are submitting the workflow to (spot or on-demand). - -### Determining available zones - -To determine available zones in AWS, look for the `ZoneName` attribute output by the following command: - -```bash -aws ec2 describe-availability-zones --region -``` - -For example, the zones in region us-east-2 are `"us-east-2a us-east-2b us-east-2c"`. - -### Determining the AWS batch queue ARN - -**Note that if you are using a `miniwdl` engine, you can skip these steps; workflows run via miniwdl will run exclusively in the job queue to which they are submitted.** - -1. Visit [the AWS console](https://console.aws.amazon.com/). -2. Navigate to the Batch service. -3. In the lefthand sidebar, select "Compute environments". Note the name of the compute environment with the provisioning model SPOT (if you have deployed a context using spot instances) and the name of the compute environment with provisioning model "EC2" (if you have deployed a context that does not use spot instances). -4. In the lefthand sidebar, select "Job queues". -5. Clicking into an individual queue will show information about the compute environment ("Compute environment order"). Identify the job queue with the Compute environment name that matches the name you identified for the SPOT compute environment; copy the Amazon Resource Name (ARN) for this job queue. This is the value that should be used for the `aws_spot_queue_arn`. Repeat this process to find the ARN for the `aws_on_demand_queue_arn`. 
- -- If `preemptible = true`, only the `aws_spot_queue_arn` is required. -- If `preemptible = false`, only the `aws_on_demand_queue_arn` is required. - -## Running the workflow - -### Running via `agc` - -From the directory where your `agc-project.yaml` is located, run: - -`agc workflow run humanwgs --context --inputsFile ` - -The running workflow can be monitored via [`agc workflow` commands](https://aws.github.io/amazon-genomics-cli/docs/reference/agc_workflow/), or via the AWS console. - -# Reference data hosted in AWS - -AWS reference data is hosted in the `us-west-2` region in the bucket `s3://dnastack-resources`. - -To use AWS reference data, add the following line to the data section of your [`agc-project.yaml`](https://aws.github.io/amazon-genomics-cli/docs/concepts/projects/): - -```yaml -data: - - location: s3://dnastack-resources - readOnly: true -``` -The [AWS input file template](inputs.aws.json) has paths to the reference files in s3 prefilled. The template [agc-project.template.yaml file](agc-project.template.yaml) has this section filled out already. - -### Granting access to other data files - -S3 buckets outside of the reference files can be accessed by adding additional data blocks to the agc-project.yaml file. See the [agc documentation](https://aws.github.io/amazon-genomics-cli/docs/concepts/data/) for more details on adding additional data sources. All inputs referenced in the inputs.json file will need to be at locations that have been configured in the agc-project.yaml. diff --git a/backends/aws/agc-project.template.yaml b/backends/aws/agc-project.template.yaml deleted file mode 100644 index 46ebf5fb..00000000 --- a/backends/aws/agc-project.template.yaml +++ /dev/null @@ -1,167 +0,0 @@ -name: humanwgsAgc -schemaVersion: 1 -data: - - location: s3://dnastack-resources - readOnly: true -workflows: - humanwgs: - type: - language: wdl - version: 1.0 - sourceURL: ../../workflows -contexts: - onDemandContext: - instanceTypes: [ - "c5.large", - "c5.xlarge", - "c5.2xlarge", - "c5.4xlarge", - "c5.9xlarge", - "c5.12xlarge", - "c5.18xlarge", - "c5.24xlarge", - "c5.metal", - "c5a.large", - "c5a.xlarge", - "c5a.2xlarge", - "c5a.4xlarge", - "c5a.8xlarge", - "c5a.12xlarge", - "c5a.16xlarge", - "c5a.24xlarge", - "c5n.large", - "c5n.xlarge", - "c5n.2xlarge", - "c5n.4xlarge", - "c5n.9xlarge", - "c5n.18xlarge", - "m5.large", - "m5.xlarge", - "m5.2xlarge", - "m5.4xlarge", - "m5.8xlarge", - "m5.12xlarge", - "m5.16xlarge", - "m5.24xlarge", - "m5a.large", - "m5a.xlarge", - "m5a.2xlarge", - "m5a.4xlarge", - "m5a.8xlarge", - "m5a.12xlarge", - "m5a.16xlarge", - "m5a.24xlarge", - "m5n.large", - "m5n.xlarge", - "m5n.2xlarge", - "m5n.4xlarge", - "m5n.8xlarge", - "m5n.12xlarge", - "m5n.16xlarge", - "m5n.24xlarge", - "r5.large", - "r5.xlarge", - "r5.2xlarge", - "r5.4xlarge", - "r5.8xlarge", - "r5.12xlarge", - "r5.16xlarge", - "r5.24xlarge", - "r5a.large", - "r5a.xlarge", - "r5a.2xlarge", - "r5a.4xlarge", - "r5a.8xlarge", - "r5a.12xlarge", - "r5a.16xlarge", - "r5a.24xlarge", - "r5n.large", - "r5n.xlarge", - "r5n.2xlarge", - "r5n.4xlarge", - "r5n.8xlarge", - "r5n.12xlarge", - "r5n.16xlarge", - "r5n.24xlarge", - ] - engines: - - type: wdl - engine: cromwell - spotContext: - requestSpotInstances: true - instanceTypes: [ - "c5.large", - "c5.xlarge", - "c5.2xlarge", - "c5.4xlarge", - "c5.9xlarge", - "c5.12xlarge", - "c5.18xlarge", - "c5.24xlarge", - "c5.metal", - "c5a.large", - "c5a.xlarge", - "c5a.2xlarge", - "c5a.4xlarge", - "c5a.8xlarge", - "c5a.12xlarge", - "c5a.16xlarge", - "c5a.24xlarge", - 
"c5n.large", - "c5n.xlarge", - "c5n.2xlarge", - "c5n.4xlarge", - "c5n.9xlarge", - "c5n.18xlarge", - "m5.large", - "m5.xlarge", - "m5.2xlarge", - "m5.4xlarge", - "m5.8xlarge", - "m5.12xlarge", - "m5.16xlarge", - "m5.24xlarge", - "m5a.large", - "m5a.xlarge", - "m5a.2xlarge", - "m5a.4xlarge", - "m5a.8xlarge", - "m5a.12xlarge", - "m5a.16xlarge", - "m5a.24xlarge", - "m5n.large", - "m5n.xlarge", - "m5n.2xlarge", - "m5n.4xlarge", - "m5n.8xlarge", - "m5n.12xlarge", - "m5n.16xlarge", - "m5n.24xlarge", - "r5.large", - "r5.xlarge", - "r5.2xlarge", - "r5.4xlarge", - "r5.8xlarge", - "r5.12xlarge", - "r5.16xlarge", - "r5.24xlarge", - "r5a.large", - "r5a.xlarge", - "r5a.2xlarge", - "r5a.4xlarge", - "r5a.8xlarge", - "r5a.12xlarge", - "r5a.16xlarge", - "r5a.24xlarge", - "r5n.large", - "r5n.xlarge", - "r5n.2xlarge", - "r5n.4xlarge", - "r5n.8xlarge", - "r5n.12xlarge", - "r5n.16xlarge", - "r5n.24xlarge", - ] - engines: - - type: wdl - engine: cromwell diff --git a/backends/aws/inputs.aws.json b/backends/aws/inputs.aws.json deleted file mode 100644 index 87b4f162..00000000 --- a/backends/aws/inputs.aws.json +++ /dev/null @@ -1,76 +0,0 @@ -{ - "humanwgs.cohort": { - "cohort_id": "String", - "samples": [ - { - "sample_id": "String", - "movie_bams": [ - "File" - ], - "sex": "String?", - "affected": "Boolean", - "father_id": "String?", - "mother_id": "String?" - } - ], - "phenotypes": [ - "String" - ] - }, - "humanwgs.reference": { - "name": "GRCh38", - "fasta": { - "data": "s3://dnastack-resources/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta", - "data_index": "s3://dnastack-resources/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai" - }, - "pbsv_splits": "s3://dnastack-resources/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.pbsv_splits.json", - "tandem_repeat_bed": "s3://dnastack-resources/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.trf.bed", - "trgt_tandem_repeat_bed": "s3://dnastack-resources/dataset/GRCh38/trgt/human_GRCh38_no_alt_analysis_set.trgt.v0.3.4.bed", - "hificnv_exclude_bed": { - "data": "s3://dnastack-resources/dataset/GRCh38/hificnv/cnv.excluded_regions.common_50.hg38.bed.gz", - "data_index": "s3://dnastack-resources/dataset/GRCh38/hificnv/cnv.excluded_regions.common_50.hg38.bed.gz.tbi" - }, - "hificnv_expected_bed_male": "s3://dnastack-resources/dataset/GRCh38/hificnv/expected_cn.hg38.XY.bed", - "hificnv_expected_bed_female": "s3://dnastack-resources/dataset/GRCh38/hificnv/expected_cn.hg38.XX.bed", - "gnomad_af": "s3://dnastack-resources/dataset/GRCh38/slivar_gnotate/gnomad.hg38.v3.custom.v1.zip", - "hprc_af": "s3://dnastack-resources/dataset/GRCh38/slivar_gnotate/hprc.deepvariant.glnexus.hg38.v1.zip", - "gff": "s3://dnastack-resources/dataset/GRCh38/ensembl.GRCh38.101.reformatted.gff3.gz", - "population_vcfs": [ - { - "data": "s3://dnastack-resources/dataset/GRCh38/sv_pop_vcfs/EEE_SV-Pop_1.ALL.sites.20181204.vcf.gz", - "data_index": "s3://dnastack-resources/dataset/GRCh38/sv_pop_vcfs/EEE_SV-Pop_1.ALL.sites.20181204.vcf.gz.tbi" - }, - { - "data": "s3://dnastack-resources/dataset/GRCh38/sv_pop_vcfs/nstd166.GRCh38.variant_call.vcf.gz", - "data_index": "s3://dnastack-resources/dataset/GRCh38/sv_pop_vcfs/nstd166.GRCh38.variant_call.vcf.gz.tbi" - }, - { - "data": "s3://dnastack-resources/dataset/GRCh38/sv_pop_vcfs/hprc.GRCh38.pbsv.vcf.gz", - "data_index": "s3://dnastack-resources/dataset/GRCh38/sv_pop_vcfs/hprc.GRCh38.pbsv.vcf.gz.tbi" - }, - { - "data": "s3://dnastack-resources/dataset/GRCh38/sv_pop_vcfs/ont_sv_high_confidence_SVs.sorted.vcf.gz", - "data_index": 
"s3://dnastack-resources/dataset/GRCh38/sv_pop_vcfs/ont_sv_high_confidence_SVs.sorted.vcf.gz.tbi" - } - ] - }, - "humanwgs.slivar_data": { - "slivar_js": "s3://dnastack-resources/dataset/slivar/slivar-functions.v0.2.8.js", - "hpo_terms": "s3://dnastack-resources/dataset/hpo/hpoTerms.txt", - "hpo_dag": "s3://dnastack-resources/dataset/hpo/hpoDag.txt", - "hpo_annotations": "s3://dnastack-resources/dataset/hpo/ensembl.hpoPhenotype.tsv", - "ensembl_to_hgnc": "s3://dnastack-resources/dataset/genes/ensembl.hgncSymbol.tsv", - "lof_lookup": "s3://dnastack-resources/dataset/slivar/lof_lookup.v2.1.1.txt", - "clinvar_lookup": "s3://dnastack-resources/dataset/slivar/clinvar_gene_desc.20221214T183140.txt" - }, - "humanwgs.deepvariant_version": "String (optional)", - "humanwgs.deepvariant_model": "WomCompositeType {\n model -> WomCompositeType {\n data -> File\ndata_index -> File \n}\nmetadata -> File \n}? (optional)", - "humanwgs.pbsv_call_mem_gb": "Int (optional, default = if N<=3: 64 else 96)", - "humanwgs.glnexus_mem_gb": "Int (optional, default = 30)", - "humanwgs.run_tertiary_analysis": "Boolean (optional, default = false)", - "humanwgs.backend": "AWS", - "humanwgs.zones": "us-east-2a us-east-2b us-east-2c", - "humanwgs.aws_spot_queue_arn": "", - "humanwgs.aws_on_demand_queue_arn": "", - "humanwgs.preemptible": "Boolean" -} diff --git a/backends/azure/GRCh38.ref_map.v2p0p0.azure.tsv b/backends/azure/GRCh38.ref_map.v2p0p0.azure.tsv new file mode 100644 index 00000000..2300b1f3 --- /dev/null +++ b/backends/azure/GRCh38.ref_map.v2p0p0.azure.tsv @@ -0,0 +1,12 @@ +name GRCh38 +fasta /datasetpbrarediseases/dataset/hifi-wdl-resources-v2.0.0/GRCh38/human_GRCh38_no_alt_analysis_set.fasta +fasta_index /datasetpbrarediseases/dataset/hifi-wdl-resources-v2.0.0/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai +pbsv_splits /datasetpbrarediseases/dataset/hifi-wdl-resources-v2.0.0/GRCh38/human_GRCh38_no_alt_analysis_set.pbsv_splits.json +pbsv_tandem_repeat_bed /datasetpbrarediseases/dataset/hifi-wdl-resources-v2.0.0/GRCh38/human_GRCh38_no_alt_analysis_set.trf.bed +trgt_tandem_repeat_bed /datasetpbrarediseases/dataset/hifi-wdl-resources-v2.0.0/GRCh38/trgt/human_GRCh38_no_alt_analysis_set.trgt.v0.3.4.bed +hificnv_exclude_bed /datasetpbrarediseases/dataset/hifi-wdl-resources-v2.0.0/GRCh38/hificnv/cnv.excluded_regions.common_50.hg38.bed.gz +hificnv_exclude_bed_index /datasetpbrarediseases/dataset/hifi-wdl-resources-v2.0.0/GRCh38/hificnv/cnv.excluded_regions.common_50.hg38.bed.gz.tbi +hificnv_expected_bed_male /datasetpbrarediseases/dataset/hifi-wdl-resources-v2.0.0/GRCh38/hificnv/expected_cn.hg38.XY.bed +hificnv_expected_bed_female /datasetpbrarediseases/dataset/hifi-wdl-resources-v2.0.0/GRCh38/hificnv/expected_cn.hg38.XX.bed +pharmcat_positions_vcf /datasetpbrarediseases/dataset/hifi-wdl-resources-v2.0.0/GRCh38/pharmcat/pharmcat_positions_2.15.4.vcf.bgz +pharmcat_positions_vcf_index /datasetpbrarediseases/dataset/hifi-wdl-resources-v2.0.0/GRCh38/pharmcat/pharmcat_positions_2.15.4.vcf.bgz.csi diff --git a/backends/azure/GRCh38.tertiary_map.v2p0p0.azure.tsv b/backends/azure/GRCh38.tertiary_map.v2p0p0.azure.tsv new file mode 100644 index 00000000..49090470 --- /dev/null +++ b/backends/azure/GRCh38.tertiary_map.v2p0p0.azure.tsv @@ -0,0 +1,12 @@ +slivar_js /datasetpbrarediseases/dataset/hifi-wdl-resources-v2.0.0/slivar/slivar-functions.v0.2.8.js +ensembl_gff /datasetpbrarediseases/dataset/hifi-wdl-resources-v2.0.0/GRCh38/ensembl.GRCh38.101.reformatted.gff3.gz +lof_lookup 
/datasetpbrarediseases/dataset/hifi-wdl-resources-v2.0.0/slivar/lof_lookup.v2.1.1.txt +clinvar_lookup /datasetpbrarediseases/dataset/hifi-wdl-resources-v2.0.0/slivar/clinvar_gene_desc.20240624T165443.txt +slivar_gnotate_files /datasetpbrarediseases/dataset/hifi-wdl-resources-v2.0.0/GRCh38/slivar_gnotate/gnomad.hg38.v4.1.custom.v1.zip,/datasetpbrarediseases/dataset/hifi-wdl-resources-v2.0.0/GRCh38/slivar_gnotate/CoLoRSdb.GRCh38.v1.1.0.deepvariant.glnexus.zip +slivar_gnotate_prefixes gnomad,colors +slivar_max_af 0.03 +slivar_max_nhomalt 4 +slivar_max_ac 4 +slivar_min_gq 5 +svpack_pop_vcfs /datasetpbrarediseases/dataset/hifi-wdl-resources-v2.0.0/GRCh38/sv_pop_vcfs/gnomad.v4.1.sv.sites.pass.vcf.gz,/datasetpbrarediseases/dataset/hifi-wdl-resources-v2.0.0/GRCh38/sv_pop_vcfs/CoLoRSdb.GRCh38.v1.1.0.pbsv.jasmine.vcf.gz +svpack_pop_vcf_indices /datasetpbrarediseases/dataset/hifi-wdl-resources-v2.0.0/GRCh38/sv_pop_vcfs/gnomad.v4.1.sv.sites.pass.vcf.gz.tbi,/datasetpbrarediseases/dataset/hifi-wdl-resources-v2.0.0/GRCh38/sv_pop_vcfs/CoLoRSdb.GRCh38.v1.1.0.pbsv.jasmine.vcf.gz.tbi diff --git a/backends/azure/family.azure.inputs.json b/backends/azure/family.azure.inputs.json new file mode 100644 index 00000000..d9c300de --- /dev/null +++ b/backends/azure/family.azure.inputs.json @@ -0,0 +1,22 @@ +{ + "humanwgs_family.family": { + "family_id": "String", + "samples": [ + { + "sample_id": "String", + "hifi_reads": [ + "File" + ], + "affected": "Boolean", + "sex": "String? (optional, ['MALE', 'FEMALE', null])", + "father_id": "String? (optional)", + "mother_id": "String? (optional)" + } + ] + }, + "humanwgs_family.phenotypes": "String? (optional)", + "humanwgs_family.ref_map_file": "/datasetpbrarediseases/dataset/hifi-wdl-resources-v2p0p0-rc6/GRCh38.ref_map.v2p0p0-rc6.hpc.tsv", + "humanwgs_family.tertiary_map_file": "/datasetpbrarediseases/dataset/hifi-wdl-resources-v2p0p0-rc6/GRCh38.tertiary_map.v2p0p0-rc6.hpc.tsv", + "humanwgs_family.backend": "Azure", + "humanwgs_family.preemptible": "Boolean" +} \ No newline at end of file diff --git a/backends/azure/inputs.azure.json b/backends/azure/inputs.azure.json deleted file mode 100644 index 8d603d54..00000000 --- a/backends/azure/inputs.azure.json +++ /dev/null @@ -1,73 +0,0 @@ -{ - "humanwgs.cohort": { - "cohort_id": "String", - "samples": [ - { - "sample_id": "String", - "movie_bams": [ - "File" - ], - "sex": "String?", - "affected": "Boolean", - "father_id": "String?", - "mother_id": "String?" 
- } - ], - "phenotypes": [ - "String" - ] - }, - "humanwgs.reference": { - "name": "GRCh38", - "fasta": { - "data": "/datasetpbrarediseases/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta", - "data_index": "/datasetpbrarediseases/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai" - }, - "pbsv_splits": "/datasetpbrarediseases/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.pbsv_splits.json", - "tandem_repeat_bed": "/datasetpbrarediseases/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.trf.bed", - "trgt_tandem_repeat_bed": "/datasetpbrarediseases/dataset/GRCh38/trgt/human_GRCh38_no_alt_analysis_set.trgt.v0.3.4.bed", - "hificnv_exclude_bed": { - "data": "/datasetpbrarediseases/dataset/GRCh38/hificnv/cnv.excluded_regions.common_50.hg38.bed.gz", - "data_index": "/datasetpbrarediseases/dataset/GRCh38/hificnv/cnv.excluded_regions.common_50.hg38.bed.gz.tbi" - }, - "hificnv_expected_bed_male": "/datasetpbrarediseases/dataset/GRCh38/hificnv/expected_cn.hg38.XY.bed", - "hificnv_expected_bed_female": "/datasetpbrarediseases/dataset/GRCh38/hificnv/expected_cn.hg38.XX.bed", - "gnomad_af": "/datasetpbrarediseases/dataset/GRCh38/slivar_gnotate/gnomad.hg38.v3.custom.v1.zip", - "hprc_af": "/datasetpbrarediseases/dataset/GRCh38/slivar_gnotate/hprc.deepvariant.glnexus.hg38.v1.zip", - "gff": "/datasetpbrarediseases/dataset/GRCh38/ensembl.GRCh38.101.reformatted.gff3.gz", - "population_vcfs": [ - { - "data": "/datasetpbrarediseases/dataset/GRCh38/sv_pop_vcfs/EEE_SV-Pop_1.ALL.sites.20181204.vcf.gz", - "data_index": "/datasetpbrarediseases/dataset/GRCh38/sv_pop_vcfs/EEE_SV-Pop_1.ALL.sites.20181204.vcf.gz.tbi" - }, - { - "data": "/datasetpbrarediseases/dataset/GRCh38/sv_pop_vcfs/nstd166.GRCh38.variant_call.vcf.gz", - "data_index": "/datasetpbrarediseases/dataset/GRCh38/sv_pop_vcfs/nstd166.GRCh38.variant_call.vcf.gz.tbi" - }, - { - "data": "/datasetpbrarediseases/dataset/GRCh38/sv_pop_vcfs/hprc.GRCh38.pbsv.vcf.gz", - "data_index": "/datasetpbrarediseases/dataset/GRCh38/sv_pop_vcfs/hprc.GRCh38.pbsv.vcf.gz.tbi" - }, - { - "data": "/datasetpbrarediseases/dataset/GRCh38/sv_pop_vcfs/ont_sv_high_confidence_SVs.sorted.vcf.gz", - "data_index": "/datasetpbrarediseases/dataset/GRCh38/sv_pop_vcfs/ont_sv_high_confidence_SVs.sorted.vcf.gz.tbi" - } - ] - }, - "humanwgs.slivar_data": { - "slivar_js": "/datasetpbrarediseases/dataset/slivar/slivar-functions.v0.2.8.js", - "hpo_terms": "/datasetpbrarediseases/dataset/hpo/hpoTerms.txt", - "hpo_dag": "/datasetpbrarediseases/dataset/hpo/hpoDag.txt", - "hpo_annotations": "/datasetpbrarediseases/dataset/hpo/ensembl.hpoPhenotype.tsv", - "ensembl_to_hgnc": "/datasetpbrarediseases/dataset/genes/ensembl.hgncSymbol.tsv", - "lof_lookup": "/datasetpbrarediseases/dataset/slivar/lof_lookup.v2.1.1.txt", - "clinvar_lookup": "/datasetpbrarediseases/dataset/slivar/clinvar_gene_desc.20221214T183140.txt" - }, - "humanwgs.deepvariant_version": "String (optional)", - "humanwgs.deepvariant_model": "WomCompositeType {\n model -> WomCompositeType {\n data -> File\ndata_index -> File \n}\nmetadata -> File \n}? 
(optional)", - "humanwgs.pbsv_call_mem_gb": "Int (optional, default = if N<=3: 64 else 96)", - "humanwgs.glnexus_mem_gb": "Int (optional, default = 30)", - "humanwgs.run_tertiary_analysis": "Boolean (optional, default = false)", - "humanwgs.backend": "Azure", - "humanwgs.preemptible": "Boolean" -} diff --git a/backends/azure/singleton.azure.inputs.json b/backends/azure/singleton.azure.inputs.json new file mode 100644 index 00000000..06f8d19c --- /dev/null +++ b/backends/azure/singleton.azure.inputs.json @@ -0,0 +1,12 @@ +{ + "humanwgs_singleton.sample_id": "String", + "humanwgs_singleton.sex": "String? (optional, ['MALE', 'FEMALE', null])", + "humanwgs_singleton.hifi_reads": [ + "File" + ], + "humanwgs_singleton.phenotypes": "String? (optional)", + "humanwgs_singleton.ref_map_file": "/datasetpbrarediseases/dataset/hifi-wdl-resources-v2p0p0-rc6/GRCh38.ref_map.v2p0p0-rc6.hpc.tsv", + "humanwgs_singleton.tertiary_map_file": "/datasetpbrarediseases/dataset/hifi-wdl-resources-v2p0p0-rc6/GRCh38.tertiary_map.v2p0p0-rc6.hpc.tsv", + "humanwgs_singleton.backend": "Azure", + "humanwgs_singleton.preemptible": "Boolean" +} \ No newline at end of file diff --git a/backends/gcp/GRCh38.ref_map.v2p0p0.gcp.tsv b/backends/gcp/GRCh38.ref_map.v2p0p0.gcp.tsv new file mode 100644 index 00000000..2d63bdea --- /dev/null +++ b/backends/gcp/GRCh38.ref_map.v2p0p0.gcp.tsv @@ -0,0 +1,12 @@ +name GRCh38 +fasta gs://pacbio-wdl/hifi-wdl-resources-v2.0.0/GRCh38/human_GRCh38_no_alt_analysis_set.fasta +fasta_index gs://pacbio-wdl/hifi-wdl-resources-v2.0.0/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai +pbsv_splits gs://pacbio-wdl/hifi-wdl-resources-v2.0.0/GRCh38/human_GRCh38_no_alt_analysis_set.pbsv_splits.json +pbsv_tandem_repeat_bed gs://pacbio-wdl/hifi-wdl-resources-v2.0.0/GRCh38/human_GRCh38_no_alt_analysis_set.trf.bed +trgt_tandem_repeat_bed gs://pacbio-wdl/hifi-wdl-resources-v2.0.0/GRCh38/trgt/human_GRCh38_no_alt_analysis_set.trgt.v0.3.4.bed +hificnv_exclude_bed gs://pacbio-wdl/hifi-wdl-resources-v2.0.0/GRCh38/hificnv/cnv.excluded_regions.common_50.hg38.bed.gz +hificnv_exclude_bed_index gs://pacbio-wdl/hifi-wdl-resources-v2.0.0/GRCh38/hificnv/cnv.excluded_regions.common_50.hg38.bed.gz.tbi +hificnv_expected_bed_male gs://pacbio-wdl/hifi-wdl-resources-v2.0.0/GRCh38/hificnv/expected_cn.hg38.XY.bed +hificnv_expected_bed_female gs://pacbio-wdl/hifi-wdl-resources-v2.0.0/GRCh38/hificnv/expected_cn.hg38.XX.bed +pharmcat_positions_vcf gs://pacbio-wdl/hifi-wdl-resources-v2.0.0/GRCh38/pharmcat/pharmcat_positions_2.15.4.vcf.bgz +pharmcat_positions_vcf_index gs://pacbio-wdl/hifi-wdl-resources-v2.0.0/GRCh38/pharmcat/pharmcat_positions_2.15.4.vcf.bgz.csi diff --git a/backends/gcp/GRCh38.tertiary_map.v2p0p0.gcp.tsv b/backends/gcp/GRCh38.tertiary_map.v2p0p0.gcp.tsv new file mode 100644 index 00000000..bf5d42a9 --- /dev/null +++ b/backends/gcp/GRCh38.tertiary_map.v2p0p0.gcp.tsv @@ -0,0 +1,12 @@ +slivar_js gs://pacbio-wdl/hifi-wdl-resources-v2.0.0/slivar/slivar-functions.v0.2.8.js +ensembl_gff gs://pacbio-wdl/hifi-wdl-resources-v2.0.0/GRCh38/ensembl.GRCh38.101.reformatted.gff3.gz +lof_lookup gs://pacbio-wdl/hifi-wdl-resources-v2.0.0/slivar/lof_lookup.v2.1.1.txt +clinvar_lookup gs://pacbio-wdl/hifi-wdl-resources-v2.0.0/slivar/clinvar_gene_desc.20240624T165443.txt +slivar_gnotate_files gs://pacbio-wdl/hifi-wdl-resources-v2.0.0/GRCh38/slivar_gnotate/gnomad.hg38.v4.1.custom.v1.zip,gs://pacbio-wdl/hifi-wdl-resources-v2.0.0/GRCh38/slivar_gnotate/CoLoRSdb.GRCh38.v1.1.0.deepvariant.glnexus.zip +slivar_gnotate_prefixes gnomad,colors 
+slivar_max_af 0.03 +slivar_max_nhomalt 4 +slivar_max_ac 4 +slivar_min_gq 5 +svpack_pop_vcfs gs://pacbio-wdl/hifi-wdl-resources-v2.0.0/GRCh38/sv_pop_vcfs/gnomad.v4.1.sv.sites.pass.vcf.gz,gs://pacbio-wdl/hifi-wdl-resources-v2.0.0/GRCh38/sv_pop_vcfs/CoLoRSdb.GRCh38.v1.1.0.pbsv.jasmine.vcf.gz +svpack_pop_vcf_indices gs://pacbio-wdl/hifi-wdl-resources-v2.0.0/GRCh38/sv_pop_vcfs/gnomad.v4.1.sv.sites.pass.vcf.gz.tbi,gs://pacbio-wdl/hifi-wdl-resources-v2.0.0/GRCh38/sv_pop_vcfs/CoLoRSdb.GRCh38.v1.1.0.pbsv.jasmine.vcf.gz.tbi diff --git a/backends/gcp/family.gcp.inputs.json b/backends/gcp/family.gcp.inputs.json new file mode 100644 index 00000000..10392061 --- /dev/null +++ b/backends/gcp/family.gcp.inputs.json @@ -0,0 +1,23 @@ +{ + "humanwgs_family.family": { + "family_id": "String", + "samples": [ + { + "sample_id": "String", + "hifi_reads": [ + "File" + ], + "affected": "Boolean", + "sex": "String? (optional, ['MALE', 'FEMALE', null])", + "father_id": "String? (optional)", + "mother_id": "String? (optional)" + } + ] + }, + "humanwgs_family.phenotypes": "String? (optional)", + "humanwgs_family.ref_map_file": "gs://pacbio-wdl/hifi-wdl-resources-v2p0p0-rc6/GRCh38.ref_map.v2p0p0-rc6.gcp.tsv", + "humanwgs_family.tertiary_map_file": "gs://pacbio-wdl/hifi-wdl-resources-v2p0p0-rc6/GRCh38.tertiary_map.v2p0p0-rc6.gcp.tsv", + "humanwgs_family.backend": "GCP", + "humanwgs_family.zones": "String", + "humanwgs_family.preemptible": "Boolean" +} \ No newline at end of file diff --git a/backends/gcp/inputs.gcp.json b/backends/gcp/inputs.gcp.json deleted file mode 100644 index 483998e7..00000000 --- a/backends/gcp/inputs.gcp.json +++ /dev/null @@ -1,74 +0,0 @@ -{ - "humanwgs.cohort": { - "cohort_id": "String", - "samples": [ - { - "sample_id": "String", - "movie_bams": [ - "File" - ], - "sex": "String?", - "affected": "Boolean", - "father_id": "String?", - "mother_id": "String?" 
- } - ], - "phenotypes": [ - "String" - ] - }, - "humanwgs.reference": { - "name": "GRCh38", - "fasta": { - "data": "gs://pacbio-wdl/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta", - "data_index": "gs://pacbio-wdl/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai" - }, - "pbsv_splits": "gs://pacbio-wdl/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.pbsv_splits.json", - "tandem_repeat_bed": "gs://pacbio-wdl/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.trf.bed", - "trgt_tandem_repeat_bed": "gs://pacbio-wdl/dataset/GRCh38/trgt/human_GRCh38_no_alt_analysis_set.trgt.v0.3.4.bed", - "hificnv_exclude_bed": { - "data": "gs://pacbio-wdl/dataset/GRCh38/hificnv/cnv.excluded_regions.common_50.hg38.bed.gz", - "data_index": "gs://pacbio-wdl/dataset/GRCh38/hificnv/cnv.excluded_regions.common_50.hg38.bed.gz.tbi" - }, - "hificnv_expected_bed_male": "gs://pacbio-wdl/dataset/GRCh38/hificnv/expected_cn.hg38.XY.bed", - "hificnv_expected_bed_female": "gs://pacbio-wdl/dataset/GRCh38/hificnv/expected_cn.hg38.XX.bed", - "gnomad_af": "gs://pacbio-wdl/dataset/GRCh38/slivar_gnotate/gnomad.hg38.v3.custom.v1.zip", - "hprc_af": "gs://pacbio-wdl/dataset/GRCh38/slivar_gnotate/hprc.deepvariant.glnexus.hg38.v1.zip", - "gff": "gs://pacbio-wdl/dataset/GRCh38/ensembl.GRCh38.101.reformatted.gff3.gz", - "population_vcfs": [ - { - "data": "gs://pacbio-wdl/dataset/GRCh38/sv_pop_vcfs/EEE_SV-Pop_1.ALL.sites.20181204.vcf.gz", - "data_index": "gs://pacbio-wdl/dataset/GRCh38/sv_pop_vcfs/EEE_SV-Pop_1.ALL.sites.20181204.vcf.gz.tbi" - }, - { - "data": "gs://pacbio-wdl/dataset/GRCh38/sv_pop_vcfs/nstd166.GRCh38.variant_call.vcf.gz", - "data_index": "gs://pacbio-wdl/dataset/GRCh38/sv_pop_vcfs/nstd166.GRCh38.variant_call.vcf.gz.tbi" - }, - { - "data": "gs://pacbio-wdl/dataset/GRCh38/sv_pop_vcfs/hprc.GRCh38.pbsv.vcf.gz", - "data_index": "gs://pacbio-wdl/dataset/GRCh38/sv_pop_vcfs/hprc.GRCh38.pbsv.vcf.gz.tbi" - }, - { - "data": "gs://pacbio-wdl/dataset/GRCh38/sv_pop_vcfs/ont_sv_high_confidence_SVs.sorted.vcf.gz", - "data_index": "gs://pacbio-wdl/dataset/GRCh38/sv_pop_vcfs/ont_sv_high_confidence_SVs.sorted.vcf.gz.tbi" - } - ] - }, - "humanwgs.slivar_data": { - "slivar_js": "gs://pacbio-wdl/dataset/slivar/slivar-functions.v0.2.8.js", - "hpo_terms": "gs://pacbio-wdl/dataset/hpo/hpoTerms.txt", - "hpo_dag": "gs://pacbio-wdl/dataset/hpo/hpoDag.txt", - "hpo_annotations": "gs://pacbio-wdl/dataset/hpo/ensembl.hpoPhenotype.tsv", - "ensembl_to_hgnc": "gs://pacbio-wdl/dataset/genes/ensembl.hgncSymbol.tsv", - "lof_lookup": "gs://pacbio-wdl/dataset/slivar/lof_lookup.v2.1.1.txt", - "clinvar_lookup": "gs://pacbio-wdl/dataset/slivar/clinvar_gene_desc.20221214T183140.txt" - }, - "humanwgs.deepvariant_version": "String (optional)", - "humanwgs.deepvariant_model": "WomCompositeType {\n model -> WomCompositeType {\n data -> File\ndata_index -> File \n}\nmetadata -> File \n}? (optional)", - "humanwgs.pbsv_call_mem_gb": "Int (optional, default = if N<=3: 64 else 96)", - "humanwgs.glnexus_mem_gb": "Int (optional, default = 30)", - "humanwgs.run_tertiary_analysis": "Boolean (optional, default = false)", - "humanwgs.backend": "GCP", - "humanwgs.zones": "String", - "humanwgs.preemptible": "Boolean" -} diff --git a/backends/gcp/singleton.gcp.inputs.json b/backends/gcp/singleton.gcp.inputs.json new file mode 100644 index 00000000..6629ab17 --- /dev/null +++ b/backends/gcp/singleton.gcp.inputs.json @@ -0,0 +1,13 @@ +{ + "humanwgs_singleton.sample_id": "String", + "humanwgs_singleton.sex": "String? 
(optional, ['MALE', 'FEMALE', null])", + "humanwgs_singleton.hifi_reads": [ + "File" + ], + "humanwgs_singleton.phenotypes": "String? (optional)", + "humanwgs_singleton.ref_map_file": "gs://pacbio-wdl/hifi-wdl-resources-v2p0p0-rc6/GRCh38.ref_map.v2p0p0-rc6.gcp.tsv", + "humanwgs_singleton.tertiary_map_file": "gs://pacbio-wdl/hifi-wdl-resources-v2p0p0-rc6/GRCh38.tertiary_map.v2p0p0-rc6.gcp.tsv", + "humanwgs_singleton.backend": "GCP", + "humanwgs_singleton.zones": "String", + "humanwgs_singleton.preemptible": "Boolean" +} \ No newline at end of file diff --git a/backends/hpc/GRCh38.ref_map.v2p0p0.template.tsv b/backends/hpc/GRCh38.ref_map.v2p0p0.template.tsv new file mode 100644 index 00000000..3a095aaf --- /dev/null +++ b/backends/hpc/GRCh38.ref_map.v2p0p0.template.tsv @@ -0,0 +1,12 @@ +name GRCh38 +fasta /hifi-wdl-resources-v2.0.0/GRCh38/human_GRCh38_no_alt_analysis_set.fasta +fasta_index /hifi-wdl-resources-v2.0.0/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai +pbsv_splits /hifi-wdl-resources-v2.0.0/GRCh38/human_GRCh38_no_alt_analysis_set.pbsv_splits.json +pbsv_tandem_repeat_bed /hifi-wdl-resources-v2.0.0/GRCh38/human_GRCh38_no_alt_analysis_set.trf.bed +trgt_tandem_repeat_bed /hifi-wdl-resources-v2.0.0/GRCh38/trgt/human_GRCh38_no_alt_analysis_set.trgt.v0.3.4.bed +hificnv_exclude_bed /hifi-wdl-resources-v2.0.0/GRCh38/hificnv/cnv.excluded_regions.common_50.hg38.bed.gz +hificnv_exclude_bed_index /hifi-wdl-resources-v2.0.0/GRCh38/hificnv/cnv.excluded_regions.common_50.hg38.bed.gz.tbi +hificnv_expected_bed_male /hifi-wdl-resources-v2.0.0/GRCh38/hificnv/expected_cn.hg38.XY.bed +hificnv_expected_bed_female /hifi-wdl-resources-v2.0.0/GRCh38/hificnv/expected_cn.hg38.XX.bed +pharmcat_positions_vcf /hifi-wdl-resources-v2.0.0/GRCh38/pharmcat/pharmcat_positions_2.15.4.vcf.bgz +pharmcat_positions_vcf_index /hifi-wdl-resources-v2.0.0/GRCh38/pharmcat/pharmcat_positions_2.15.4.vcf.bgz.csi diff --git a/backends/hpc/GRCh38.tertiary_map.v2p0p0.template.tsv b/backends/hpc/GRCh38.tertiary_map.v2p0p0.template.tsv new file mode 100644 index 00000000..f7a2d2f3 --- /dev/null +++ b/backends/hpc/GRCh38.tertiary_map.v2p0p0.template.tsv @@ -0,0 +1,12 @@ +slivar_js /hifi-wdl-resources-v2.0.0/slivar/slivar-functions.v0.2.8.js +ensembl_gff /hifi-wdl-resources-v2.0.0/GRCh38/ensembl.GRCh38.101.reformatted.gff3.gz +lof_lookup /hifi-wdl-resources-v2.0.0/slivar/lof_lookup.v2.1.1.txt +clinvar_lookup /hifi-wdl-resources-v2.0.0/slivar/clinvar_gene_desc.20240624T165443.txt +slivar_gnotate_files /hifi-wdl-resources-v2.0.0/GRCh38/slivar_gnotate/gnomad.hg38.v4.1.custom.v1.zip,/hifi-wdl-resources-v2.0.0/GRCh38/slivar_gnotate/CoLoRSdb.GRCh38.v1.1.0.deepvariant.glnexus.zip +slivar_gnotate_prefixes gnomad,colors +slivar_max_af 0.03 +slivar_max_nhomalt 4 +slivar_max_ac 4 +slivar_min_gq 5 +svpack_pop_vcfs /hifi-wdl-resources-v2.0.0/GRCh38/sv_pop_vcfs/gnomad.v4.1.sv.sites.pass.vcf.gz,/hifi-wdl-resources-v2.0.0/GRCh38/sv_pop_vcfs/CoLoRSdb.GRCh38.v1.1.0.pbsv.jasmine.vcf.gz +svpack_pop_vcf_indices /hifi-wdl-resources-v2.0.0/GRCh38/sv_pop_vcfs/gnomad.v4.1.sv.sites.pass.vcf.gz.tbi,/hifi-wdl-resources-v2.0.0/GRCh38/sv_pop_vcfs/CoLoRSdb.GRCh38.v1.1.0.pbsv.jasmine.vcf.gz.tbi diff --git a/backends/hpc/README.md b/backends/hpc/README.md deleted file mode 100644 index bee848a7..00000000 --- a/backends/hpc/README.md +++ /dev/null @@ -1,48 +0,0 @@ -Either `miniwdl` or `Cromwell` can be used to run workflows on the HPC. 
- -# Installing and configuring `miniwdl` - -## Requirements - -- [`miniwdl`](https://github.com/chanzuckerberg/miniwdl) >= 1.9.0 -- [`miniwdl-slurm`](https://github.com/miniwdl-ext/miniwdl-slurm) - -## Configuring - -An [example miniwdl.cfg file](miniwdl.cfg) is provided here. This should be placed at `~/.config/miniwdl.cfg` and edited to match your slurm configuration. This allows running workflows using a basic SLURM setup. - -# Installing and configuring `Cromwell` - -Cromwell supports a number of different HPC backends; see [Cromwell's documentation](https://cromwell.readthedocs.io/en/stable/backends/HPC/) for more information on configuring each of the backends. - -# Configuring and running the workflow - -## Filling out workflow inputs - -Fill out any information missing in [the inputs file](inputs.hpc.json). Once you have downloaded the reference data bundle, ensure that you have replaced the `` in the input template file with the local path to the reference datasets on your HPC. - -See [the inputs section of the main README](../../README.md#workflow-inputs) for more information on the structure of the inputs.json file. - -## Running the workflow - -### Running via miniwdl - -`miniwdl run workflows/main.wdl -i ` - -### Running via Cromwell - -`cromwell run workflows/main.wdl -i ` - -# Reference data bundle - -![https://doi.org/10.5281/zenodo.8415406](https://zenodo.org/badge/DOI/10.5281/zenodo.8415406.svg) - -Reference data is hosted on Zenodo at [10.5281/zenodo.8415406](https://zenodo.org/record/8415406). Download the reference data bundle and extract it to a location on your HPC, then update the input template file with the path to the reference data. - -```bash -# download the reference data bundle -wget https://zenodo.org/record/8415406/files/wdl-humanwgs.v1.0.2.resource.tgz - -# extract the reference data bundle and rename as dataset -tar -xzf wdl-humanwgs.v1.0.2.resource.tgz && mv static_resources dataset -``` diff --git a/backends/hpc/family.hpc.inputs.json b/backends/hpc/family.hpc.inputs.json new file mode 100644 index 00000000..a1db7a1e --- /dev/null +++ b/backends/hpc/family.hpc.inputs.json @@ -0,0 +1,22 @@ +{ + "humanwgs_family.family": { + "family_id": "String", + "samples": [ + { + "sample_id": "String", + "hifi_reads": [ + "File" + ], + "affected": "Boolean", + "sex": "String? (optional, ['MALE', 'FEMALE', null])", + "father_id": "String? (optional)", + "mother_id": "String? (optional)" + } + ] + }, + "humanwgs_family.phenotypes": "String? (optional)", + "humanwgs_family.ref_map_file": "/dataset/GRCh38.ref_map.v2p0p0-rc6.hpc.tsv", + "humanwgs_family.tertiary_map_file": "/dataset/GRCh38.tertiary_map.v2p0p0-rc6.hpc.tsv", + "humanwgs_family.backend": "HPC", + "humanwgs_family.preemptible": true +} \ No newline at end of file diff --git a/backends/hpc/inputs.hpc.json b/backends/hpc/inputs.hpc.json deleted file mode 100644 index e5a66215..00000000 --- a/backends/hpc/inputs.hpc.json +++ /dev/null @@ -1,73 +0,0 @@ -{ - "humanwgs.cohort": { - "cohort_id": "String", - "samples": [ - { - "sample_id": "String", - "movie_bams": [ - "File" - ], - "sex": "String?", - "affected": "Boolean", - "father_id": "String?", - "mother_id": "String?" 
- } - ], - "phenotypes": [ - "String" - ] - }, - "humanwgs.reference": { - "name": "GRCh38", - "fasta": { - "data": "/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta", - "data_index": "/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai" - }, - "pbsv_splits": "/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.pbsv_splits.json", - "tandem_repeat_bed": "/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.trf.bed", - "trgt_tandem_repeat_bed": "/dataset/GRCh38/trgt/human_GRCh38_no_alt_analysis_set.trgt.v0.3.4.bed", - "hificnv_exclude_bed": { - "data": "/dataset/GRCh38/hificnv/cnv.excluded_regions.common_50.hg38.bed.gz", - "data_index": "/dataset/GRCh38/hificnv/cnv.excluded_regions.common_50.hg38.bed.gz.tbi" - }, - "hificnv_expected_bed_male": "/dataset/GRCh38/hificnv/expected_cn.hg38.XY.bed", - "hificnv_expected_bed_female": "/dataset/GRCh38/hificnv/expected_cn.hg38.XX.bed", - "gnomad_af": "/dataset/GRCh38/slivar_gnotate/gnomad.hg38.v3.custom.v1.zip", - "hprc_af": "/dataset/GRCh38/slivar_gnotate/hprc.deepvariant.glnexus.hg38.v1.zip", - "gff": "/dataset/GRCh38/ensembl.GRCh38.101.reformatted.gff3.gz", - "population_vcfs": [ - { - "data": "/dataset/GRCh38/sv_pop_vcfs/EEE_SV-Pop_1.ALL.sites.20181204.vcf.gz", - "data_index": "/dataset/GRCh38/sv_pop_vcfs/EEE_SV-Pop_1.ALL.sites.20181204.vcf.gz.tbi" - }, - { - "data": "/dataset/GRCh38/sv_pop_vcfs/nstd166.GRCh38.variant_call.vcf.gz", - "data_index": "/dataset/GRCh38/sv_pop_vcfs/nstd166.GRCh38.variant_call.vcf.gz.tbi" - }, - { - "data": "/dataset/GRCh38/sv_pop_vcfs/hprc.GRCh38.pbsv.vcf.gz", - "data_index": "/dataset/GRCh38/sv_pop_vcfs/hprc.GRCh38.pbsv.vcf.gz.tbi" - }, - { - "data": "/dataset/GRCh38/sv_pop_vcfs/ont_sv_high_confidence_SVs.sorted.vcf.gz", - "data_index": "/dataset/GRCh38/sv_pop_vcfs/ont_sv_high_confidence_SVs.sorted.vcf.gz.tbi" - } - ] - }, - "humanwgs.slivar_data": { - "slivar_js": "/dataset/slivar/slivar-functions.v0.2.8.js", - "hpo_terms": "/dataset/hpo/hpoTerms.txt", - "hpo_dag": "/dataset/hpo/hpoDag.txt", - "hpo_annotations": "/dataset/hpo/ensembl.hpoPhenotype.tsv", - "ensembl_to_hgnc": "/dataset/genes/ensembl.hgncSymbol.tsv", - "lof_lookup": "/dataset/slivar/lof_lookup.v2.1.1.txt", - "clinvar_lookup": "/dataset/slivar/clinvar_gene_desc.20221214T183140.txt" - }, - "humanwgs.deepvariant_version": "String (optional)", - "humanwgs.deepvariant_model": "WomCompositeType {\n model -> WomCompositeType {\n data -> File\ndata_index -> File \n}\nmetadata -> File \n}? (optional)", - "humanwgs.pbsv_call_mem_gb": "Int (optional, default = if N<=3: 64 else 96)", - "humanwgs.glnexus_mem_gb": "Int (optional, default = 30)", - "humanwgs.run_tertiary_analysis": "Boolean (optional, default = false)", - "humanwgs.backend": "HPC", - "humanwgs.preemptible": true -} diff --git a/backends/hpc/miniwdl.cfg b/backends/hpc/miniwdl.cfg index 3bdd33dc..190a3b45 100644 --- a/backends/hpc/miniwdl.cfg +++ b/backends/hpc/miniwdl.cfg @@ -5,12 +5,15 @@ container_backend = slurm_singularity # higher numbers means miniwdl has to monitor more processes simultaneously # which might impact performance. task_concurrency=200 - # This setting allows running tasks to continue, even if one other tasks fails. # Useful in combination with call caching. Prevents wasting resources by # cancelling jobs half-way that would probably succeed. fail_fast = false +[file_io] +# This is required for the new ref_map and tertiary_map inputs. +allow_any_input = true + [call_cache] # The following settings create a call cache under the current directory. 
# This prevents wasting unnecessary resources on the cluster by rerunning @@ -37,7 +40,8 @@ exe = ["/usr/bin/singularity"] # the miniwdl default options contain options to run as a fake root, which # is not available on most clusters. run_options = [ - "--containall" + "--containall", + "--nv" ] # Location of the singularity images (optional). The miniwdl-slurm plugin @@ -47,4 +51,4 @@ image_cache = "$PWD/miniwdl_singularity_cache" [slurm] # extra arguments passed to the srun command (optional). -extra_args="--partition compute --comment 'run with miniwdl'" +extra_args="--partition compute --comment 'run with miniwdl'" \ No newline at end of file diff --git a/backends/hpc/singleton.hpc.inputs.json b/backends/hpc/singleton.hpc.inputs.json new file mode 100644 index 00000000..52ff1b2b --- /dev/null +++ b/backends/hpc/singleton.hpc.inputs.json @@ -0,0 +1,12 @@ +{ + "humanwgs_singleton.sample_id": "String", + "humanwgs_singleton.sex": "String? (optional, ['MALE', 'FEMALE', null])", + "humanwgs_singleton.hifi_reads": [ + "File" + ], + "humanwgs_singleton.phenotypes": "String? (optional)", + "humanwgs_singleton.ref_map_file": "/dataset/GRCh38.ref_map.v2p0p0-rc6.hpc.tsv", + "humanwgs_singleton.tertiary_map_file": "/dataset/GRCh38.tertiary_map.v2p0p0-rc6.hpc.tsv", + "humanwgs_singleton.backend": "HPC", + "humanwgs_singleton.preemptible": true +} \ No newline at end of file diff --git a/docs/backend-aws-healthomics.md b/docs/backend-aws-healthomics.md new file mode 100644 index 00000000..70644910 --- /dev/null +++ b/docs/backend-aws-healthomics.md @@ -0,0 +1 @@ +# TBD diff --git a/backends/azure/README.md b/docs/backend-azure.md similarity index 70% rename from backends/azure/README.md rename to docs/backend-azure.md index 357a43fb..3eef5436 100644 --- a/backends/azure/README.md +++ b/docs/backend-azure.md @@ -6,24 +6,22 @@ Workflows can be run in Azure by setting up [Cromwell on Azure (CoA)](https://gi - [Cromwell on Azure](https://github.com/microsoft/CromwellOnAzure) version 3.2+; version 4.0+ is recommended -# Configuring and running the workflow +## Configuring and running the workflow -## Filling out workflow inputs +### Filling out workflow inputs -Fill out any information missing in [the inputs file](inputs.azure.json). +Fill out any information missing in [the inputs file](../blob/main/backends/azure/singleton.azure.inputs.json). -See [the inputs section of the main README](../../README.md#workflow-inputs) for more information on the structure of the inputs.json file. - -## Running the workflow +See [the inputs section of the main README](./singleton#inputs) for more information on the structure of the inputs.json file. ### Running via Cromwell on Azure Instructions for running a workflow from Cromwell on Azure are described in [the Cromwell on Azure documentation](https://github.com/microsoft/CromwellOnAzure/wiki/Running-Workflows). 
-# Reference data hosted in Azure +## Reference data hosted in Azure To use Azure reference data, add the following line to your `containers-to-mount` file in your Cromwell on Azure installation ([more info here](https://github.com/microsoft/CromwellOnAzure/blob/develop/docs/troubleshooting-guide.md#use-input-data-files-from-an-existing-azure-storage-account-that-my-lab-or-team-is-currently-using)): `https://datasetpbrarediseases.blob.core.windows.net/dataset?si=public&spr=https&sv=2021-06-08&sr=c&sig=o6OkcqWWlGcGOOr8I8gCA%2BJwlpA%2FYsRz0DMB8CCtCJk%3D` -The [Azure input file template](inputs.azure.json) has paths to the reference files in this blob storage prefilled. +The [Azure input file template](../blob/main/backends/azure/singleton.azure.inputs.json) has paths to the reference files in this blob storage prefilled. diff --git a/backends/gcp/README.md b/docs/backend-gcp.md similarity index 71% rename from backends/gcp/README.md rename to docs/backend-gcp.md index e1ea7c4a..3daa1140 100644 --- a/backends/gcp/README.md +++ b/docs/backend-gcp.md @@ -2,15 +2,15 @@ [Cromwell's documentation](https://cromwell.readthedocs.io/en/stable/tutorials/PipelinesApi101/) on getting started with Google's genomics Pipelines API can be used to set up the resources needed to run the workflow. -# Configuring and running the workflow +## Configuring and running the workflow -## Filling out workflow inputs +### Filling out workflow inputs -Fill out any information missing in [the inputs file](inputs.gcp.json). +Fill out any information missing in [the inputs file](../blob/main/backends/gcp/singleton.gcp.inputs.json). -See [the inputs section of the main README](../../README.md#workflow-inputs) for more information on the structure of the inputs.json file. +See [the inputs section of the singleton README](./singleton#inputs) for more information on the structure of the inputs.json file. -### Determining available zones +#### Determining available zones To determine available zones in GCP, run the following; available zones within a region can be found in the first column of the output: @@ -18,16 +18,13 @@ To determine available zones in GCP, run the following; available zones within a gcloud compute zones list | grep ``` -For example, the zones in region us-central1 are `"us-central1-a us-central1-b us-central1c us-central1f"`. +For example, the zones in region `us-central1` are `"us-central1-a us-central1-b us-central1c us-central1f"`. -## Running the workflow - -### Running via Google's genomics Pipelines API +## Running the workflow via Google's genomics Pipelines API [Cromwell's documentation](https://cromwell.readthedocs.io/en/stable/tutorials/PipelinesApi101/) on getting started with Google's genomics Pipelines API can be used as an example for how to run the workflow. - -# Reference data hosted in GCP +## Reference data hosted in GCP GCP reference data is hosted in the `us-west1` region in the bucket `gs://pacbio-wdl`. This bucket is requester-pays, meaning that users will need to [provide a billing project in their Cromwell configuration](https://cromwell.readthedocs.io/en/stable/filesystems/GoogleCloudStorage/) in order to use files located in this bucket. diff --git a/docs/backend-hpc.md b/docs/backend-hpc.md new file mode 100644 index 00000000..c4e87a3c --- /dev/null +++ b/docs/backend-hpc.md @@ -0,0 +1,52 @@ +# Installing and configuring for HPC backends + +Either `miniwdl` or `Cromwell` can be used to run workflows on the HPC. 
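For the `miniwdl` route, the two requirements listed in the next section can usually be installed into a user-level Python virtual environment. A minimal sketch (package names as published on PyPI; the environment path is arbitrary, and this assumes Python 3 and that your cluster permits user environments):

```bash
# Create an isolated environment and install miniwdl plus the Slurm plugin
python3 -m venv ~/miniwdl-env
source ~/miniwdl-env/bin/activate
pip install 'miniwdl>=1.9.0' miniwdl-slurm
```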
+ +## Installing and configuring `miniwdl` + +### Requirements + +- [`miniwdl`](https://github.com/chanzuckerberg/miniwdl) >= 1.9.0 +- [`miniwdl-slurm`](https://github.com/miniwdl-ext/miniwdl-slurm) + +### Configuration + +An [example miniwdl.cfg file](../blob/main/backends/hpc/miniwdl.cfg) is provided here. This should be placed at `~/.config/miniwdl.cfg` and edited to match your slurm configuration. This allows running workflows using a basic SLURM setup. + +## Installing and configuring `Cromwell` + +Cromwell supports a number of different HPC backends; see [Cromwell's documentation](https://cromwell.readthedocs.io/en/stable/backends/HPC/) for more information on configuring each of the backends. Cromwell can be used in a standalone "run" mode, or in "server" mode to allow for multiple users to submit workflows. In the example below, we provide example commands for running Cromwell in "run" mode. + +## Running the workflow + +### Filling out workflow inputs + +Fill out any information missing in [the inputs file](../blob/main/backends/hpc/singleton.hpc.inputs.json). Once you have downloaded the reference data bundle, ensure that you have replaced the `` in the input template file with the local path to the reference datasets on your HPC. + +See [the inputs section of the singleton README](./singleton#inputs) for more information on the structure of the inputs.json file. + +#### Running via miniwdl + +```bash +miniwdl run workflows/singleton.wdl --input +``` + +#### Running via Cromwell + +```bash +cromwell run workflows/singleton.wdl --input +``` + +## Reference data bundle + +![https://zenodo.org/records/14027047](https://zenodo.org/badge/DOI/10.5281/zenodo.14027047.svg) + +Reference data is hosted on Zenodo at [10.5281/zenodo.14027047](https://zenodo.org/record/14027047). Download the reference data bundle and extract it to a location on your HPC, then update the input template file with the path to the reference data. + +```bash +## download the reference data bundle +wget https://zenodo.org/record/12729255/files/hifi-wdl-resources-v2.0.0.tar + +## extract the reference data bundle and rename as dataset +tar -xvf hifi-wdl-resources-v2.0.0.tar +``` diff --git a/docs/bam_stats.md b/docs/bam_stats.md new file mode 100644 index 00000000..bec81ed2 --- /dev/null +++ b/docs/bam_stats.md @@ -0,0 +1,47 @@ +# bam_stats outputs + +```wdl +{sample}.{movie}.read_length_and_quality.tsv - per read length and quality metrics +{sample}.{movie}.read_length_summary.tsv - histogram of read lengths +{sample}.{movie}.read_quality_summary.tsv - histogram of read qualities +``` + +## `{sample}.{movie}.read_length_and_quality.tsv.gz` - per read length and quality metrics + +Base metrics are extracted for each read from the uBAM and stored in these 4 columns: + +- movie +- read name +- read length: length of query sequence +- read quality: transformation of `rq` tag into Phred (log) space, e.g., `rq:f:0.99` (99% accuracy, 1 error in 100 bases) is Phred 20 ($-10 \times \log(1 - 0.99)$); this value is capped at Phred 60 for `rq:f:1.0` + +## `{sample}.{movie}.*_summary.tsv` - read length and quality histograms + +Values are binned for the histogram files. 
+ +For a given row, the values are: + +- column 1: bin name; lower limit of bin; reads counted in this bin fall into the interval `[row N column 1, row N+1 column 1)` +- column 2: count of reads in this bin +- column 3: count of base pairs in this bin + +As an example these are the first few rows of a read_length_summary.tsv: + +```tsv +0 38 28079 +1000 284 466848 +2000 925 2383460 +``` + +First row are reads with 0-999 bp/read. There are 38 reads in this bin. They sum to 28 kbp. +Second row are reads with 1000-1999 bp/read. There are 284 reads in this bin. They sum to 466 kbp. +Third row are reads with 2000-2999 bp/read. There are 925 reads in this bin. They sum to 2.38 Mbp. +And here are some rows from the middle of a read_quality_summary.tsv: + +```tsv +28 370375 6973927935 +29 387446 7256330089 +30 372960 6888342745 +``` + +First row are reads with Phred scaled read quality in `[28, 29)`. These have predicted error rates between ~1/631 (Phred 28, inclusive) and ~1/794 (Phred 29). There are 370k reads in this bin, and they sum to 6.97 Gbp. diff --git a/docs/deepvariant.md b/docs/deepvariant.md new file mode 100644 index 00000000..2b8a9d3c --- /dev/null +++ b/docs/deepvariant.md @@ -0,0 +1,15 @@ +# DeepVariant subworkflow + +```mermaid +flowchart TD + aBAM[/"HiFi aBAM"/] --> make_examples["DeepVariant make_examples"] + make_examples --> gpu{"gpu?"} + gpu -- yes --> call_variants_gpu["DeepVariant call_variants_gpu"] + gpu -- no --> call_variants_cpu["DeepVariant call_variants_cpu"] + call_variants_gpu --> postprocess_variants["DeepVariant postprocess_variants"] + call_variants_cpu --> postprocess_variants + postprocess_variants --> vcf[/"small variant VCF"/] + postprocess_variants --> gvcf[/"small variant gVCF"/] +``` + +This subworkflow runs the three steps of DeepVariant individually in order to make best use of resources. If a GPU is available and `gpu==true`, the `call_variants` step will run on 1 GPU and 8 cpu threads, otherwise it will run on 64 CPU threads. The `make_examples` and `postprocess_variants` steps will always run on the CPU. 
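As a concrete illustration, the GPU branch is selected through the workflow-level `gpu` input. Below is a sketch of how this might be toggled for a singleton run under miniwdl on an HPC backend; it is not a tested command, and GPU scheduling still depends on your miniwdl/Slurm and Singularity configuration (e.g. the `--nv` run option).

```bash
# Enable the DeepVariant call_variants_gpu path; all other inputs come from the template JSON
miniwdl run workflows/singleton.wdl \
  --input backends/hpc/singleton.hpc.inputs.json \
  gpu=true
```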
diff --git a/docs/family.md b/docs/family.md new file mode 100644 index 00000000..b0a14fe4 --- /dev/null +++ b/docs/family.md @@ -0,0 +1,259 @@ +# family.wdl inputs and outputs + +- [family.wdl inputs and outputs](#familywdl-inputs-and-outputs) + - [DAG (simplified)](#dag-simplified) + - [Inputs](#inputs) + - [Family Struct](#family-struct) + - [Sample Struct](#sample-struct) + - [Outputs](#outputs) + - [Alignments, Coverage, and QC](#alignments-coverage-and-qc) + - [Small Variants (\<50 bp)](#small-variants-50-bp) + - [Structural Variants (≥50 bp)](#structural-variants-50-bp) + - [Copy Number Variants (≥100 kb)](#copy-number-variants-100-kb) + - [Tandem Repeat Genotyping](#tandem-repeat-genotyping) + - [Variant Phasing](#variant-phasing) + - [Variant Calling in Dark Regions](#variant-calling-in-dark-regions) + - [5mCpG Methylation Calling](#5mcpg-methylation-calling) + - [PGx Typing](#pgx-typing) + - [Tertiary Analysis](#tertiary-analysis) + +## DAG (simplified) + +```mermaid +--- +title: family.wdl +--- +flowchart TD + subgraph "`**Upstream of Phasing (per-sample)**`" + subgraph "per-movie" + ubam[/"HiFi uBAM"/] --> pbmm2_align["pbmm2 align"] + pbmm2_align --> pbsv_discover["PBSV discover"] + end + pbmm2_align --> merge_read_stats["merge read statistics"] + pbmm2_align --> samtools_merge["samtools merge"] + samtools_merge --> mosdepth["mosdepth"] + samtools_merge --> paraphase["Paraphase"] + samtools_merge --> hificnv["HiFiCNV"] + samtools_merge --> trgt["TRGT"] + samtools_merge --> trgt_dropouts["TR coverage dropouts"] + samtools_merge --> deepvariant["DeepVariant"] + end + subgraph "`**Joint Calling**`" + deepvariant --> glnexus["GLnexus (joint-call small variants)"] + pbsv_discover --> pbsv_call["PBSV call"] + glnexus --> split_glnexus["split small variant vcf by sample"] + pbsv_call --> split_pbsv["split SV vcf by sample"] + end + subgraph "`**Phasing and Downstream (per-sample)**`" + split_glnexus --> hiphase["HiPhase"] + trgt --> hiphase + split_pbsv --> hiphase + hiphase --> bcftools_roh["bcftools roh"] + hiphase --> bcftools_stats["bcftools stats\n(small variants)"] + hiphase --> sv_stats["SV stats"] + hiphase --> cpg_pileup["5mCpG pileup"] + hiphase --> starphase["StarPhase"] + hiphase --> pharmcat["PharmCat"] + starphase --> pharmcat + end + subgraph " " + hiphase --> merge_small_variants["bcftools merge small variants"] + hiphase --> merge_svs["bcftools merge SV"] + hiphase --> trgt_merge["trgt merge"] + end + subgraph "`**Tertiary Analysis**`" + merge_small_variants --> slivar_small_variants["slivar small variants"] + merge_svs --> svpack["svpack filter and annotate"] + svpack --> slivar_svpack["slivar svpack tsv"] + end +``` + +## Inputs + +| Type | Name | Description | Notes | +| ---- | ---- | ----------- | ----- | +| [Family](../blob/main/workflows/humanwgs_structs.wdl#L15) | family | Family struct describing samples, relationships, and unaligned BAM paths | [below](#family-struct) | +| File | [ref_map_file](./ref_map) | TSV containing reference genome file paths; must match backend | | +| String? | phenotypes | Comma-delimited list of HPO terms. | [Human Phenotype Ontology (HPO) phenotypes](https://hpo.jax.org/app/) associated with the cohort.

    If omitted, tertiary analysis will be skipped. |
+| File? | [tertiary_map_file](./tertiary_map) | TSV containing tertiary analysis file paths and thresholds; must match backend | `AF`/`AC`/`nhomalt` thresholds can be modified, but this will affect performance.<br>If omitted, tertiary analysis will be skipped. |
+| Int? | glnexus_mem_gb | Override GLnexus memory; optional | |
+| Int? | pbsv_call_mem_gb | Override PBSV call memory; optional | |
+| Boolean | gpu | Use GPU when possible<br>Default: `false` | [GPU support](./gpu#gpu-support) |
+| String | backend | Backend where the workflow will be executed<br>`["GCP", "Azure", "AWS-HealthOmics", "HPC"]` | |
+| String? | zones | Zones where compute will take place; required if backend is set to 'AWS' or 'GCP'. | [Determining available zones in GCP](./backends/gcp#determining-available-zones) |
+| String? | gpuType | GPU type to use; required if gpu is set to `true` for cloud backends; must match backend | [Available GPU types](./gpu#gpu-types) |
+| String? | container_registry | Container registry where workflow images are hosted.<br>Default: `"quay.io/pacbio"` | If omitted, [PacBio's public Quay.io registry](https://quay.io/organization/pacbio) will be used.<br>Custom container_registry must be set if backend is set to 'AWS-HealthOmics'. |
+| Boolean | preemptible | Where possible, run tasks preemptibly<br>`[true, false]`<br>Default: `true` | If set to `true`, run tasks preemptibly where possible. If set to `false`, on-demand VMs will be used for every task. Ignored if backend is set to HPC. |
+
+### Family Struct
+
+The `Family` struct contains the samples for the family. The struct has the following fields:
+
+| Type | Name | Description | Notes |
+| ---- | ---- | ----------- | ----- |
+| String | family_id | Unique identifier for the family | Alphanumeric characters, periods, dashes, and underscores are allowed. |
+| Array\[Sample\] | samples | Sample struct with sample specific data and metadata. | [below](#sample-struct) |
+
+### Sample Struct
+
+The `Sample` struct contains sample specific data and metadata. The struct has the following fields:
+
+| Type | Name | Description | Notes |
+| ---- | ---- | ----------- | ----- |
+| String | sample_id | Unique identifier for the sample | Alphanumeric characters, periods, dashes, and underscores are allowed. |
+| String? | sex | Sample sex<br>`["MALE", "FEMALE", null]` | Used by HiFiCNV and TRGT for genotyping. Allosome karyotype will default to XX unless sex is specified as `"MALE"`. Used for tertiary analysis X-linked inheritance filtering. |
+| Boolean | affected | Affected status | If set to `true`, sample is described as being affected by all HPO terms in `phenotypes`.
    If set to `false`, sample is described as not being affected by all HPO terms in `phenotypes`. | +| Array\[File\] | hifi_reads | Array of paths to HiFi reads in unaligned BAM format. | | +| String? | father_id | sample_id of father (optional) | | +| String? | mother_id | sample_id of mother (optional) | | + +## Outputs + +### Alignments, Coverage, and QC + +| Type | Name | Description | Notes | +| ---- | ---- | ----------- | ----- | +| String | workflow_name | Workflow name | | +| String | workflow_version | Workflow version | | +| Array\[String\] | sample_ids | Sample IDs | | +| File | stats_file | Table of summary statistics | | +| Array\[File\] | bam_stats | BAM stats | Per-read length and read-quality | +| Array\[File\] | read_length_plot | Read length plot | | +| Array\[File\] | read_quality_plot | Read quality plot | | +| Array\[File\] | merged_haplotagged_bam | Merged, haplotagged alignments | Includes unmapped reads | +| Array\[File\] | merged_haplotagged_bam_index | | | +| Array\[File\] | mosdepth_summary | Summary of aligned read depth. | | +| Array\[File\] | mosdepth_region_bed | Median aligned read depth by 500bp windows. | | +| Array\[File\] | mosdepth_region_bed_index | | | +| Array\[File\] | mosdepth_depth_distribution_plot | | | +| Array\[File\] | mapq_distribution_plot | Distribution of mapping quality per alignment | | +| Array\[File\] | mg_distribution_plot | Distribution of gap-compressed identity score per alignment | | +| Array\[String\] | stat_num_reads | Number of reads | | +| Array\[String\] | stat_read_length_mean | Mean read length | | +| Array\[String\] | stat_read_length_median | Median read length | | +| Array\[String\] | stat_read_quality_mean | Mean read quality | | +| Array\[String\] | stat_read_quality_median | Median read quality | | +| Array\[String\] | stat_mapped_read_count | Count of reads mapped to reference | | +| Array\[String\] | stat_mapped_percent | Percent of reads mapped to reference | | +| Array\[String\] | inferred_sex | Inferred sex | Sex is inferred based on relative depth of chrY alignments. | +| Array\[String\] | stat_mean_depth | Mean depth | | + +### Small Variants (<50 bp) + +| Type | Name | Description | Notes | +| ---- | ---- | ----------- | ----- | +| Array\[File\] | phased_small_variant_vcf | Phased small variant VCF | | +| Array\[File\] | phased_small_variant_vcf_index | | | +| Array\[File\] | small_variant_gvcf | Small variant GVCF | Can be used for joint-calling. | +| Array\[File\] | small_variant_gvcf_index | | | +| Array\[File\] | small_variant_stats | Small variant stats | Generated by `bcftools stats`. | +| Array\[String\] | stat_small_variant_SNV_count | SNV count | (PASS variants) | +| Array\[String\] | stat_small_variant_INDEL_count | INDEL count | (PASS variants) | +| Array\[String\] | stat_small_variant_TSTV_ratio | Ts/Tv ratio | (PASS variants) | +| Array\[String\] | stat_small_variant_HETHOM_ratio | Het/Hom ratio | (PASS variants) | +| Array\[File\] | snv_distribution_plot | Distribution of SNVs by REF, ALT | | +| Array\[File\] | indel_distribution_plot | Distribution of indels by size | | +| File? | joint_small_variants_vcf | Joint-called small variant VCF | | +| File? 
| joint_small_variants_vcf_index | | | + +### Structural Variants (≥50 bp) + +| Type | Name | Description | Notes | +| ---- | ---- | ----------- | ----- | +| Array\[File\] | phased_sv_vcf | Phased structural variant VCF | | +| Array\[File\] | phased_sv_vcf_index | Index for phased structural variant VCF | | +| Array\[String\] | stat_sv_DUP_count | Structural variant DUP count | (PASS variants) | +| Array\[String\] | stat_sv_DEL_count | Structural variant DEL count | (PASS variants) | +| Array\[String\] | stat_sv_INS_count | Structural variant INS count | (PASS variants) | +| Array\[String\] | stat_sv_INV_count | Structural variant INV count | (PASS variants) | +| Array\[String\] | stat_sv_BND_count | Structural variant BND count | (PASS variants) | +| Array\[File\] | bcftools_roh_out | ROH calling | `bcftools roh` | +| Array\[File\] | bcftools_roh_bed | Generated from above, without filtering | | +| File? | joint_sv_vcf | Joint-called structural variant VCF | | +| File? | joint_sv_vcf_index | | | + +### Copy Number Variants (≥100 kb) + +| Type | Name | Description | Notes | +| ---- | ---- | ----------- | ----- | +| Array\[File\] | cnv_vcf | CNV VCF | | +| Array\[File\] | cnv_vcf_index | Index for CNV VCF | | +| Array\[File\] | cnv_copynum_bedgraph | CNV copy number BEDGraph | | +| Array\[File\] | cnv_depth_bw | CNV depth BigWig | | +| Array\[File\] | cnv_maf_bw | CNV MAF BigWig | | +| Array\[String\] | stat_cnv_DUP_count | Count of DUP events | (for PASS variants) | +| Array\[String\] | stat_cnv_DEL_count | Count of DEL events | (PASS variants) | +| Array\[String\] | stat_cnv_DUP_sum | Sum of DUP bp | (PASS variants) | +| Array\[String\] | stat_cnv_DEL_sum | Sum of DEL bp | (PASS variants) | + +### Tandem Repeat Genotyping + +| Type | Name | Description | Notes | +| ---- | ---- | ----------- | ----- | +| Array\[File\] | phased_trgt_vcf | Phased TRGT VCF | | +| Array\[File\] | phased_trgt_vcf_index | | | +| Array\[File\] | trgt_spanning_reads | TRGT spanning reads | | +| Array\[File\] | trgt_spanning_reads_index | | | +| Array\[String\] | stat_trgt_genotyped_count | Count of genotyped sites | | +| Array\[String\] | stat_trgt_uncalled_count | Count of ungenotyped sites | | + +### Variant Phasing + +| Type | Name | Description | Notes | +| ---- | ---- | ----------- | ----- | +| Array\[File\] | phase_stats | Phasing stats | | +| Array\[File\] | phase_blocks | Phase blocks | | +| Array\[File\] | phase_haplotags | Per-read haplotag assignment | | +| Array\[String\] | stat_phased_basepairs | Count of bp within phase blocks | | +| Array\[String\] | stat_phase_block_ng50 | Phase block NG50 | | + +### Variant Calling in Dark Regions + +| Type | Name | Description | Notes | +| ---- | ---- | ----------- | ----- | +| Array\[File\] | paraphase_output_json | Paraphase output JSON | | +| Array\[File\] | paraphase_realigned_bam | Paraphase realigned BAM | | +| Array\[File\] | paraphase_realigned_bam_index | | | +| Array\[File?\] | paraphase_vcfs | Paraphase VCFs | Compressed as `.tar.gz` | + +### 5mCpG Methylation Calling + +| Type | Name | Description | Notes | +| ---- | ---- | ----------- | ----- | +| Array\[File\] | cpg_hap1_bed | CpG hap1 BED | | +| Array\[File\] | cpg_hap1_bed_index | | | +| Array\[File\] | cpg_hap2_bed | CpG hap2 BED | | +| Array\[File\] | cpg_hap2_bed_index | | | +| Array\[File\] | cpg_combined_bed | CpG combined BED | | +| Array\[File\] | cpg_combined_bed_index | | | +| Array\[File\] | cpg_hap1_bw | CpG hap1 BigWig | | +| Array\[File\] | cpg_hap2_bw | CpG hap2 BigWig | | +| 
Array\[File\] | cpg_combined_bw | CpG combined BigWig | | +| Array\[String\] | stat_cpg_hap1_count | Hap1 CpG count | | +| Array\[String\] | stat_cpg_hap2_count | Hap2 CpG count | | +| Array\[String\] | stat_cpg_combined_count | Combined CpG count | | + +### PGx Typing + +| Type | Name | Description | Notes | +| ---- | ---- | ----------- | ----- | +| Array\[File\] | pbstarphase_json | PBstarPhase JSON | Haplotype calls for PGx loci | +| Array\[File\] | pharmcat_match_json | PharmCAT match JSON | | +| Array\[File\] | pharmcat_phenotype_json | PharmCAT phenotype JSON | | +| Array\[File\] | pharmcat_report_html | PharmCAT report HTML | | +| Array\[File\] | pharmcat_report_json | PharmCAT report JSON | | + +### Tertiary Analysis + +| Type | Name | Description | Notes | +| ---- | ---- | ----------- | ----- | +| File? | pedigree | Pedigree file in PLINK PED [format](https://zzz.bwh.harvard.edu/plink/data.shtml#ped) | | +| File? | small_variant_filtered_vcf | Filtered, annotated small variant VCF | | +| File? | small_variant_filtered_vcf_index | | | +| File? | small_variant_filtered_tsv | Filtered, annotated small variant calls | | +| File? | small_variant_compound_het_vcf | Filtered, annotated compound heterozygous small variant VCF | | +| File? | small_variant_compound_het_vcf_index | | | +| File? | small_variant_compound_het_tsv | Filtered, annotated compound heterozygous small variant calls | | +| File? | sv_filtered_vcf | Filtered, annotated structural variant VCF | | +| File? | sv_filtered_vcf_index | | | +| File? | sv_filtered_tsv | Filtered, annotated structural variant TSV | | diff --git a/docs/gpu.md b/docs/gpu.md new file mode 100644 index 00000000..b9e7d20f --- /dev/null +++ b/docs/gpu.md @@ -0,0 +1,17 @@ +# GPU support + +Starting in workflow version 2.0.0, we have added support for running workflows on GPU-enabled nodes. The first task to take advantage of this is the [`deepvariant_call_variants` task](../blob/main/workflows/wdl-common/wdl/workflows/deepvariant/deepvariant.wdl) in the DeepVariant workflow, which can use 1 GPU. To run the DeepVariant workflow on a GPU-enabled node, you will need to provide some additional configuration in your inputs JSON file. + +| Type | Name | Description | Notes | +| ---- | ---- | ----------- | ----- | +| Boolean | gpu | Use GPUs. | default = `false` | +| String | gpuType | Type of GPU/Accelerator to use. | This will depend on your backend configuration. | + +## GPU Types + +| Backend | GPU Type | Notes | +| ------- | -------- | ----- | +| AWS-HealthOmics | `["nvidia-tesla-a10g", "nvidia-tesla-t4", "nvidia-tesla-t4-a10g"]` | [GPU availability varies by zone.](https://aws.amazon.com/ec2/instance-types) | +| Azure | | GPU support not yet implemented, but monitoring microsoft/ga4gh-tes#717. | +| GCP | `["nvidia-tesla-t4", "nvidia-tesla-v100"]` | [GPU availability varies by zone.](https://cloud.google.com/compute/docs/gpus/gpu-regions-zones) | +| HPC | | This will depend on HPC and miniwdl or Cromwell configuration. 
Reach out to [support@pacb.com](mailto:support@pacb.com?subject=WDL%20Workflows%20-%20GPU%20Support) | diff --git a/docs/pharmcat.md b/docs/pharmcat.md new file mode 100644 index 00000000..bcd11775 --- /dev/null +++ b/docs/pharmcat.md @@ -0,0 +1,10 @@ +# PharmCat subworkflow + +```mermaid +flowchart TD + phased_vcf[/"phased small variant VCF"/] --> preprocess["pharmcat preprocess"] + aBAM[/"haplotagged BAM"/] --> filter["filter preprocessed VCF"] + preprocess --> filter + filter --> pharmcat["PharmCat"] + pharmcat --> outputs[/"PharmCat outputs"/] +``` diff --git a/docs/ref_map.md b/docs/ref_map.md new file mode 100644 index 00000000..0d6c5e93 --- /dev/null +++ b/docs/ref_map.md @@ -0,0 +1,39 @@ +# Reference Map File Specification + +| Type | Key | Description | Notes | +| ---- | --- | ----------- | ----- | +| String | name | Short name for reference | Alphanumeric characters, underscores, and dashes only. Will be used in file names. | +| File | fasta | Reference genome FASTA | | +| File | fasta_index | Reference genome FASTA index | | +| File | pbsv_splits | Regions for pbsv parallelization | [below](#pbsv_splits) | +| File | pbsv_tandem_repeat_bed | Tandem Repeat BED used by PBSV to normalize SVs within TRs | [link](https://github.com/PacificBiosciences/pbsv/tree/master/annotations) | +| File | trgt_tandem_repeat_bed | Tandem Repeat catalog (BED) for TRGT genotyping | [link](https://github.com/PacificBiosciences/trgt/blob/main/docs/repeat_files.md) | +| File | hificnv_exclude_bed | Regions to be excluded by HIFICNV in gzipped BED format | [link](https://github.com/PacificBiosciences/HiFiCNV/blob/main/docs/aux_data.md) | +| File | hificnv_exclude_bed_index | BED index | [link](https://github.com/PacificBiosciences/HiFiCNV/blob/main/docs/aux_data.md) | +| File | hificnv_expected_bed_male | Expected allosome copy number BED for XY samples | [link](https://github.com/PacificBiosciences/HiFiCNV/blob/main/docs/aux_data.md) | +| File | hificnv_expected_bed_female | Expected allosome copy number BED for XX samples | [link](https://github.com/PacificBiosciences/HiFiCNV/blob/main/docs/aux_data.md) | +| File | pharmcat_positions_vcf | PharmCAT positions VCF | | +| File | pharmcat_positions_vcf_index | PharmCAT positions VCF index | | + +## pbsv_splits + +The `pbsv_splits` file is a JSON array of arrays of strings. Each inner array contains one or more chromosome names such that each inner array is of roughly equal size in base pairs. The inner arrays are processed in parallel. For example: + +```json +[ + ... + [ + "chr10", + "chr11" + ], + [ + "chr12", + "chr13" + ], + [ + "chr14", + "chr15" + ], + ... 
+] +``` diff --git a/docs/singleton.md b/docs/singleton.md new file mode 100644 index 00000000..9e627e45 --- /dev/null +++ b/docs/singleton.md @@ -0,0 +1,220 @@ +# singleton.wdl inputs and outputs + +- [singleton.wdl inputs and outputs](#singletonwdl-inputs-and-outputs) + - [DAG (simplified)](#dag-simplified) + - [Inputs](#inputs) + - [Outputs](#outputs) + - [Alignments, Coverage, and QC](#alignments-coverage-and-qc) + - [Small Variants (\<50 bp)](#small-variants-50-bp) + - [Structural Variants (≥50 bp)](#structural-variants-50-bp) + - [Copy Number Variants (≥100 kb)](#copy-number-variants-100-kb) + - [Tandem Repeat Genotyping](#tandem-repeat-genotyping) + - [Variant Phasing](#variant-phasing) + - [Variant Calling in Dark Regions](#variant-calling-in-dark-regions) + - [5mCpG Methylation Calling](#5mcpg-methylation-calling) + - [PGx Typing](#pgx-typing) + - [Tertiary Analysis](#tertiary-analysis) + +## DAG (simplified) + +```mermaid +--- +title: singleton.wdl +--- +flowchart TD + subgraph "`**Upstream of Phasing**`" + subgraph "per-movie" + ubam[/"HiFi uBAM"/] --> pbmm2_align["pbmm2 align"] + pbmm2_align --> pbsv_discover["PBSV discover"] + end + pbmm2_align --> merge_read_stats["merge read statistics"] + pbmm2_align --> samtools_merge["samtools merge"] + samtools_merge --> mosdepth["mosdepth"] + samtools_merge --> paraphase["Paraphase"] + samtools_merge --> hificnv["HiFiCNV"] + samtools_merge --> trgt["TRGT"] + samtools_merge --> trgt_dropouts["TR coverage dropouts"] + samtools_merge --> deepvariant["DeepVariant"] + pbsv_discover --> pbsv_call["PBSV call"] + end + subgraph "`**Phasing and Downstream**`" + deepvariant --> hiphase["HiPhase"] + trgt --> hiphase + pbsv_call --> hiphase + hiphase --> bcftools_roh["bcftools roh"] + hiphase --> bcftools_stats["bcftools stats\n(small variants)"] + hiphase --> sv_stats["SV stats"] + hiphase --> cpg_pileup["5mCpG pileup"] + hiphase --> starphase["StarPhase"] + hiphase --> pharmcat["PharmCat"] + starphase --> pharmcat + end + subgraph "`**Tertiary Analysis**`" + hiphase --> slivar_small_variants["slivar small variants"] + hiphase --> svpack["svpack filter and annotate"] + svpack --> slivar_svpack["slivar svpack tsv"] + end +``` + +## Inputs + +| Type | Name | Description | Notes | +| ---- | ---- | ----------- | ----- | +| String | sample_id | Unique identifier for the sample | Alphanumeric characters, periods, dashes, and underscores are allowed. | +| String? | sex | Sample sex
    `["MALE", "FEMALE"]` | Used by HiFiCNV and TRGT for genotyping. Allosome karyotype will default to XX unless sex is specified as `"MALE"`. | +| Array\[File\] | hifi_reads | Array of paths to HiFi reads in unaligned BAM format. | | +| File | [ref_map_file](./ref_map) | TSV containing reference genome file paths; must match backend | | +| String? | phenotypes | Comma-delimited list of HPO terms. | [Human Phenotype Ontology (HPO) phenotypes](https://hpo.jax.org/app/) associated with the cohort.

    If omitted, tertiary analysis will be skipped. | +| File? | [tertiary_map_file](./tertiary_map) | TSV containing tertiary analysis file paths and thresholds; must match backend | `AF`/`AC`/`nhomalt` thresholds can be modified, but this will affect performance.

    If omitted, tertiary analysis will be skipped. | +| Boolean | gpu | Use GPU when possible

    Default: `false` | [GPU support](./gpu#gpu-support) | +| String | backend | Backend where the workflow will be executed

    `["GCP", "Azure", "AWS-AGC", "AWS-HealthOmics", "HPC"]` | | +| String? | zones | Zones where compute will take place; required if backend is set to 'AWS' or 'GCP'. | [Determining available zones in GCP](./backends/gcp#determining-available-zones) | +| String? | gpuType | GPU type to use; required if gpu is set to `true` for cloud backends; must match backend | [Available GPU types](./gpu#gpu-types) | +| String? | container_registry | Container registry where workflow images are hosted.

    Default: `"quay.io/pacbio"` | If omitted, [PacBio's public Quay.io registry](https://quay.io/organization/pacbio) will be used.

    Custom container_registry must be set if backend is set to 'AWS-HealthOmics'. | +| Boolean | preemptible | Where possible, run tasks preemptibly

    `[true, false]`

    Default: `true` | If set to `true`, run tasks preemptibly where possible. If set to `false`, on-demand VMs will be used for every task. Ignored if backend is set to HPC. | + +## Outputs + +### Alignments, Coverage, and QC + +| Type | Name | Description | Notes | +| ---- | ---- | ----------- | ----- | +| String | workflow_name | Workflow name | | +| String | workflow_version | Workflow version | | +| File | stats_file | Table of summary statistics | | +| File | bam_stats | BAM stats | Per-read length and read-quality | +| File | read_length_plot | Read length plot | | +| File | read_quality_plot | Read quality plot | | +| File | merged_haplotagged_bam | Merged, haplotagged alignments | Includes unmapped reads | +| File | merged_haplotagged_bam_index | | | +| File | mosdepth_summary | Summary of aligned read depth. | | +| File | mosdepth_region_bed | Median aligned read depth by 500bp windows. | | +| File | mosdepth_region_bed_index | | | +| File | mosdepth_depth_distribution_plot | | | +| File | mapq_distribution_plot | Distribution of mapping quality per alignment | | +| File | mg_distribution_plot | Distribution of gap-compressed identity score per alignment | | +| String | stat_num_reads | Number of reads | | +| String | stat_read_length_mean | Mean read length | | +| String | stat_read_length_median | Median read length | | +| String | stat_read_quality_mean | Mean read quality | | +| String | stat_read_quality_median | Median read quality | | +| String | stat_mapped_read_count | Count of reads mapped to reference | | +| String | stat_mapped_percent | Percent of reads mapped to reference | | +| String | inferred_sex | Inferred sex | Sex is inferred based on relative depth of chrY alignments. | +| String | stat_mean_depth | Mean depth | | + +### Small Variants (<50 bp) + +| Type | Name | Description | Notes | +| ---- | ---- | ----------- | ----- | +| File | phased_small_variant_vcf | Phased small variant VCF | | +| File | phased_small_variant_vcf_index | | | +| File | small_variant_gvcf | Small variant GVCF | Can be used for joint-calling. | +| File | small_variant_gvcf_index | | | +| File | small_variant_stats | Small variant stats | Generated by `bcftools stats`. 
| +| String | stat_small_variant_SNV_count | SNV count | (PASS variants) | +| String | stat_small_variant_INDEL_count | INDEL count | (PASS variants) | +| String | stat_small_variant_TSTV_ratio | Ts/Tv ratio | (PASS variants) | +| String | stat_small_variant_HETHOM_ratio | Het/Hom ratio | (PASS variants) | +| File | snv_distribution_plot | Distribution of SNVs by REF, ALT | | +| File | indel_distribution_plot | Distribution of indels by size | | + +### Structural Variants (≥50 bp) + +| Type | Name | Description | Notes | +| ---- | ---- | ----------- | ----- | +| File | phased_sv_vcf | Phased structural variant VCF | | +| File | phased_sv_vcf_index | Index for phased structural variant VCF | | +| String | stat_sv_DUP_count | Structural variant DUP count | (PASS variants) | +| String | stat_sv_DEL_count | Structural variant DEL count | (PASS variants) | +| String | stat_sv_INS_count | Structural variant INS count | (PASS variants) | +| String | stat_sv_INV_count | Structural variant INV count | (PASS variants) | +| String | stat_sv_BND_count | Structural variant BND count | (PASS variants) | +| File | bcftools_roh_out | ROH calling | `bcftools roh` | +| File | bcftools_roh_bed | Generated from above, without filtering | | + +### Copy Number Variants (≥100 kb) + +| Type | Name | Description | Notes | +| ---- | ---- | ----------- | ----- | +| File | cnv_vcf | CNV VCF | | +| File | cnv_vcf_index | Index for CNV VCF | | +| File | cnv_copynum_bedgraph | CNV copy number BEDGraph | | +| File | cnv_depth_bw | CNV depth BigWig | | +| File | cnv_maf_bw | CNV MAF BigWig | | +| String | stat_cnv_DUP_count | Count of DUP events | (for PASS variants) | +| String | stat_cnv_DEL_count | Count of DEL events | (PASS variants) | +| String | stat_cnv_DUP_sum | Sum of DUP bp | (PASS variants) | +| String | stat_cnv_DEL_sum | Sum of DEL bp | (PASS variants) | + +### Tandem Repeat Genotyping + +| Type | Name | Description | Notes | +| ---- | ---- | ----------- | ----- | +| File | phased_trgt_vcf | Phased TRGT VCF | | +| File | phased_trgt_vcf_index | | | +| File | trgt_spanning_reads | TRGT spanning reads | | +| File | trgt_spanning_reads_index | | | +| String | stat_trgt_genotyped_count | Count of genotyped sites | | +| String | stat_trgt_uncalled_count | Count of ungenotyped sites | | + +### Variant Phasing + +| Type | Name | Description | Notes | +| ---- | ---- | ----------- | ----- | +| File | phase_stats | Phasing stats | | +| File | phase_blocks | Phase blocks | | +| File | phase_haplotags | Per-read haplotag assignment | | +| String | stat_phased_basepairs | Count of bp within phase blocks | | +| String | stat_phase_block_ng50 | Phase block NG50 | | + +### Variant Calling in Dark Regions + +| Type | Name | Description | Notes | +| ---- | ---- | ----------- | ----- | +| File | paraphase_output_json | Paraphase output JSON | | +| File | paraphase_realigned_bam | Paraphase realigned BAM | | +| File | paraphase_realigned_bam_index | | | +| File? 
| paraphase_vcfs | Paraphase VCFs | Compressed as `.tar.gz` | + +### 5mCpG Methylation Calling + +| Type | Name | Description | Notes | +| ---- | ---- | ----------- | ----- | +| File | cpg_hap1_bed | CpG hap1 BED | | +| File | cpg_hap1_bed_index | | | +| File | cpg_hap2_bed | CpG hap2 BED | | +| File | cpg_hap2_bed_index | | | +| File | cpg_combined_bed | CpG combined BED | | +| File | cpg_combined_bed_index | | | +| File | cpg_hap1_bw | CpG hap1 BigWig | | +| File | cpg_hap2_bw | CpG hap2 BigWig | | +| File | cpg_combined_bw | CpG combined BigWig | | +| String | stat_cpg_hap1_count | Hap1 CpG count | | +| String | stat_cpg_hap2_count | Hap2 CpG count | | +| String | stat_cpg_combined_count | Combined CpG count | | + +### PGx Typing + +| Type | Name | Description | Notes | +| ---- | ---- | ----------- | ----- | +| File | pbstarphase_json | PBstarPhase JSON | Haplotype calls for PGx loci | +| File | pharmcat_match_json | PharmCAT match JSON | | +| File | pharmcat_phenotype_json | PharmCAT phenotype JSON | | +| File | pharmcat_report_html | PharmCAT report HTML | | +| File | pharmcat_report_json | PharmCAT report JSON | | + +### Tertiary Analysis + +| Type | Name | Description | Notes | +| ---- | ---- | ----------- | ----- | +| File? | pedigree | Pedigree file in PLINK PED [format](https://zzz.bwh.harvard.edu/plink/data.shtml#ped) | | +| File? | small_variant_filtered_vcf | Filtered, annotated small variant VCF | | +| File? | small_variant_filtered_vcf_index | | | +| File? | small_variant_filtered_tsv | Filtered, annotated small variant calls | | +| File? | small_variant_compound_het_vcf | Filtered, annotated compound heterozygous small variant VCF | | +| File? | small_variant_compound_het_vcf_index | | | +| File? | small_variant_compound_het_tsv | Filtered, annotated compound heterozygous small variant calls | | +| File? | sv_filtered_vcf | Filtered, annotated structural variant VCF | | +| File? | sv_filtered_vcf_index | | | +| File? | sv_filtered_tsv | Filtered, annotated structural variant TSV | | diff --git a/docs/tertiary.md b/docs/tertiary.md new file mode 100644 index 00000000..b871789c --- /dev/null +++ b/docs/tertiary.md @@ -0,0 +1,65 @@ +# tertiary.wdl analysis workflow + +This is a simple, opinionated subworkflow for tertiary analysis in rare disease research. It starts with small variants and structural variants in VCF format, filters to remove variants that are common in the population, annotates with functional impact, and then prioritizes based on the predicted impact on the gene and the gene's relevance to the phenotype. It has been designed for ~30x WGS HiFi for the proband and ~10-30x WGS HiFi for parents (optional). + +## Inputs + +- Small variants and structural variants are provided to this workflow in VCF format. If multiple family members have been sequenced, they are provided as a single joint-called VCF per variant type per family. If only the proband has been sequenced, the VCFs are provided for the proband only. +- We generate a pedigree describing sample relationships and phenotype status, based on the input provided to the entrypoint workflow. In the case of a singleton, the pedigree is a single row. +- Using the comma-delimited list of HPO terms provided to the entrypoint workflow, we generate a Phenotype Rank (Phrank) lookup table, a simple two column lookup table mapping gene symbols to Phrank score. Phrank scores are positive real numbers (or null) such that higher scores indicate a gene is more likely to be relevant to the phenotypes. 
The Phrank lookup is used to prioritize variants based on the predicted impact on the gene and the gene's relevance to the phenotype. Phrank scores are not normalized, and providing more phenotypes for a sample will result in a higher maximum Phrank score.
+- Reference data is provided by the [`ref_map_file`](./ref_map) input. This workflow is currently only compatible with the GRCh38 human reference.
+- Population data, other supplemental data, and allele thresholds are provided by the [`tertiary_map_file`](./tertiary_map) input. We provide a version of this file that uses population data from [gnomAD v4.1](https://gnomad.broadinstitute.org/news/2024-05-gnomad-v4-1-updates/) and [CoLoRSdb](https://colorsdb.org) v1.0.0 ![10.5281/zenodo.13145123](https://zenodo.org/badge/DOI/10.5281/zenodo.13145123.svg "10.5281/zenodo.13145123"). We provide the ability to tweak the allele thresholds, but the default values are recommended, as increasing them will result in much higher resource usage.
+
+## Process
+
+### Small variants
+
+We use [`slivar`](https://github.com/brentp/slivar) and [`bcftools csq`](https://samtools.github.io/bcftools/howtos/csq-calling.html) to filter and annotate small variants, and to identify potential compound heterozygous ("comphet") candidate pairs. Slivar uses variant annotations stored in "gnotate" databases. We use the following steps (n.b., some steps are performed within the same command); a sketch of how the population thresholds combine appears at the end of this subsection.
+
+1. Ignore variants with a non-passing `FILTER` value.
+2. Ignore variants that are present at > 3% (`slivar_max_af`) in any of the population datasets.
+3. Ignore variants with more than 4 homozygous alternate ("homalt") calls (`slivar_max_nhomalt`) in any of the population datasets. For the purposes of this tool, we count hemizygous ("hemialt") calls on the X chromosome as homalt.
+4. To be tagged as a potential "dominant" variant, the site must be high quality[^1] in all relevant samples, present as homref in all unaffected samples, present as homalt or hetalt in all affected samples, and have allele count < 4 (`slivar_max_ac`) in the population datasets.
+5. To be tagged as a potential "recessive" variant, the site must be high quality[^1] in all relevant samples, present as homalt or hemi in all affected samples, and present as homref or hetalt in all unaffected samples.
+6. To be tagged for comphet analysis, the site must have GQ > 5 (`slivar_min_gq`) and be present as hetalt in all affected samples.
+7. All remaining "tagged" variants are annotated with predicted impact using the Ensembl GFF3 gene set and `bcftools csq`. This annotated VCF is provided for downstream analysis.
+8. All variants considered for comphet analysis with high potential impacts[^2] are considered in pairs. If a pair of variants is shown to be _in cis_ according to HiPhase phasing, it is rejected. The passing pairs are stored in a second VCF for downstream analysis.
+
+We use [`slivar tsv`](https://github.com/brentp/slivar/wiki/tsv:-creating-a-spreadsheet-from-a-filtered-VCF) to produce TSVs from the VCFs generated above. These TSVs have many of the relevant fields from the VCF, as well as:
+
+- ClinVar annotations for the gene
+- gnomAD [loss-of-function tolerance metrics](https://gnomad.broadinstitute.org/downloads#v2-lof-curation-results)
+- Phrank scores for the gene
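+
+The exact expressions live in the workflow's WDL tasks, but as an illustration (a hypothetical sketch, not the shipped command), the population-level filters in steps 2-4 combine gnotate-derived INFO fields roughly like this, assuming gnotate prefixes of `gnomad` and `colorsdb` (the actual prefixes come from `slivar_gnotate_prefixes` in the tertiary map file):
+
+```text
+// keep a small variant only if it is rare in every population dataset
+INFO.gnomad_af <= 0.03 && INFO.colorsdb_af <= 0.03            // slivar_max_af
+  && INFO.gnomad_nhomalt <= 4 && INFO.colorsdb_nhomalt <= 4   // slivar_max_nhomalt
+// "dominant" candidates additionally require a low allele count
+  && INFO.gnomad_ac < 4 && INFO.colorsdb_ac < 4               // slivar_max_ac
+```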
+
+### Structural variants
+
+We use [`svpack`](https://github.com/PacificBiosciences/svpack) to filter and annotate SVs, with the following steps.
+
+1. Remove variants with a non-passing `FILTER` value.
+2. Remove variants < 50 bp.
+3. Remove variants that match any existing variant in: gnomAD v4.1 (n=) or CoLoRSdb (n=). In this case, "match" means that the variant is the same type, the difference in position is <= 100 bp, and the difference in size is <= 100 bp.
+4. Annotate `INFO/BCSQ` with predicted impact using the Ensembl GFF3 gene set.
+5. Annotate `INFO/homalt` and `INFO/hetalt` with the names of samples in this cohort that have the variant in homozygous or heterozygous form, respectively.
+
+We use [`slivar tsv`](https://github.com/brentp/slivar/wiki/tsv:-creating-a-spreadsheet-from-a-filtered-VCF) to produce a TSV of structural variants that impact genes in affected samples. This TSV has many of the relevant fields from the VCF, as well as:
+
+- ClinVar annotations for the gene
+- gnomAD [loss-of-function tolerance metrics](https://gnomad.broadinstitute.org/downloads#v2-lof-curation-results)
+- Phrank scores for the gene
+
+[^1]: High quality is defined as:
+    GQ >= 20 (GQ >= 10 for males on chrX)
+    DP >= 6
+    0.2 <= hetalt AB <= 0.8
+    homref AB < 0.02
+    homalt AB > 0.98
+[^2]: For more description of considered impacts, see the [`slivar` documentation](https://github.com/brentp/slivar/wiki/compound-heterozygotes). We alter the default "skip" list to:
+    non_coding_transcript
+    intron
+    non_coding
+    upstream_gene
+    downstream_gene
+    non_coding_transcript_exon
+    NMD_transcript
+    5_prime_UTR
+    3_prime_UTR
diff --git a/docs/tertiary_map.md b/docs/tertiary_map.md
new file mode 100644
index 00000000..bad97fcf
--- /dev/null
+++ b/docs/tertiary_map.md
@@ -0,0 +1,19 @@
+# Tertiary Map File Specification
+
+| Type | Key | Description | Notes |
+| ---- | --- | ----------- | ----- |
+| File | slivar_js | slivar functions | [link](https://raw.githubusercontent.com/brentp/slivar/91a40d582805d6607fa8a76a8fce15fd2e4be3b8/js/slivar-functions.js) |
+| File | ensembl_gff | [Ensembl](https://useast.ensembl.org/index.html) GFF3 reference annotation | |
+| File | lof_lookup | Path to table of loss-of-function scores per gene | |
+| File | clinvar_lookup | Path to table of ClinVar annotations per gene | |
+| File | slivar_gnotate_files | Comma-delimited array of population dataset allele frequencies in [`slivar gnotate`](https://github.com/brentp/slivar/wiki/gnotate) format | |
+| String | slivar_gnotate_prefixes | Comma-delimited array of prefixes to `_af`, `_nhomalt`, and `_ac` in `slivar_gnotate_files` | See the example at the end of this document. |
+| String (Float) [^1] | slivar_max_af | Maximum allele frequency within population for small variants | |
+| String (Int) [^2] | slivar_max_nhomalt | Maximum number of homozygous alternate alleles within population for small variants | |
+| String (Int) [^2] | slivar_max_ac | Maximum allele count within population for small variants | |
+| String (Int) [^2] | slivar_min_gq | Minimum genotype quality for small variants to be considered for compound heterozygous pairs | |
+| String | svpack_pop_vcfs | Comma-delimited array of structural variant population VCF paths | |
+| String | svpack_pop_vcf_indices | Comma-delimited array of structural variant population VCF index paths | |
+
+[^1]: Technically this value is interpreted as a String by WDL, but slivar expects a Float, e.g., `0.03`.
+[^2]: Technically these values are interpreted as Strings by WDL, but slivar expects an Int.
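+
+For example, a hypothetical pair of gnotate entries might look like the following (keys and values are tab-separated; paths and prefixes are illustrative only, so use the values from the tertiary map template distributed with the workflow resources):
+
+```text
+slivar_gnotate_files	/path/to/gnomad.v4.1.gnotate.zip,/path/to/colorsdb.v1.0.0.gnotate.zip
+slivar_gnotate_prefixes	gnomad,colorsdb
+```
+
+With these prefixes, the slivar filtering expressions reference annotations named `gnomad_af`, `gnomad_nhomalt`, and `gnomad_ac` (and likewise for `colorsdb`).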
diff --git a/docs/tools_containers.md b/docs/tools_containers.md
new file mode 100644
index 00000000..ec849521
--- /dev/null
+++ b/docs/tools_containers.md
@@ -0,0 +1,29 @@
+# Tool versions and Containers
+
+Containers are used to package tools and their dependencies, which makes analyses reproducible and lets the tools run on any system that supports a container runtime. Our containers are built using [Docker](https://www.docker.com/) and are compatible with any container runtime that supports the OCI Image Specification, like [Singularity](https://sylabs.io/singularity/) or [Podman](https://podman.io/).
+
+Most of our containers are built on the `pb_wdl_base` container, which includes common bioinformatics tools and libraries. We tag our containers with a version number and build count, but within the WDL files containers are referenced by their sha256 digests for reproducibility and better compatibility with Cromwell and miniwdl call caching.
+
+Our Dockerfiles can be inspected on GitHub, and the containers can be pulled from our [Quay.io organization](https://quay.io/pacbio).
+
+We directly use the `deepvariant`, `deepvariant-gpu`, `pharmcat`, and `glnexus` containers from their respective authors, although we have mirrored some of them for better compatibility with Cromwell call caching.
+
+| Container | Major tool versions | Dockerfile | Image |
+| --------: | ------------------- | :---: | :---: |
+| pb_wdl_base |
    • htslib 1.20
    • bcftools 1.20
    • samtools 1.20
    • bedtools 2.31.0
    • python3.9
    • numpy 1.24.24
    • pandas 2.0.3
    • matplotlib 3.7.5
    • seaborn 0.13.2
    • pysam 0.22.1
    • vcfpy 0.13.8
    • biopython 1.83
    | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/6b13cc246dd44e41903d17a660bb5432cdd18dbe/docker/pb_wdl_base) | [sha256:4b889a1f21a6a7fecf18820613cf610103966a93218de772caba126ab70a8e87](https://quay.io/pacbio/pb_wdl_base/manifest/pb_wdl_base@sha256:4b889a1f21a6a7fecf18820613cf610103966a93218de772caba126ab70a8e87) | +| pbmm2 |
    • pbmm2 1.16.0
    | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/44df87558e18ce9d3b65f3ede9c7ba1513669ccb/docker/pbmm2) | [pbmm2@sha256:24218cb5cbc68d1fd64db14a9dc38263d3d931c74aca872c998d12ef43020ef0](https://quay.io/pacbio/pbmm2/manifest/sha256:24218cb5cbc68d1fd64db14a9dc38263d3d931c74aca872c998d12ef43020ef0) | +| mosdepth |
    • mosdepth 0.3.9
    | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/fa84fbf582738c05c750e667ff43d11552ad4183/docker/mosdepth) | [mosdepth@sha256:63f7a5d1a4a17b71e66d755d3301a951e50f6b63777d34dab3ee9e182fd7acb1](https://quay.io/pacbio/mosdepth/manifest/sha256:63f7a5d1a4a17b71e66d755d3301a951e50f6b63777d34dab3ee9e182fd7acb1) | +| pbsv |
    • pbsv 2.10.0
    | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/e82dddf32b042e985a5d66d0ebe25ca57058e61c/docker/pbsv) | [pbsv@sha256:3a8529853c1e214809dcdaacac0079de70d0c037b41b43bb8ba7c3fc5f783e26](https://quay.io/pacbio/pbsv/manifest/sha256:3a8529853c1e214809dcdaacac0079de70d0c037b41b43bb8ba7c3fc5f783e26) | +| trgt |
    • trgt 1.2.0
    • `/opt/scripts/check_trgt_coverage.py` 0.1.0
    | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/ed658e93fc51229f20415e0784dc242a8e4ef66a/docker/trgt) | [trgt@sha256:0284ff5756f8d47d9d81b515b8b1a6c81fac862ae5a7b4fe89f65235c3e5e0c9](https://quay.io/pacbio/trgt/manifest/sha256:0284ff5756f8d47d9d81b515b8b1a6c81fac862ae5a7b4fe89f65235c3e5e0c9) | +| hiphase |
    • hiphase 1.4.5
    | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/1051d12818e165a2145526e0b58f0ed0d0dc023a/docker/hiphase) | [hiphase@sha256:47fe7d42aea6b1b2e6d3c7401bc35a184464c3f647473d0525c00f3c968b40ad](https://quay.io/pacbio/hiphase/manifest/sha256:47fe7d42aea6b1b2e6d3c7401bc35a184464c3f647473d0525c00f3c968b40ad) | +| hificnv |
    • hificnv 1.0.1
    | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/a58f8b44cf8fd09c39c90e07076dbb418188084d/docker/hificnv) | [hificnv@sha256:c4764a70c8c2028edb1cdb4352997269947c5076ddd1aeaeef6c5076c630304d](https://quay.io/pacbio/hificnv/manifest/sha256:c4764a70c8c2028edb1cdb4352997269947c5076ddd1aeaeef6c5076c630304d) | +| paraphase |
    • paraphase 3.1.1
    • minimap2 2.28
    | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/6b13cc246dd44e41903d17a660bb5432cdd18dbe/docker/paraphase) | [paraphase@sha256:a114ac5b9a682d7dc0fdf25c92cfb36f80c07ab4f1fb76b2e58092521b123a4d](https://quay.io/pacbio/paraphase/manifest/sha256:a114ac5b9a682d7dc0fdf25c92cfb36f80c07ab4f1fb76b2e58092521b123a4d) | +| pbstarphase |
    • pbstarphase 1.0.0
    • Database 20240826
    | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/813c7dc3143b91c34754d768c3e27a46355bb3e5/docker/pbstarphase) | [pbstarphase@sha256:6954d6f7e462c9cec7aaf7ebb66efaf13d448239aab76a3c947c1dfe24859686](https://quay.io/pacbio/pbstarphase/manifest/sha256:6954d6f7e462c9cec7aaf7ebb66efaf13d448239aab76a3c947c1dfe24859686) | +| pb-cpg-tools |
    • pb-cpg-tools 2.3.2
    | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/6b13cc246dd44e41903d17a660bb5432cdd18dbe/docker/pb-cpg-tools) | [pb-cpg-tools@sha256:d6e63fe3f6855cfe60f573de1ca85fab27f4a68e24a7f5691a7a805a22af292d](https://quay.io/pacbio/pb-cpg-tools/manifest/sha256:d6e63fe3f6855cfe60f573de1ca85fab27f4a68e24a7f5691a7a805a22af292d) | +| wgs_tertiary |
    • `/opt/scripts/calculate_phrank.py` 2.0.0
    • `/opt/scripts/json2ped.py` 0.2.0
    Last built 2021-09-17:
    • ensembl -> HGNC
    • ensembl -> HPO
    • HGNC -> inheritance
    • HPO DAG
    | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/5702b8ca8adbb4545b815373673079356df533e4/docker/wgs_tertiary) | [wgs_tertiary@sha256:128086b938d2602c06f4e5f88a8b7ead70933e3a43237e49cd505d141bb31785](https://quay.io/pacbio/wgs_tertiary/manifest/sha256:128086b938d2602c06f4e5f88a8b7ead70933e3a43237e49cd505d141bb31785) | +| slivar |
    • slivar 0.3.1
    • `/opt/scripts/add_comphet_phase.py` 0.1.0
    | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/5e1094fd6755203b4971fdac6dcb951bbc098bed/docker/slivar) | [slivar@sha256:35be557730d3ac9e883f1c2010fb24ac02631922f9b4948b0608d3e643a46e8b](https://quay.io/pacbio/slivar/manifest/sha256:35be557730d3ac9e883f1c2010fb24ac02631922f9b4948b0608d3e643a46e8b) | +| svpack |
    • svpack 54b54db
    | [Dockerfile](https://github.com/PacificBiosciences/wdl-dockerfiles/tree/6fc750b0c65b4a5c1eb65791eab9eed89864d858/docker/svpack) | [svpack@sha256:628e9851e425ed8044a907d33de04043d1ef02d4d2b2667cf2e9a389bb011eba](https://quay.io/pacbio/svpack/manifest/sha256:628e9851e425ed8044a907d33de04043d1ef02d4d2b2667cf2e9a389bb011eba) | +| deepvariant |
    • DeepVariant 1.6.1
    | | [deepvariant:1.6.1](https://hub.docker.com/layers/google/deepvariant/1.6.1/images/sha256-ccab95548e6c3ec28c75232987f31209ff1392027d67732435ce1ba3d0b55c68) | +| deepvariant-gpu |
    • DeepVariant 1.6.1
    | | [deepvariant:1.6.1-gpu](https://hub.docker.com/layers/google/deepvariant/1.6.1-gpu/images/sha256-7929c55106d3739daa18d52802913c43af4ca2879db29656056f59005d1d46cb) | +| pharmcat |
    • PharmCAT 2.15.4
    | | [pharmcat:2.15.4](https://hub.docker.com/layers/pgkb/pharmcat/2.15.4/images/sha256-5b58ae959b4cd85986546c2d67e3596f33097dedc40dfe57dd845b6e78781eb6) | +| glnexus |
    • GLnexus 1.4.3
    | | [glnexus:1.4.3](https://quay.io/pacbio/glnexus/manifest/sha256:ce6fecf59dddc6089a8100b31c29c1e6ed50a0cf123da9f2bc589ee4b0c69c8e) | diff --git a/image_manifest.txt b/image_manifest.txt new file mode 100644 index 00000000..24b0f0ad --- /dev/null +++ b/image_manifest.txt @@ -0,0 +1,17 @@ +quay.io/pacbio/glnexus@sha256:ce6fecf59dddc6089a8100b31c29c1e6ed50a0cf123da9f2bc589ee4b0c69c8e +quay.io/pacbio/hificnv@sha256:c4764a70c8c2028edb1cdb4352997269947c5076ddd1aeaeef6c5076c630304d +quay.io/pacbio/hiphase@sha256:47fe7d42aea6b1b2e6d3c7401bc35a184464c3f647473d0525c00f3c968b40ad +quay.io/pacbio/mosdepth@sha256:63f7a5d1a4a17b71e66d755d3301a951e50f6b63777d34dab3ee9e182fd7acb1 +quay.io/pacbio/paraphase@sha256:a114ac5b9a682d7dc0fdf25c92cfb36f80c07ab4f1fb76b2e58092521b123a4d +quay.io/pacbio/pb-cpg-tools@sha256:d6e63fe3f6855cfe60f573de1ca85fab27f4a68e24a7f5691a7a805a22af292d +quay.io/pacbio/pbmm2@sha256:24218cb5cbc68d1fd64db14a9dc38263d3d931c74aca872c998d12ef43020ef0 +quay.io/pacbio/pbstarphase@sha256:6954d6f7e462c9cec7aaf7ebb66efaf13d448239aab76a3c947c1dfe24859686 +quay.io/pacbio/pbsv@sha256:3a8529853c1e214809dcdaacac0079de70d0c037b41b43bb8ba7c3fc5f783e26 +quay.io/pacbio/pb_wdl_base@sha256:4b889a1f21a6a7fecf18820613cf610103966a93218de772caba126ab70a8e87 +quay.io/pacbio/slivar@sha256:f71a27f756e2d69ec30949cbea97c54abbafde757562a98ef965f21a28aa8eaa +quay.io/pacbio/svpack@sha256:628e9851e425ed8044a907d33de04043d1ef02d4d2b2667cf2e9a389bb011eba +quay.io/pacbio/trgt@sha256:0284ff5756f8d47d9d81b515b8b1a6c81fac862ae5a7b4fe89f65235c3e5e0c9 +quay.io/pacbio/wgs_tertiary@sha256:128086b938d2602c06f4e5f88a8b7ead70933e3a43237e49cd505d141bb31785 +google/deepvariant:1.6.1 +google/deepvariant:1.6.1-gpu +pgkb/pharmcat:2.15.4 diff --git a/images/main.graphviz.svg b/images/main.graphviz.svg deleted file mode 100644 index 7df5907d..00000000 --- a/images/main.graphviz.svg +++ /dev/null @@ -1,136 +0,0 @@ - - - - - - -%3 - - -cluster-scatter-L40C2-sample - -scatter(cohort.samples) - - -cluster-if-L51C2 - -if(length(cohort.samples) > 1) - - -cluster-if-L63C2 - -if(run_tertiary_analysis) - - - -call-backend_configuration - -backend_configuration - - - -decl-default_runtime_attributes -default_runtime_attributes - - - -call-backend_configuration->decl-default_runtime_attributes - - - - - -call-sample_analysis - -sample_analysis - - - -decl-default_runtime_attributes->call-sample_analysis - - - - - -call-cohort_analysis - -cohort_analysis - - - -decl-default_runtime_attributes->call-cohort_analysis - - - - - -call-tertiary_analysis - -tertiary_analysis - - - -decl-default_runtime_attributes->call-tertiary_analysis - - - - - -call-sample_analysis->call-cohort_analysis - - - - - -decl-slivar_small_variant_input_vcf -slivar_small_variant_input_vcf - - - -call-sample_analysis->decl-slivar_small_variant_input_vcf - - - - - -decl-slivar_sv_input_vcf -slivar_sv_input_vcf - - - -call-sample_analysis->decl-slivar_sv_input_vcf - - - - - - -call-cohort_analysis->decl-slivar_small_variant_input_vcf - - - - - -call-cohort_analysis->decl-slivar_sv_input_vcf - - - - - - -decl-slivar_small_variant_input_vcf->call-tertiary_analysis - - - - - -decl-slivar_sv_input_vcf->call-tertiary_analysis - - - - - - diff --git a/scripts/create_image_manifest.sh b/scripts/create_image_manifest.sh new file mode 100644 index 00000000..a1704f82 --- /dev/null +++ b/scripts/create_image_manifest.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash +set -e + +# This script generates a manifest file that lists all the docker images used in the WDL files. 
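+# The pipeline below scrapes container references that are pinned by sha256 digest out of the
+# WDL sources, rewrites the ~{runtime_attributes.container_registry} placeholder to quay.io/pacbio,
+# de-duplicates the list, and then appends the DeepVariant and PharmCAT images, which are
+# referenced by version tag rather than by digest.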
+ +grep '@sha' -h -r workflows/ \ +| tr --squeeze-repeats ' ' \ +| cut --fields=3 --delimiter=' ' \ +| sed 's!~{runtime_attributes.container_registry}!quay.io/pacbio!;s/"//g;' \ +| sort --unique \ +> ./image_manifest.txt + +deepvariant_version=$(grep -m1 'String deepvariant_version' workflows/singleton.wdl | tr -s ' ' | cut -f5 -d' ' | sed 's/"//g') +echo "google/deepvariant:${deepvariant_version}" >> ./image_manifest.txt +echo "google/deepvariant:${deepvariant_version}-gpu" >> ./image_manifest.txt + +pharmcat_version=$(grep -m1 'String pharmcat_version' workflows/singleton.wdl | tr -s ' ' | cut -f5 -d' ' | sed 's/"//g') +echo "pgkb/pharmcat:${pharmcat_version}" >> ./image_manifest.txt \ No newline at end of file diff --git a/wdl-ci.config.json b/wdl-ci.config.json index e54beb90..8fe0c871 100644 --- a/wdl-ci.config.json +++ b/wdl-ci.config.json @@ -25,7 +25,7 @@ "tasks": { "pbmm2_align": { "key": "pbmm2_align", - "digest": "3r4icze5zkps7m6xoruzvnfzk2fp4gqd", + "digest": "ntqgonovwrgxezaewjfw7oec4t34yvhp", "tests": [ { "inputs": { @@ -142,7 +142,7 @@ }, "trgt": { "key": "trgt", - "digest": "ylzep5nroxhzjff43gkc6fs25ydor7dd", + "digest": "fyt3gqmt5tfykbls33kbr62nw3d4rvhj", "tests": [ { "inputs": { @@ -178,7 +178,7 @@ }, "coverage_dropouts": { "key": "coverage_dropouts", - "digest": "3el45hg36hlyx5cswr3dkvqfg644cvbn", + "digest": "iac5c3nzugilbarb4xsat37n2j5hjov4", "tests": [ { "inputs": { @@ -245,7 +245,7 @@ }, "paraphase": { "key": "paraphase", - "digest": "gzktyxvdrw73el5khnudlpu23x34lbxv", + "digest": "xme5pugrmhvnys5vas6jfagm65eac4hz", "tests": [ { "inputs": { @@ -570,7 +570,7 @@ "tasks": { "deepvariant_make_examples": { "key": "deepvariant_make_examples", - "digest": "35kzpf37semcoxs7frzvhjrc4zvwoyan", + "digest": "3jfeho4suf23enopqj23cx6ygviftqny", "tests": [ { "inputs": { @@ -598,7 +598,7 @@ "task_start_index": 0, "tasks_per_shard": 8, "total_deepvariant_tasks": 64, - "deepvariant_version": "1.5.0", + "deepvariant_version": "1.6.0", "runtime_attributes": "${default_runtime_attributes}" }, "output_tests": { @@ -620,7 +620,7 @@ }, "deepvariant_call_variants": { "key": "deepvariant_call_variants", - "digest": "a6ksi3haiz5pye7p64c67zeeauit7gqf", + "digest": "rywffaewvhwakdysxejsrqbz7bqwzt2o", "tests": [ { "inputs": { @@ -637,12 +637,12 @@ "${resources_file_path}/deepvariant/${sample_id}.56.example_tfrecords.tar.gz" ], "total_deepvariant_tasks": 64, - "deepvariant_version": "1.5.0", + "deepvariant_version": "1.6.0", "runtime_attributes": "${default_runtime_attributes}" }, "output_tests": { - "tfrecord": { - "value": "${resources_file_path}/deepvariant/${sample_id}.GRCh38.call_variants_output.tfrecord.gz", + "tfrecords_tar": { + "value": "${resources_file_path}/deepvariant/${sample_id}.GRCh38.call_variants_output.tar.gz", "test_tasks": [ "compare_file_basename", "check_gzip" @@ -654,12 +654,12 @@ }, "deepvariant_postprocess_variants": { "key": "deepvariant_postprocess_variants", - "digest": "afmkoy2hy4lcyolms7n6lgzuzudr7uxx", + "digest": "ey7zqpajaeesvsg372rehhjmkpqld2qx", "tests": [ { "inputs": { "sample_id": "${sample_id}", - "tfrecord": "${resources_file_path}/deepvariant/${sample_id}.GRCh38.call_variants_output.tfrecord.gz", + "tfrecords_tar": "${resources_file_path}/deepvariant/${sample_id}.GRCh38.call_variants_output.tar.gz", "nonvariant_site_tfrecord_tars": [ "${resources_file_path}/deepvariant/${sample_id}.0.nonvariant_site_tfrecords.tar.gz", "${resources_file_path}/deepvariant/${sample_id}.8.nonvariant_site_tfrecords.tar.gz", @@ -674,7 +674,7 @@ "reference_index": 
"${datasets_file_path}/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai", "reference_name": "GRCh38", "total_deepvariant_tasks": 64, - "deepvariant_version": "1.5.0", + "deepvariant_version": "1.6.0", "runtime_attributes": "${default_runtime_attributes}" }, "output_tests": { diff --git a/workflows/cohort_analysis/cohort_analysis.wdl b/workflows/cohort_analysis/cohort_analysis.wdl deleted file mode 100644 index 8a9970b5..00000000 --- a/workflows/cohort_analysis/cohort_analysis.wdl +++ /dev/null @@ -1,112 +0,0 @@ -version 1.0 - -# Run joint genotyping for a cohort. This workflow will be run if there is more than one sample in the cohort. - -import "../humanwgs_structs.wdl" -import "../wdl-common/wdl/tasks/pbsv_call.wdl" as PbsvCall -import "../wdl-common/wdl/tasks/concat_vcf.wdl" as ConcatVcf -import "../wdl-common/wdl/tasks/glnexus.wdl" as Glnexus -import "../wdl-common/wdl/workflows/hiphase/hiphase.wdl" as HiPhase - -workflow cohort_analysis { - input { - String cohort_id - Array[String] sample_ids - Array[IndexData] aligned_bams - Array[File] svsigs - Array[IndexData] gvcfs - - ReferenceData reference - - Int? pbsv_call_mem_gb - Int? glnexus_mem_gb - - RuntimeAttributes default_runtime_attributes - } - - Int sample_count = length(sample_ids) - Array[Array[String]] pbsv_splits = read_json(reference.pbsv_splits) - - scatter (gvcf_object in gvcfs) { - File gvcf = gvcf_object.data - File gvcf_index = gvcf_object.data_index - } - - scatter (shard_index in range(length(pbsv_splits))) { - Array[String] region_set = pbsv_splits[shard_index] - - call PbsvCall.pbsv_call { - input: - sample_id = cohort_id + ".joint", - svsigs = svsigs, - sample_count = sample_count, - reference = reference.fasta.data, - reference_index = reference.fasta.data_index, - reference_name = reference.name, - shard_index = shard_index, - regions = region_set, - mem_gb = pbsv_call_mem_gb, - runtime_attributes = default_runtime_attributes - } - } - - call ConcatVcf.concat_vcf { - input: - vcfs = pbsv_call.pbsv_vcf, - vcf_indices = pbsv_call.pbsv_vcf_index, - output_vcf_name = "~{cohort_id}.joint.~{reference.name}.pbsv.vcf.gz", - runtime_attributes = default_runtime_attributes - } - - IndexData zipped_pbsv_vcf = { - "data": concat_vcf.concatenated_vcf, - "data_index": concat_vcf.concatenated_vcf_index - } - - call Glnexus.glnexus { - input: - cohort_id = cohort_id + ".joint", - gvcfs = gvcf, - gvcf_indices = gvcf_index, - reference_name = reference.name, - mem_gb = glnexus_mem_gb, - runtime_attributes = default_runtime_attributes - } - - IndexData glnexus_vcf = { - "data": glnexus.vcf, - "data_index": glnexus.vcf_index - } - - call HiPhase.hiphase { - # VCF order: small variants, SVs - input: - id = cohort_id + ".joint", - refname = reference.name, - sample_ids = sample_ids, - vcfs = [glnexus_vcf, zipped_pbsv_vcf], - bams = aligned_bams, - haplotag = false, - reference_fasta = reference.fasta, - default_runtime_attributes = default_runtime_attributes - } - - output { - IndexData phased_joint_small_variant_vcf = hiphase.phased_vcfs[0] - IndexData phased_joint_sv_vcf = hiphase.phased_vcfs[1] - File hiphase_stats = hiphase.hiphase_stats - File hiphase_blocks = hiphase.hiphase_blocks - } - - parameter_meta { - cohort_id: {help: "Cohort ID; used for naming files"} - sample_ids: {help: "Sample IDs for all samples in the cohort"} - aligned_bams: {help: "BAM and index aligned to the reference genome for each movie associated with all samples in the cohort"} - svsigs: {help: "pbsv svsig files for each sample and movie BAM in the 
cohort"} - gvcfs: {help: "gVCF for each sample in the cohort"} - reference: {help: "Reference genome data"} - pbsv_call_mem_gb: {help: "Optional amount of RAM in GB for pbsv_call; default 64 for cohorts N<=3, 96 for cohorts N>3"} - glnexus_mem_gb: {help: "Optional amount of RAM in GB for glnexus; default 30"} - default_runtime_attributes: {help: "Default RuntimeAttributes; spot if preemptible was set to true, otherwise on_demand"} - } -} diff --git a/workflows/cohort_analysis/inputs.json b/workflows/cohort_analysis/inputs.json deleted file mode 100644 index fab5ca69..00000000 --- a/workflows/cohort_analysis/inputs.json +++ /dev/null @@ -1,50 +0,0 @@ -{ - "cohort_analysis.cohort_id": "String", - "cohort_analysis.sample_ids": [ - "String" - ], - "cohort_analysis.aligned_bams": [ - { - "data": "File", - "data_index": "File" - } - ], - "cohort_analysis.svsigs": [ - "File" - ], - "cohort_analysis.gvcfs": [ - { - "data": "File", - "data_index": "File" - } - ], - "cohort_analysis.reference": { - "name": "String", - "fasta": { - "data": "File", - "data_index": "File" - }, - "tandem_repeat_bed": "File", - "trgt_tandem_repeat_bed": "File", - "hificnv_exclude_bed": { - "data": "File", - "data_index": "File" - }, - "pbsv_splits": "File", - "hificnv_expected_bed_male": "File", - "hificnv_expected_bed_female": "File", - "gnomad_af": "File? (optional)", - "hprc_af": "File? (optional)", - "gff": "File? (optional)", - "population_vcfs": "Array[WomCompositeType {\n data -> File\ndata_index -> File \n}]? (optional)" - }, - "cohort_analysis.glnexus_mem_gb": "Int? (optional)", - "cohort_analysis.pbsv_call_mem_gb": "Int? (optional)", - "cohort_analysis.default_runtime_attributes": { - "preemptible_tries": "Int", - "max_retries": "Int", - "zones": "String", - "queue_arn": "String", - "container_registry": "String" - } -} diff --git a/workflows/downstream/downstream.wdl b/workflows/downstream/downstream.wdl new file mode 100644 index 00000000..031ca870 --- /dev/null +++ b/workflows/downstream/downstream.wdl @@ -0,0 +1,223 @@ +version 1.0 + +import "../wdl-common/wdl/structs.wdl" +import "../wdl-common/wdl/tasks/hiphase.wdl" as Hiphase +import "../wdl-common/wdl/tasks/bcftools.wdl" as Bcftools +import "../wdl-common/wdl/tasks/cpg_pileup.wdl" as Cpgpileup +import "../wdl-common/wdl/tasks/pbstarphase.wdl" as Pbstarphase +import "../wdl-common/wdl/workflows/pharmcat/pharmcat.wdl" as Pharmcat + +workflow downstream { + meta { + description: "Phases small variants, SVs, and TRGTs, haplotags alignments, calls HLA and PGx alleles." 
+ } + + parameter_meta { + sample_id: { + name: "Sample ID" + } + small_variant_vcf: { + name: "Small variant VCF" + } + small_variant_vcf_index: { + name: "Small variant VCF index" + } + sv_vcf: { + name: "Structural variant VCF" + } + sv_vcf_index: { + name: "Structural variant VCF index" + } + trgt_vcf: { + name: "TRGT VCF" + } + trgt_vcf_index: { + name: "TRGT VCF index" + } + aligned_bam: { + name: "Aligned BAM" + } + aligned_bam_index: { + name: "Aligned BAI" + } + pharmcat_version: { + name: "PharmCAT version" + } + pharmcat_min_coverage: { + name: "Minimum coverage for PharmCAT" + } + ref_map_file: { + name: "Reference map file" + } + default_runtime_attributes: { + name: "Default runtime attributes" + } + } + + input { + String sample_id + + File small_variant_vcf + File small_variant_vcf_index + File sv_vcf + File sv_vcf_index + File trgt_vcf + File trgt_vcf_index + + File aligned_bam + File aligned_bam_index + + String pharmcat_version + Int pharmcat_min_coverage + + File ref_map_file + + RuntimeAttributes default_runtime_attributes + } + + Map[String, String] ref_map = read_map(ref_map_file) + + Array[File] hiphase_input_vcfs = [small_variant_vcf, sv_vcf, trgt_vcf] + Array[File] hiphase_input_vcf_indices = [small_variant_vcf_index, sv_vcf_index, trgt_vcf_index] + + scatter (vcf_index in range(length(hiphase_input_vcfs))) { + # generate an array of phased VCF names that match the input VCFs + String phased_vcf_name = basename(hiphase_input_vcfs[vcf_index], ".vcf.gz") + ".phased.vcf.gz" + String phased_vcf_index_name = basename(hiphase_input_vcf_indices[vcf_index], ".vcf.gz.tbi") + ".phased.vcf.gz.tbi" + } + + call Hiphase.hiphase { + input: + sample_id = sample_id, + vcfs = hiphase_input_vcfs, + vcf_indices = hiphase_input_vcf_indices, + phased_vcf_names = phased_vcf_name, + phased_vcf_index_names = phased_vcf_index_name, + aligned_bam = aligned_bam, + aligned_bam_index = aligned_bam_index, + ref_name = ref_map["name"], + ref_fasta = ref_map["fasta"], # !FileCoercion + ref_index = ref_map["fasta_index"], # !FileCoercion + runtime_attributes = default_runtime_attributes + } + + # hiphase.phased_vcfs[0] -> phased small variant VCF + # hiphase.phased_vcfs[1] -> phased SV VCF + # hiphase.phased_vcfs[2] -> phased TRGT VCF + + call Bcftools.bcftools_stats_roh_small_variants { + input: + sample_id = sample_id, + vcf = hiphase.phased_vcfs[0], + ref_fasta = ref_map["fasta"], # !FileCoercion + ref_name = ref_map["name"], + runtime_attributes = default_runtime_attributes + } + + call Bcftools.sv_stats { + input: + vcf = hiphase.phased_vcfs[1], + runtime_attributes = default_runtime_attributes + } + + call Cpgpileup.cpg_pileup { + input: + haplotagged_bam = hiphase.haplotagged_bam, + haplotagged_bam_index = hiphase.haplotagged_bam_index, + out_prefix = "~{sample_id}.~{ref_map['name']}", + ref_fasta = ref_map["fasta"], # !FileCoercion + ref_index = ref_map["fasta_index"], # !FileCoercion + runtime_attributes = default_runtime_attributes + } + + call Pbstarphase.pbstarphase_diplotype { + input: + sample_id = sample_id, + phased_vcf = hiphase.phased_vcfs[0], + phased_vcf_index = hiphase.phased_vcf_indices[0], + aligned_bam = hiphase.haplotagged_bam, + aligned_bam_index = hiphase.haplotagged_bam_index, + ref_fasta = ref_map["fasta"], # !FileCoercion + ref_index = ref_map["fasta_index"], # !FileCoercion + runtime_attributes = default_runtime_attributes + } + + call Pharmcat.pharmcat { + input: + sample_id = sample_id, + haplotagged_bam = hiphase.haplotagged_bam, + haplotagged_bam_index = 
hiphase.haplotagged_bam_index, + phased_vcf = hiphase.phased_vcfs[0], + phased_vcf_index = hiphase.phased_vcf_indices[0], + input_tsvs = [pbstarphase_diplotype.pharmcat_tsv], + ref_fasta = ref_map["fasta"], # !FileCoercion + ref_index = ref_map["fasta_index"], # !FileCoercion + pharmcat_version = pharmcat_version, + pharmcat_positions = ref_map["pharmcat_positions_vcf"], # !FileCoercion + pharmcat_positions_index = ref_map["pharmcat_positions_vcf_index"], # !FileCoercion + pharmcat_min_coverage = pharmcat_min_coverage, + default_runtime_attributes = default_runtime_attributes + } + + output { + # hiphase outputs + File merged_haplotagged_bam = hiphase.haplotagged_bam + File merged_haplotagged_bam_index = hiphase.haplotagged_bam_index + File phased_small_variant_vcf = hiphase.phased_vcfs[0] + File phased_small_variant_vcf_index = hiphase.phased_vcf_indices[0] + File phased_sv_vcf = hiphase.phased_vcfs[1] + File phased_sv_vcf_index = hiphase.phased_vcf_indices[1] + File phased_trgt_vcf = hiphase.phased_vcfs[2] + File phased_trgt_vcf_index = hiphase.phased_vcf_indices[2] + File phase_stats = hiphase.phase_stats + File phase_blocks = hiphase.phase_blocks + File phase_haplotags = hiphase.phase_haplotags + String stat_phased_basepairs = hiphase.stat_phased_basepairs + String stat_phase_block_ng50 = hiphase.stat_phase_block_ng50 + String stat_mapped_read_count = hiphase.stat_mapped_read_count + String stat_mapped_percent = hiphase.stat_mapped_percent + File mapq_distribution_plot = hiphase.mapq_distribution_plot + File mg_distribution_plot = hiphase.mg_distribution_plot + + # small variant stats + File small_variant_stats = bcftools_stats_roh_small_variants.stats + File bcftools_roh_out = bcftools_stats_roh_small_variants.roh_out + File bcftools_roh_bed = bcftools_stats_roh_small_variants.roh_bed + String stat_SNV_count = bcftools_stats_roh_small_variants.stat_SNV_count + String stat_INDEL_count = bcftools_stats_roh_small_variants.stat_INDEL_count + String stat_TSTV_ratio = bcftools_stats_roh_small_variants.stat_TSTV_ratio + String stat_HETHOM_ratio = bcftools_stats_roh_small_variants.stat_HETHOM_ratio + File snv_distribution_plot = bcftools_stats_roh_small_variants.snv_distribution_plot + File indel_distribution_plot = bcftools_stats_roh_small_variants.indel_distribution_plot + + # sv stats + String stat_sv_DUP_count = sv_stats.stat_sv_DUP_count + String stat_sv_DEL_count = sv_stats.stat_sv_DEL_count + String stat_sv_INS_count = sv_stats.stat_sv_INS_count + String stat_sv_INV_count = sv_stats.stat_sv_INV_count + String stat_sv_BND_count = sv_stats.stat_sv_BND_count + + # cpg_pileup outputs + File cpg_combined_bed = cpg_pileup.combined_bed + File cpg_combined_bed_index = cpg_pileup.combined_bed_index + File cpg_hap1_bed = cpg_pileup.hap1_bed + File cpg_hap1_bed_index = cpg_pileup.hap1_bed_index + File cpg_hap2_bed = cpg_pileup.hap2_bed + File cpg_hap2_bed_index = cpg_pileup.hap2_bed_index + File cpg_combined_bw = cpg_pileup.combined_bw + File cpg_hap1_bw = cpg_pileup.hap1_bw + File cpg_hap2_bw = cpg_pileup.hap2_bw + String stat_hap1_cpg_count = cpg_pileup.stat_hap1_cpg_count + String stat_hap2_cpg_count = cpg_pileup.stat_hap2_cpg_count + String stat_combined_cpg_count = cpg_pileup.stat_combined_cpg_count + + # pbstarphase outputs + File pbstarphase_json = pbstarphase_diplotype.out_json + + # pharmcat and pangu outputs + File pharmcat_match_json = pharmcat.pharmcat_match_json + File pharmcat_phenotype_json = pharmcat.pharmcat_phenotype_json + File pharmcat_report_html = 
pharmcat.pharmcat_report_html + File pharmcat_report_json = pharmcat.pharmcat_report_json + } +} \ No newline at end of file diff --git a/workflows/downstream/inputs.json b/workflows/downstream/inputs.json new file mode 100644 index 00000000..80474908 --- /dev/null +++ b/workflows/downstream/inputs.json @@ -0,0 +1,22 @@ +{ + "downstream.sample_id": "String", + "downstream.small_variant_vcf": "File", + "downstream.small_variant_vcf_index": "File", + "downstream.sv_vcf": "File", + "downstream.sv_vcf_index": "File", + "downstream.trgt_vcf": "File", + "downstream.trgt_vcf_index": "File", + "downstream.aligned_bam": "File", + "downstream.aligned_bam_index": "File", + "downstream.pharmcat_version": "String", + "downstream.pharmcat_min_coverage": "Int", + "downstream.ref_map_file": "File", + "downstream.default_runtime_attributes": { + "max_retries": "Int", + "container_registry": "String", + "gpuType": "String", + "backend": "String", + "preemptible_tries": "Int", + "zones": "String" + } +} \ No newline at end of file diff --git a/workflows/family.inputs.json b/workflows/family.inputs.json new file mode 100644 index 00000000..ec555a3c --- /dev/null +++ b/workflows/family.inputs.json @@ -0,0 +1,34 @@ +{ + "humanwgs_family.family": { + "family_id": "String", + "samples": [ + { + "sample_id": "String", + "hifi_reads": [ + "File" + ], + "affected": "Boolean", + "sex": "String? (optional); ['MALE', 'FEMALE']", + "father_id": "String? (optional)", + "mother_id": "String? (optional)" + } + ] + }, + "humanwgs_family.phenotypes": "String? (optional)", + "humanwgs_family.ref_map_file": "File", + "humanwgs_family.deepvariant_version": "String (optional, default = \"1.6.1\")", + "humanwgs_family.custom_deepvariant_model_tar": "File? (optional)", + "humanwgs_family.pharmcat_version": "String (optional, default = \"2.15.0\")", + "humanwgs_family.pharmcat_min_coverage": "Int (optional, default = 10)", + "humanwgs_family.tertiary_map_file": "File? (optional)", + "humanwgs_family.glnexus_mem_gb": "Int? (optional)", + "humanwgs_family.pbsv_call_mem_gb": "Int? (optional)", + "humanwgs_family.gpu": "Boolean (optional, default = false)", + "humanwgs_family.backend": "String", + "humanwgs_family.zones": "String? (optional)", + "humanwgs_family.gpuType": "String? (optional)", + "humanwgs_family.container_registry": "String? (optional)", + "humanwgs_family.container_namespace": "String? (optional)", + "humanwgs_family.preemptible": "Boolean", + "humanwgs_family.debug_version": "String? (optional)" +} \ No newline at end of file diff --git a/workflows/family.wdl b/workflows/family.wdl new file mode 100644 index 00000000..faffae41 --- /dev/null +++ b/workflows/family.wdl @@ -0,0 +1,405 @@ +version 1.0 + +import "humanwgs_structs.wdl" +import "wdl-common/wdl/workflows/backend_configuration/backend_configuration.wdl" as BackendConfiguration +import "upstream/upstream.wdl" as Upstream +import "joint/joint.wdl" as Joint +import "downstream/downstream.wdl" as Downstream +import "wdl-common/wdl/tasks/bcftools.wdl" as Bcftools +import "wdl-common/wdl/tasks/trgt.wdl" as Trgt +import "wdl-common/wdl/tasks/write_ped_phrank.wdl" as Write_ped_phrank +import "tertiary/tertiary.wdl" as TertiaryAnalysis +import "wdl-common/wdl/tasks/utilities.wdl" as Utilities + + +workflow humanwgs_family { + meta { + description: "PacBio HiFi human whole genome sequencing pipeline, with joint calling for related samples." 
+ } + + parameter_meta { + family: { + name: "Family struct describing samples, relationships, and unaligned BAM paths" + } + ref_map_file: { + name: "TSV containing reference genome file paths; must match backend" + } + deepvariant_version: { + name: "DeepVariant version" + } + custom_deepvariant_model_tar: { + name: "Custom DeepVariant model tarball" + } + pharmcat_version: { + name: "PharmCAT version" + } + pharmcat_min_coverage: { + name: "Minimum coverage for PharmCAT" + } + phenotypes: { + name: "Comma-delimited list of HPO codes for phenotypes" + } + tertiary_map_file: { + name: "TSV containing tertiary analysis file paths and thresholds; must match backend" + } + glnexus_mem_gb: { + name: "Override GLnexus memory request (GB)" + } + pbsv_call_mem_gb: { + name: "Override PBSV call memory request (GB)" + } + gpu: { + name: "Use GPU when possible" + } + backend: { + name: "Backend where the workflow will be executed", + choices: ["GCP", "Azure", "AWS-HealthOmics", "HPC"] + } + zones: { + name: "Zones where compute will take place; required if backend is set to 'GCP'" + } + gpuType: { + name: "GPU type to use; required if gpu is set to `true` for cloud backends; must match backend" + } + container_registry: { + name: "Container registry where workflow images are hosted. If left blank, PacBio's public Quay.io registry will be used. Must be set if backend is set to 'AWS-HealthOmics'" + } + preemptible: { + name: "Where possible, run tasks preemptibly" + } + debug_version: { + name: "Debug version for testing purposes" + } + } + + input { + Family family + + File ref_map_file + + # These options are only intended for testing purposes. + # There is no guarantee that the pipeline will work with + # other version of DeepVariant or with custom models. + String deepvariant_version = "1.6.1" + File? custom_deepvariant_model_tar + + String pharmcat_version = "2.15.4" + Int pharmcat_min_coverage = 10 + + String phenotypes = "HP:0000001" + File? tertiary_map_file + + Int? glnexus_mem_gb + Int? pbsv_call_mem_gb + + Boolean gpu = false + + # Backend configuration + String backend + String? zones + String? gpuType + String? container_registry + + Boolean preemptible = true + + String? 
debug_version + } + + call BackendConfiguration.backend_configuration { + input: + backend = backend, + zones = zones, + gpuType = gpuType, + container_registry = container_registry + } + + RuntimeAttributes default_runtime_attributes = if preemptible then backend_configuration.spot_runtime_attributes else backend_configuration.on_demand_runtime_attributes + + Map [String, String] ref_map = read_map(ref_map_file) + + Boolean single_sample = length(family.samples) == 1 + + scatter (sample in family.samples) { + String sample_id = sample.sample_id + call Upstream.upstream { + input: + sample_id = sample.sample_id, + sex = sample.sex, + hifi_reads = sample.hifi_reads, + ref_map_file = ref_map_file, + deepvariant_version = deepvariant_version, + custom_deepvariant_model_tar = custom_deepvariant_model_tar, + single_sample = single_sample, + gpu = gpu, + default_runtime_attributes = default_runtime_attributes + } + } + + if (!single_sample) { + call Joint.joint { + input: + family_id = family.family_id, + sample_ids = sample_id, + gvcfs = upstream.small_variant_gvcf, + gvcf_indices = upstream.small_variant_gvcf_index, + svsigs = flatten(upstream.svsigs), + ref_map_file = ref_map_file, + glnexus_mem_gb = glnexus_mem_gb, + pbsv_call_mem_gb = pbsv_call_mem_gb, + default_runtime_attributes = default_runtime_attributes + } + } + + scatter (sample_index in range(length(family.samples))) { + call Downstream.downstream { + input: + sample_id = sample_id[sample_index], + small_variant_vcf = select_first([joint.split_joint_small_variant_vcfs, upstream.small_variant_vcf])[sample_index], + small_variant_vcf_index = select_first([joint.split_joint_small_variant_vcf_indices, upstream.small_variant_vcf_index])[sample_index], + sv_vcf = select_first([joint.split_joint_structural_variant_vcfs, select_all(upstream.sv_vcf)])[sample_index], + sv_vcf_index = select_first([joint.split_joint_structural_variant_vcf_indices, select_all(upstream.sv_vcf_index)])[sample_index], + trgt_vcf = upstream.trgt_vcf[sample_index], + trgt_vcf_index = upstream.trgt_vcf_index[sample_index], + aligned_bam = upstream.out_bam[sample_index], + aligned_bam_index = upstream.out_bam_index[sample_index], + pharmcat_version = pharmcat_version, + pharmcat_min_coverage = pharmcat_min_coverage, + ref_map_file = ref_map_file, + default_runtime_attributes = default_runtime_attributes + } + } + + Map[String, Array[String]] stats = { + 'sample_id': sample_id, + 'num_reads': upstream.stat_num_reads, + 'read_length_min': upstream.stat_read_length_mean, + 'read_length_median': upstream.stat_read_length_median, + 'read_quality_mean': upstream.stat_read_quality_mean, + 'read_quality_median': upstream.stat_read_quality_median, + 'mapped_read_count': downstream.stat_mapped_read_count, + 'mapped_percent': downstream.stat_mapped_percent, + 'mean_depth': upstream.stat_mean_depth, + 'inferred_sex': upstream.inferred_sex, + 'stat_phased_basepairs': downstream.stat_phased_basepairs, + 'phase_block_ng50': downstream.stat_phase_block_ng50, + 'cpg_combined_count': downstream.stat_combined_cpg_count, + 'cpg_hap1_count': downstream.stat_hap1_cpg_count, + 'cpg_hap2_count': downstream.stat_hap2_cpg_count, + 'SNV_count': downstream.stat_SNV_count, + 'TSTV_ratio': downstream.stat_TSTV_ratio, + 'HETHOM_ratio': downstream.stat_HETHOM_ratio, + 'INDEL_count': downstream.stat_INDEL_count, + 'sv_DUP_count': downstream.stat_sv_DUP_count, + 'sv_DEL_count': downstream.stat_sv_DEL_count, + 'sv_INS_count': downstream.stat_sv_INS_count, + 'sv_INV_count': 
downstream.stat_sv_INV_count, + 'sv_BND_count': downstream.stat_sv_BND_count, + 'cnv_DUP_count': upstream.stat_cnv_DUP_count, + 'cnv_DEL_count': upstream.stat_cnv_DEL_count, + 'cnv_DUP_sum': upstream.stat_cnv_DUP_sum, + 'cnv_DEL_sum': upstream.stat_cnv_DEL_sum, + 'trgt_genotyped_count': upstream.stat_trgt_genotyped_count, + 'trgt_uncalled_count': upstream.stat_trgt_uncalled_count + } + + call Utilities.consolidate_stats { + input: + id = family.family_id, + stats = stats, + runtime_attributes = default_runtime_attributes + } + + if (!single_sample) { + call Bcftools.bcftools_merge as merge_small_variant_vcfs { + input: + vcfs = downstream.phased_small_variant_vcf, + vcf_indices = downstream.phased_small_variant_vcf_index, + out_prefix = "~{family.family_id}.joint.~{ref_map['name']}.small_variants.phased", + runtime_attributes = default_runtime_attributes + } + + call Bcftools.bcftools_merge as merge_sv_vcfs { + input: + vcfs = downstream.phased_sv_vcf, + vcf_indices = downstream.phased_sv_vcf_index, + out_prefix = "~{family.family_id}.joint.~{ref_map['name']}.structural_variants.phased", + runtime_attributes = default_runtime_attributes + } + + call Trgt.trgt_merge { + input: + vcfs = downstream.phased_trgt_vcf, + vcf_indices = downstream.phased_trgt_vcf_index, + ref_fasta = ref_map["fasta"], # !FileCoercion + ref_index = ref_map["fasta_index"], # !FileCoercion + out_prefix = "~{family.family_id}.~{ref_map['name']}.trgt", + runtime_attributes = default_runtime_attributes + } + } + + if (defined(tertiary_map_file)) { + scatter (sample in family.samples) { + Array[File] hifi_reads = sample.hifi_reads + } + + call Write_ped_phrank.write_ped_phrank { + input: + id = family.family_id, + family = family, + phenotypes = phenotypes, + disk_size = size(flatten(hifi_reads), "GB") + 10, + runtime_attributes = default_runtime_attributes + } + + call TertiaryAnalysis.tertiary_analysis { + input: + pedigree = write_ped_phrank.pedigree, + phrank_lookup = write_ped_phrank.phrank_lookup, + small_variant_vcf = select_first([merge_small_variant_vcfs.merged_vcf, downstream.phased_small_variant_vcf[0]]), + small_variant_vcf_index = select_first([merge_small_variant_vcfs.merged_vcf_index, downstream.phased_small_variant_vcf_index[0]]), + sv_vcf = select_first([merge_sv_vcfs.merged_vcf, downstream.phased_sv_vcf[0]]), + sv_vcf_index = select_first([merge_sv_vcfs.merged_vcf_index, downstream.phased_sv_vcf_index[0]]), + ref_map_file = ref_map_file, + tertiary_map_file = select_first([tertiary_map_file]), + default_runtime_attributes = default_runtime_attributes + } + } + + output { + # to maintain order of samples + Array[String] sample_ids = sample_id + File stats_file = consolidate_stats.output_tsv + + # bam stats + Array[File] bam_stats = upstream.read_length_and_quality + Array[File] read_length_plot = upstream.read_length_plot + Array[File] read_quality_plot = upstream.read_quality_plot + Array[String] stat_num_reads = upstream.stat_num_reads + Array[String] stat_read_length_mean = upstream.stat_read_length_mean + Array[String] stat_read_length_median = upstream.stat_read_length_median + Array[String] stat_read_quality_mean = upstream.stat_read_quality_mean + Array[String] stat_read_quality_median = upstream.stat_read_quality_median + + # merged, haplotagged alignments + Array[File] merged_haplotagged_bam = downstream.merged_haplotagged_bam + Array[File] merged_haplotagged_bam_index = downstream.merged_haplotagged_bam_index + Array[String] stat_mapped_read_count = downstream.stat_mapped_read_count + 
Array[String] stat_mapped_percent = downstream.stat_mapped_percent + Array[File] mapq_distribution_plot = downstream.mapq_distribution_plot + Array[File] mg_distribution_plot = downstream.mg_distribution_plot + + # mosdepth outputs + Array[File] mosdepth_summary = upstream.mosdepth_summary + Array[File] mosdepth_region_bed = upstream.mosdepth_region_bed + Array[File] mosdepth_region_bed_index = upstream.mosdepth_region_bed_index + Array[File] mosdepth_depth_distribution_plot = upstream.mosdepth_depth_distribution_plot + Array[String] stat_mean_depth = upstream.stat_mean_depth + Array[String] inferred_sex = upstream.inferred_sex + + # phasing stats + Array[File] phase_stats = downstream.phase_stats + Array[File] phase_blocks = downstream.phase_blocks + Array[File] phase_haplotags = downstream.phase_haplotags + Array[String] stat_phased_basepairs = downstream.stat_phased_basepairs + Array[String] stat_phase_block_ng50 = downstream.stat_phase_block_ng50 + + # cpg_pileup outputs + Array[File] cpg_combined_bed = downstream.cpg_combined_bed + Array[File] cpg_combined_bed_index = downstream.cpg_combined_bed_index + Array[File] cpg_hap1_bed = downstream.cpg_hap1_bed + Array[File] cpg_hap1_bed_index = downstream.cpg_hap1_bed_index + Array[File] cpg_hap2_bed = downstream.cpg_hap2_bed + Array[File] cpg_hap2_bed_index = downstream.cpg_hap2_bed_index + Array[File] cpg_combined_bw = downstream.cpg_combined_bw + Array[File] cpg_hap1_bw = downstream.cpg_hap1_bw + Array[File] cpg_hap2_bw = downstream.cpg_hap2_bw + Array[String] stat_cpg_hap1_count = downstream.stat_hap1_cpg_count + Array[String] stat_cpg_hap2_count = downstream.stat_hap2_cpg_count + Array[String] stat_cpg_combined_count = downstream.stat_combined_cpg_count + + # sv outputs + Array[File] phased_sv_vcf = downstream.phased_sv_vcf + Array[File] phased_sv_vcf_index = downstream.phased_sv_vcf_index + + # sv stats + Array[String] stat_sv_DUP_count = downstream.stat_sv_DUP_count + Array[String] stat_sv_DEL_count = downstream.stat_sv_DEL_count + Array[String] stat_sv_INS_count = downstream.stat_sv_INS_count + Array[String] stat_sv_INV_count = downstream.stat_sv_INV_count + Array[String] stat_sv_BND_count = downstream.stat_sv_BND_count + + # small variant outputs + Array[File] phased_small_variant_vcf = downstream.phased_small_variant_vcf + Array[File] phased_small_variant_vcf_index = downstream.phased_small_variant_vcf_index + Array[File] small_variant_gvcf = upstream.small_variant_gvcf + Array[File] small_variant_gvcf_index = upstream.small_variant_gvcf_index + + # small variant stats + Array[File] small_variant_stats = downstream.small_variant_stats + Array[File] bcftools_roh_out = downstream.bcftools_roh_out + Array[File] bcftools_roh_bed = downstream.bcftools_roh_bed + Array[String] stat_small_variant_SNV_count = downstream.stat_SNV_count + Array[String] stat_small_variant_INDEL_count = downstream.stat_INDEL_count + Array[String] stat_small_variant_TSTV_ratio = downstream.stat_TSTV_ratio + Array[String] stat_small_variant_HETHOM_ratio = downstream.stat_HETHOM_ratio + Array[File] snv_distribution_plot = downstream.snv_distribution_plot + Array[File] indel_distribution_plot = downstream.indel_distribution_plot + + # trgt outputs + Array[File] phased_trgt_vcf = downstream.phased_trgt_vcf + Array[File] phased_trgt_vcf_index = downstream.phased_trgt_vcf_index + Array[File] trgt_spanning_reads = upstream.trgt_spanning_reads + Array[File] trgt_spanning_reads_index = upstream.trgt_spanning_reads_index + Array[File] trgt_coverage_dropouts = 
upstream.trgt_coverage_dropouts + Array[String] stat_trgt_genotyped_count = upstream.stat_trgt_genotyped_count + Array[String] stat_trgt_uncalled_count = upstream.stat_trgt_uncalled_count + + # paraphase outputs + Array[File] paraphase_output_json = upstream.paraphase_output_json + Array[File] paraphase_realigned_bam = upstream.paraphase_realigned_bam + Array[File] paraphase_realigned_bam_index = upstream.paraphase_realigned_bam_index + Array[File?] paraphase_vcfs = upstream.paraphase_vcfs + + # per sample cnv outputs + Array[File] cnv_vcf = upstream.cnv_vcf + Array[File] cnv_vcf_index = upstream.cnv_vcf_index + Array[File] cnv_copynum_bedgraph = upstream.cnv_copynum_bedgraph + Array[File] cnv_depth_bw = upstream.cnv_depth_bw + Array[File] cnv_maf_bw = upstream.cnv_maf_bw + Array[String] stat_cnv_DUP_count = upstream.stat_cnv_DUP_count + Array[String] stat_cnv_DEL_count = upstream.stat_cnv_DEL_count + Array[String] stat_cnv_DUP_sum = upstream.stat_cnv_DUP_sum + Array[String] stat_cnv_DEL_sum = upstream.stat_cnv_DEL_sum + + # PGx outputs + Array[File] pbstarphase_json = downstream.pbstarphase_json + Array[File] pharmcat_match_json = downstream.pharmcat_match_json + Array[File] pharmcat_phenotype_json = downstream.pharmcat_phenotype_json + Array[File] pharmcat_report_html = downstream.pharmcat_report_html + Array[File] pharmcat_report_json = downstream.pharmcat_report_json + + # joint call outputs + File? joint_small_variants_vcf = merge_small_variant_vcfs.merged_vcf + File? joint_small_variants_vcf_index = merge_small_variant_vcfs.merged_vcf_index + File? joint_sv_vcf = merge_sv_vcfs.merged_vcf + File? joint_sv_vcf_index = merge_sv_vcfs.merged_vcf_index + File? joint_trgt_vcf = trgt_merge.merged_vcf + File? joint_trgt_vcf_index = trgt_merge.merged_vcf_index + + # tertiary analysis outputs + File? pedigree = write_ped_phrank.pedigree + File? tertiary_small_variant_filtered_vcf = tertiary_analysis.small_variant_filtered_vcf + File? tertiary_small_variant_filtered_vcf_index = tertiary_analysis.small_variant_filtered_vcf_index + File? tertiary_small_variant_filtered_tsv = tertiary_analysis.small_variant_filtered_tsv + File? tertiary_small_variant_compound_het_vcf = tertiary_analysis.small_variant_compound_het_vcf + File? tertiary_small_variant_compound_het_vcf_index = tertiary_analysis.small_variant_compound_het_vcf_index + File? tertiary_small_variant_compound_het_tsv = tertiary_analysis.small_variant_compound_het_tsv + File? tertiary_sv_filtered_vcf = tertiary_analysis.sv_filtered_vcf + File? tertiary_sv_filtered_vcf_index = tertiary_analysis.sv_filtered_vcf_index + File? tertiary_sv_filtered_tsv = tertiary_analysis.sv_filtered_tsv + + # workflow metadata + String workflow_name = "humanwgs_family" + String workflow_version = "v2.0.0-rc6~{"-" + debug_version}" + } +} \ No newline at end of file diff --git a/workflows/humanwgs_structs.wdl b/workflows/humanwgs_structs.wdl index 1932ab00..098b6abe 100644 --- a/workflows/humanwgs_structs.wdl +++ b/workflows/humanwgs_structs.wdl @@ -1,51 +1,18 @@ version 1.0 -import "wdl-common/wdl/structs.wdl" - -struct ReferenceData { - String name - IndexData fasta - - File pbsv_splits - - File tandem_repeat_bed - File trgt_tandem_repeat_bed - - IndexData hificnv_exclude_bed - File hificnv_expected_bed_male - File hificnv_expected_bed_female - - File? gnomad_af - File? hprc_af - File? gff - - Array[IndexData]? population_vcfs -} - struct Sample { - String sample_id - Array[File] movie_bams + String sample_id - String? sex - Boolean affected - - String? 
father_id - String? mother_id -} + String? sex + Boolean affected -struct Cohort { - String cohort_id - Array[Sample] samples + Array[File] hifi_reads - Array[String] phenotypes + String? father_id + String? mother_id } -struct SlivarData { - File slivar_js - File hpo_terms - File hpo_dag - File hpo_annotations - File ensembl_to_hgnc - File lof_lookup - File clinvar_lookup +struct Family { + String family_id + Array[Sample] samples } diff --git a/workflows/input_template.json b/workflows/input_template.json deleted file mode 100644 index 39632a22..00000000 --- a/workflows/input_template.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "humanwgs.cohort": { - "cohort_id": "String", - "samples": [ - { - "sample_id": "String", - "movie_bams": "Array[File]", - "sex": "String?", - "affected": "Boolean", - "father_id": "String? (optional)", - "mother_id": "String? (optional)" - } - ], - "phenotypes": "Array[String]" - }, - "humanwgs.reference": { - "name": "String", - "fasta": { - "data": "File", - "data_index": "File" - }, - "tandem_repeat_bed": "File", - "trgt_tandem_repeat_bed": "File", - "hificnv_exclude_bed": { - "data": "File", - "data_index": "File" - }, - "hificnv_expected_bed_male": "File", - "hificnv_expected_bed_female": "File", - "gnomad_af": "File? (optional)", - "hprc_af": "File? (optional)", - "gff": "File? (optional)", - "population_vcfs": "Array[WomCompositeType {\n data -> File\ndata_index -> File \n}]? (optional)" - }, - "humanwgs.slivar_data": { - "slivar_js": "File", - "hpo_terms": "File", - "hpo_dag": "File", - "hpo_annotations": "File", - "ensembl_to_hgnc": "File", - "lof_lookup": "File", - "clinvar_lookup": "File" - }, - "humanwgs.deepvariant_version": "String? (optional)", - "humanwgs.deepvariant_model": "WomCompositeType {\n model -> WomCompositeType {\n data -> File\ndata_index -> File \n}\nmetadata -> File \n}? (optional)", - "humanwgs.pbsv_call_mem_gb": "Int? (optional)", - "humanwgs.glnexus_mem_gb": "Int? (optional)", - "humanwgs.run_tertiary_analysis": "Boolean? (optional, default = false)", - "humanwgs.backend": "String ['GCP', 'Azure', 'AWS', or 'HPC']", - "humanwgs.zones": "String? (optional); required if backend is set to 'GCP' or 'AWS'", - "humanwgs.aws_spot_queue_arn": "String? (optional); required if backend is set to 'AWS'", - "humanwgs.aws_on_demand_queue_arn": "String? (optional); required if backend is set to 'AWS'", - "humanwgs.preemptible": "Boolean" -} diff --git a/workflows/joint/inputs.json b/workflows/joint/inputs.json new file mode 100644 index 00000000..9a0ab408 --- /dev/null +++ b/workflows/joint/inputs.json @@ -0,0 +1,18 @@ +{ + "joint.family_id": "String", + "joint.sample_ids": "Array[String]", + "joint.gvcfs": "Array[File]", + "joint.gvcf_indices": "Array[File]", + "joint.svsigs": "Array[File]", + "joint.ref_map_file": "File", + "joint.glnexus_mem_gb": "Int? (optional)", + "joint.pbsv_call_mem_gb": "Int? 
(optional)", + "joint.default_runtime_attributes": { + "max_retries": "Int", + "container_registry": "String", + "gpuType": "String", + "backend": "String", + "preemptible_tries": "Int", + "zones": "String" + } +} \ No newline at end of file diff --git a/workflows/joint/joint.wdl b/workflows/joint/joint.wdl new file mode 100644 index 00000000..6f1f32b3 --- /dev/null +++ b/workflows/joint/joint.wdl @@ -0,0 +1,158 @@ +version 1.0 + +import "../wdl-common/wdl/structs.wdl" +import "../wdl-common/wdl/tasks/glnexus.wdl" as Glnexus +import "../wdl-common/wdl/tasks/pbsv.wdl" as Pbsv +import "../wdl-common/wdl/tasks/bcftools.wdl" as Bcftools +import "../wdl-common/wdl/workflows/get_pbsv_splits/get_pbsv_splits.wdl" as Pbsv_splits + +workflow joint { + meta { + description: "Tasks for joint-calling variants from a set of samples and splitting the joint calls by sample for parallel phasing." + } + + parameter_meta { + family_id: { + name: "Cohort ID" + } + sample_ids: { + name: "Sample IDs" + } + gvcfs: { + name: "GVCFs" + } + gvcf_indices: { + name: "GVCF Indices" + } + svsigs: { + name: "SV Signatures" + } + ref_map_file: { + name: "Reference Map File" + } + glnexus_mem_gb: { + name: "GLnexus Memory (GB)" + } + pbsv_call_mem_gb: { + name: "PBSV Call Memory (GB)" + } + default_runtime_attributes: { + name: "Default Runtime Attribute Struct" + } + split_joint_structural_variant_vcfs: { + name: "Joint-call structural variant VCF, split by sample" + } + split_joint_structural_variant_vcf_indices: { + name: "Joint-call structural variant VCF indices, split by sample" + } + split_joint_small_variant_vcfs: { + name: "Joint-call small variant VCF, split by sample" + } + split_joint_small_variant_vcf_indices: { + name: "Joint-call small variant VCF indices, split by sample" + } + } + + input { + String family_id + Array[String] sample_ids + + Array[File] gvcfs + Array[File] gvcf_indices + + Array[File] svsigs + + File ref_map_file + + Int? glnexus_mem_gb + Int? 
pbsv_call_mem_gb + + RuntimeAttributes default_runtime_attributes + } + + Map[String, String] ref_map = read_map(ref_map_file) + + call Pbsv_splits.get_pbsv_splits { + input: + pbsv_splits_file = ref_map["pbsv_splits"], # !FileCoercion + default_runtime_attributes = default_runtime_attributes + } + + scatter (shard_index in range(length(get_pbsv_splits.pbsv_splits))) { + Array[String] region_set = get_pbsv_splits.pbsv_splits[shard_index] + + call Pbsv.pbsv_call { + input: + sample_id = family_id + ".joint", + svsigs = svsigs, + sample_count = length(sample_ids), + ref_fasta = ref_map["fasta"], # !FileCoercion + ref_index = ref_map["fasta_index"], # !FileCoercion + ref_name = ref_map["name"], + shard_index = shard_index, + regions = region_set, + mem_gb = pbsv_call_mem_gb, + runtime_attributes = default_runtime_attributes + } + } + + # concatenate pbsv vcfs + call Bcftools.concat_pbsv_vcf { + input: + vcfs = pbsv_call.vcf, + vcf_indices = pbsv_call.vcf_index, + out_prefix = "~{family_id}.joint.~{ref_map['name']}.structural_variants", + runtime_attributes = default_runtime_attributes + } + + String sv_vcf_basename = basename(concat_pbsv_vcf.concatenated_vcf, ".vcf.gz") + + scatter (sample_id in sample_ids) { + String split_sv_vcf_name = "~{sample_id}.~{sv_vcf_basename}.vcf.gz" + String split_sv_vcf_index_name = "~{sample_id}.~{sv_vcf_basename}.vcf.gz.tbi" + } + + call Bcftools.split_vcf_by_sample as split_pbsv { + input: + sample_ids = sample_ids, + vcf = concat_pbsv_vcf.concatenated_vcf, + vcf_index = concat_pbsv_vcf.concatenated_vcf_index, + split_vcf_names = split_sv_vcf_name, + split_vcf_index_names = split_sv_vcf_index_name, + runtime_attributes = default_runtime_attributes + } + + call Glnexus.glnexus { + input: + cohort_id = family_id + ".joint", + gvcfs = gvcfs, + gvcf_indices = gvcf_indices, + ref_name = ref_map["name"], + mem_gb = glnexus_mem_gb, + runtime_attributes = default_runtime_attributes + } + + String glnexus_vcf_basename = basename(glnexus.vcf, ".vcf.gz") + + scatter (sample_id in sample_ids) { + String split_glnexus_vcf_name = "~{sample_id}.~{glnexus_vcf_basename}.vcf.gz" + String split_glnexus_vcf_index_name = "~{sample_id}.~{glnexus_vcf_basename}.vcf.gz.tbi" + } + + call Bcftools.split_vcf_by_sample as split_glnexus { + input: + sample_ids = sample_ids, + vcf = glnexus.vcf, + vcf_index = glnexus.vcf_index, + split_vcf_names = split_glnexus_vcf_name, + split_vcf_index_names = split_glnexus_vcf_index_name, + runtime_attributes = default_runtime_attributes + } + + output { + Array[File] split_joint_structural_variant_vcfs = split_pbsv.split_vcfs + Array[File] split_joint_structural_variant_vcf_indices = split_pbsv.split_vcf_indices + Array[File] split_joint_small_variant_vcfs = split_glnexus.split_vcfs + Array[File] split_joint_small_variant_vcf_indices = split_glnexus.split_vcf_indices + } +} diff --git a/workflows/main.wdl b/workflows/main.wdl deleted file mode 100644 index 0ee719aa..00000000 --- a/workflows/main.wdl +++ /dev/null @@ -1,172 +0,0 @@ -version 1.0 - -import "humanwgs_structs.wdl" -import "wdl-common/wdl/workflows/backend_configuration/backend_configuration.wdl" as BackendConfiguration -import "sample_analysis/sample_analysis.wdl" as SampleAnalysis -import "cohort_analysis/cohort_analysis.wdl" as CohortAnalysis -import "tertiary_analysis/tertiary_analysis.wdl" as TertiaryAnalysis - -workflow humanwgs { - input { - Cohort cohort - - ReferenceData reference - SlivarData? slivar_data - - String deepvariant_version = "1.5.0" - DeepVariantModel? 
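The pbsv_splits file referenced by ref_map["pbsv_splits"] drives the scatter width of pbsv_call: get_pbsv_splits returns a list of region groups, and each inner list becomes the regions input for one shard. The previous sample_analysis workflow (removed in this change) read the equivalent file with read_json into an Array[Array[String]], so its content appears to be a JSON array of arrays of region names. An illustrative sketch only; the real groupings are defined by the reference data bundle, and these chromosome groupings are hypothetical:

[
  ["chr1"],
  ["chr2", "chr3"],
  ["chr4", "chr5", "chr6"],
  ["chrX", "chrY"]
]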
deepvariant_model - - Int? pbsv_call_mem_gb - Int? glnexus_mem_gb - - Boolean run_tertiary_analysis = false - - # Backend configuration - String backend - String? zones - String? aws_spot_queue_arn - String? aws_on_demand_queue_arn - String? container_registry - - Boolean preemptible - } - - call BackendConfiguration.backend_configuration { - input: - backend = backend, - zones = zones, - aws_spot_queue_arn = aws_spot_queue_arn, - aws_on_demand_queue_arn = aws_on_demand_queue_arn, - container_registry = container_registry - } - - RuntimeAttributes default_runtime_attributes = if preemptible then backend_configuration.spot_runtime_attributes else backend_configuration.on_demand_runtime_attributes - - scatter (sample in cohort.samples) { - call SampleAnalysis.sample_analysis { - input: - sample = sample, - reference = reference, - deepvariant_version = deepvariant_version, - deepvariant_model = deepvariant_model, - default_runtime_attributes = default_runtime_attributes - } - } - - if (length(cohort.samples) > 1) { - - scatter (sample in cohort.samples) { - String sample_id = sample.sample_id - } - - call CohortAnalysis.cohort_analysis { - input: - cohort_id = cohort.cohort_id, - sample_ids = sample_id, - aligned_bams = flatten(sample_analysis.aligned_bams), - svsigs = flatten(sample_analysis.svsigs), - gvcfs = sample_analysis.small_variant_gvcf, - reference = reference, - pbsv_call_mem_gb = pbsv_call_mem_gb, - glnexus_mem_gb = glnexus_mem_gb, - default_runtime_attributes = default_runtime_attributes - } - } - - if (run_tertiary_analysis && defined(slivar_data) && defined(reference.gnomad_af) && defined(reference.hprc_af) && defined(reference.gff) && defined(reference.population_vcfs)) { - IndexData slivar_small_variant_input_vcf = select_first([ - cohort_analysis.phased_joint_small_variant_vcf, - sample_analysis.phased_small_variant_vcf[0] - ]) - IndexData slivar_sv_input_vcf = select_first([ - cohort_analysis.phased_joint_sv_vcf, - sample_analysis.phased_sv_vcf[0] - ]) - - call TertiaryAnalysis.tertiary_analysis { - input: - cohort = cohort, - small_variant_vcf = slivar_small_variant_input_vcf, - sv_vcf = slivar_sv_input_vcf, - reference = reference, - slivar_data = select_first([slivar_data]), - default_runtime_attributes = default_runtime_attributes - } - } - - output { - # sample_analysis output - - # per movie stats, alignments - Array[Array[File]] bam_stats = sample_analysis.bam_stats - Array[Array[File]] read_length_summary = sample_analysis.read_length_summary - Array[Array[File]] read_quality_summary = sample_analysis.read_quality_summary - - # per sample small variant calls - Array[IndexData] small_variant_gvcfs = sample_analysis.small_variant_gvcf - Array[File] small_variant_vcf_stats = sample_analysis.small_variant_vcf_stats - Array[File] small_variant_roh_out = sample_analysis.small_variant_roh_out - Array[File] small_variant_roh_bed = sample_analysis.small_variant_roh_bed - - # per sample final phased variant calls and haplotagged alignments - Array[IndexData] sample_phased_small_variant_vcfs = sample_analysis.phased_small_variant_vcf - Array[IndexData] sample_phased_sv_vcfs = sample_analysis.phased_sv_vcf - Array[File] sample_hiphase_stats = sample_analysis.hiphase_stats - Array[File] sample_hiphase_blocks = sample_analysis.hiphase_blocks - Array[File] sample_hiphase_haplotags = sample_analysis.hiphase_haplotags - Array[IndexData] merged_haplotagged_bam = sample_analysis.merged_haplotagged_bam - Array[File] haplotagged_bam_mosdepth_summary = 
sample_analysis.haplotagged_bam_mosdepth_summary - Array[File] haplotagged_bam_mosdepth_region_bed = sample_analysis.haplotagged_bam_mosdepth_region_bed - - # per sample trgt outputs - Array[IndexData] trgt_spanning_reads = sample_analysis.trgt_spanning_reads - Array[IndexData] trgt_repeat_vcf = sample_analysis.trgt_repeat_vcf - Array[File] trgt_dropouts = sample_analysis.trgt_dropouts - - # per sample cpg outputs - Array[Array[File]] cpg_pileup_beds = sample_analysis.cpg_pileup_beds - Array[Array[File]] cpg_pileup_bigwigs = sample_analysis.cpg_pileup_bigwigs - - # per sample paraphase outputs - Array[File] paraphase_output_jsons = sample_analysis.paraphase_output_json - Array[IndexData] paraphase_realigned_bams = sample_analysis.paraphase_realigned_bam - Array[Array[File]] paraphase_vcfs = sample_analysis.paraphase_vcfs - - # per sample hificnv outputs - Array[IndexData] hificnv_vcfs = sample_analysis.hificnv_vcf - Array[File] hificnv_copynum_bedgraphs = sample_analysis.hificnv_copynum_bedgraph - Array[File] hificnv_depth_bws = sample_analysis.hificnv_depth_bw - Array[File] hificnv_maf_bws = sample_analysis.hificnv_maf_bw - - # cohort_analysis output - IndexData? cohort_sv_vcf = cohort_analysis.phased_joint_sv_vcf - IndexData? cohort_small_variant_vcf = cohort_analysis.phased_joint_small_variant_vcf - File? cohort_hiphase_stats = cohort_analysis.hiphase_stats - File? cohort_hiphase_blocks = cohort_analysis.hiphase_blocks - - # tertiary_analysis output - IndexData? filtered_small_variant_vcf = tertiary_analysis.filtered_small_variant_vcf - IndexData? compound_het_small_variant_vcf = tertiary_analysis.compound_het_small_variant_vcf - File? filtered_small_variant_tsv = tertiary_analysis.filtered_small_variant_tsv - File? compound_het_small_variant_tsv = tertiary_analysis.compound_het_small_variant_tsv - IndexData? filtered_svpack_vcf = tertiary_analysis.filtered_svpack_vcf - File? filtered_svpack_tsv = tertiary_analysis.filtered_svpack_tsv - } - - parameter_meta { - cohort: {help: "Sample information for the cohort"} - reference: {help: "Reference genome data"} - slivar_data: {help: "Data files used for annotation with slivar (required if `run_tertiary_analysis` is set to `true`)"} - deepvariant_version: {help: "Version of deepvariant to use"} - deepvariant_model: {help: "Optional deepvariant model file to use"} - pbsv_call_mem_gb: {help: "Optional amount of RAM in GB for pbsv_call; default 64 for cohorts N<=3, 96 for cohorts N>3"} - glnexus_mem_gb: {help: "Optional amount of RAM in GB for glnexus; default 30"} - run_tertiary_analysis: {help: "Run the optional tertiary analysis steps"} - backend: {help: "Backend where the workflow will be executed ['GCP', 'Azure', 'AWS', 'HPC']"} - zones: {help: "Zones where compute will take place; required if backend is set to 'AWS' or 'GCP'"} - aws_spot_queue_arn: {help: "Queue ARN for the spot batch queue; required if backend is set to 'AWS'"} - aws_on_demand_queue_arn: {help: "Queue ARN for the on demand batch queue; required if backend is set to 'AWS'"} - container_registry: {help: "Container registry where workflow images are hosted. 
If left blank, PacBio's public Quay.io registry will be used."} - preemptible: {help: "Where possible, run tasks preemptibly"} - } -} diff --git a/workflows/sample_analysis/inputs.json b/workflows/sample_analysis/inputs.json deleted file mode 100644 index 6a7ff8ca..00000000 --- a/workflows/sample_analysis/inputs.json +++ /dev/null @@ -1,41 +0,0 @@ -{ - "sample_analysis.sample": { - "sample_id": "String", - "movie_bams": [ - "File" - ], - "sex": "String? (optional)", - "affected": "Boolean", - "father_id": "String? (optional)", - "mother_id": "String? (optional)" - }, - "sample_analysis.reference": { - "name": "String", - "fasta": { - "data": "File", - "data_index": "File" - }, - "pbsv_splits": "File", - "tandem_repeat_bed": "File", - "trgt_tandem_repeat_bed": "File", - "hificnv_exclude_bed": { - "data": "File", - "data_index": "File" - }, - "hificnv_expected_bed_male": "File", - "hificnv_expected_bed_female": "File", - "gnomad_af": "File? (optional)", - "hprc_af": "File? (optional)", - "gff": "File? (optional)", - "population_vcfs": "Array[WomCompositeType {\n data -> File\ndata_index -> File \n}]? (optional)" - }, - "sample_analysis.deepvariant_version": "String", - "sample_analysis.deepvariant_model": "WomCompositeType {\n model -> WomCompositeType {\n data -> File\ndata_index -> File \n}\nmetadata -> File \n}? (optional)", - "sample_analysis.default_runtime_attributes": { - "preemptible_tries": "Int", - "max_retries": "Int", - "zones": "String", - "queue_arn": "String", - "container_registry": "String" - } -} diff --git a/workflows/sample_analysis/sample_analysis.wdl b/workflows/sample_analysis/sample_analysis.wdl deleted file mode 100644 index 1f4f1a8d..00000000 --- a/workflows/sample_analysis/sample_analysis.wdl +++ /dev/null @@ -1,737 +0,0 @@ -version 1.0 - -# Run for each sample in the cohort. Aligns reads from each movie to the reference genome, then calls and phases small and structural variants. - -import "../humanwgs_structs.wdl" -import "../wdl-common/wdl/tasks/pbsv_discover.wdl" as PbsvDiscover -import "../wdl-common/wdl/workflows/deepvariant/deepvariant.wdl" as DeepVariant -import "../wdl-common/wdl/tasks/mosdepth.wdl" as Mosdepth -import "../wdl-common/wdl/tasks/pbsv_call.wdl" as PbsvCall -import "../wdl-common/wdl/tasks/concat_vcf.wdl" as ConcatVcf -import "../wdl-common/wdl/workflows/hiphase/hiphase.wdl" as HiPhase - -workflow sample_analysis { - input { - Sample sample - - ReferenceData reference - - String deepvariant_version - DeepVariantModel? 
deepvariant_model - - RuntimeAttributes default_runtime_attributes - } - - Array[Array[String]] pbsv_splits = read_json(reference.pbsv_splits) - - scatter (movie_bam in sample.movie_bams) { - call pbmm2_align { - input: - sample_id = sample.sample_id, - bam = movie_bam, - reference = reference.fasta.data, - reference_index = reference.fasta.data_index, - reference_name = reference.name, - runtime_attributes = default_runtime_attributes - } - - call PbsvDiscover.pbsv_discover { - input: - aligned_bam = pbmm2_align.aligned_bam, - aligned_bam_index = pbmm2_align.aligned_bam_index, - reference_tandem_repeat_bed = reference.tandem_repeat_bed, - runtime_attributes = default_runtime_attributes - } - - IndexData aligned_bam = { - "data": pbmm2_align.aligned_bam, - "data_index": pbmm2_align.aligned_bam_index - } - } - - call DeepVariant.deepvariant { - input: - sample_id = sample.sample_id, - aligned_bams = aligned_bam, - reference_fasta = reference.fasta, - reference_name = reference.name, - deepvariant_version = deepvariant_version, - deepvariant_model = deepvariant_model, - default_runtime_attributes = default_runtime_attributes - } - - call bcftools { - input: - vcf = deepvariant.vcf.data, - stats_params = "--apply-filters PASS --samples ~{sample.sample_id}", - reference = reference.fasta.data, - runtime_attributes = default_runtime_attributes - } - - scatter (shard_index in range(length(pbsv_splits))) { - Array[String] region_set = pbsv_splits[shard_index] - - call PbsvCall.pbsv_call { - input: - sample_id = sample.sample_id, - svsigs = pbsv_discover.svsig, - reference = reference.fasta.data, - reference_index = reference.fasta.data_index, - reference_name = reference.name, - shard_index = shard_index, - regions = region_set, - runtime_attributes = default_runtime_attributes - } - } - - call ConcatVcf.concat_vcf { - input: - vcfs = pbsv_call.pbsv_vcf, - vcf_indices = pbsv_call.pbsv_vcf_index, - output_vcf_name = "~{sample.sample_id}.~{reference.name}.pbsv.vcf.gz", - runtime_attributes = default_runtime_attributes - } - - IndexData zipped_pbsv_vcf = { - "data": concat_vcf.concatenated_vcf, - "data_index": concat_vcf.concatenated_vcf_index - } - - call HiPhase.hiphase { - # vcfs order: small variants, SVs - input: - id = sample.sample_id, - refname = reference.name, - sample_ids = [sample.sample_id], - vcfs = [deepvariant.vcf, zipped_pbsv_vcf], - bams = aligned_bam, - haplotag = true, - reference_fasta = reference.fasta, - default_runtime_attributes = default_runtime_attributes - } - - # merge haplotagged bams if there are multiple - if (length(hiphase.haplotagged_bams) > 1) { - scatter (bam_object in hiphase.haplotagged_bams) { - File bam_to_merge = bam_object.data - } - call merge_bams { - input: - bams = bam_to_merge, - output_bam_name = "~{sample.sample_id}.~{reference.name}.haplotagged.bam", - runtime_attributes = default_runtime_attributes - } - } - - # select the merged bam if it exists, otherwise select the first (only) haplotagged bam - File haplotagged_bam = select_first([merge_bams.merged_bam, hiphase.haplotagged_bams[0].data]) - File haplotagged_bam_index = select_first([merge_bams.merged_bam_index, hiphase.haplotagged_bams[0].data_index]) - - call Mosdepth.mosdepth { - input: - aligned_bam = haplotagged_bam, - aligned_bam_index = haplotagged_bam_index, - runtime_attributes = default_runtime_attributes - } - - call trgt { - input: - sample_id = sample.sample_id, - sex = sample.sex, - bam = haplotagged_bam, - bam_index = haplotagged_bam_index, - reference = reference.fasta.data, - 
reference_index = reference.fasta.data_index, - tandem_repeat_bed = reference.trgt_tandem_repeat_bed, - runtime_attributes = default_runtime_attributes - } - - call coverage_dropouts { - input: - bam = haplotagged_bam, - bam_index = haplotagged_bam_index, - tandem_repeat_bed = reference.trgt_tandem_repeat_bed, - output_prefix = "~{sample.sample_id}.~{reference.name}", - runtime_attributes = default_runtime_attributes - } - - call cpg_pileup { - input: - bam = haplotagged_bam, - bam_index = haplotagged_bam_index, - output_prefix = "~{sample.sample_id}.~{reference.name}", - reference = reference.fasta.data, - reference_index = reference.fasta.data_index, - runtime_attributes = default_runtime_attributes - } - - call paraphase { - input: - sample_id = sample.sample_id, - bam = haplotagged_bam, - bam_index = haplotagged_bam_index, - reference = reference.fasta.data, - reference_index = reference.fasta.data_index, - out_directory = "~{sample.sample_id}.paraphase", - runtime_attributes = default_runtime_attributes - } - - call hificnv { - input: - sample_id = sample.sample_id, - sex = sample.sex, - bam = haplotagged_bam, - bam_index = haplotagged_bam_index, - phased_vcf = hiphase.phased_vcfs[0].data, - phased_vcf_index = hiphase.phased_vcfs[0].data_index, - reference = reference.fasta.data, - reference_index = reference.fasta.data_index, - exclude_bed = reference.hificnv_exclude_bed.data, - exclude_bed_index = reference.hificnv_exclude_bed.data_index, - expected_bed_male = reference.hificnv_expected_bed_male, - expected_bed_female = reference.hificnv_expected_bed_female, - output_prefix = "hificnv", - runtime_attributes = default_runtime_attributes - } - - output { - # per movie stats, alignments, and svsigs - Array[File] bam_stats = pbmm2_align.bam_stats - Array[File] read_length_summary = pbmm2_align.read_length_summary - Array[File] read_quality_summary = pbmm2_align.read_quality_summary - Array[IndexData] aligned_bams = aligned_bam - Array[File] svsigs = pbsv_discover.svsig - - # per sample small variant calls - IndexData small_variant_gvcf = deepvariant.gvcf - File small_variant_vcf_stats = bcftools.stats - File small_variant_roh_out = bcftools.roh_out - File small_variant_roh_bed = bcftools.roh_bed - - # per sample final phased variant calls and haplotagged alignments - # phased_vcfs order: small variants, SVs - IndexData phased_small_variant_vcf = hiphase.phased_vcfs[0] - IndexData phased_sv_vcf = hiphase.phased_vcfs[1] - File hiphase_stats = hiphase.hiphase_stats - File hiphase_blocks = hiphase.hiphase_blocks - File hiphase_haplotags = select_first([hiphase.hiphase_haplotags]) - IndexData merged_haplotagged_bam = {"data": haplotagged_bam, "data_index": haplotagged_bam_index} - File haplotagged_bam_mosdepth_summary = mosdepth.summary - File haplotagged_bam_mosdepth_region_bed = mosdepth.region_bed - - # per sample trgt outputs - IndexData trgt_spanning_reads = {"data": trgt.spanning_reads, "data_index": trgt.spanning_reads_index} - IndexData trgt_repeat_vcf = {"data": trgt.repeat_vcf, "data_index": trgt.repeat_vcf_index} - File trgt_dropouts = coverage_dropouts.trgt_dropouts - - # per sample cpg outputs - Array[File] cpg_pileup_beds = cpg_pileup.pileup_beds - Array[File] cpg_pileup_bigwigs = cpg_pileup.pileup_bigwigs - - # per sample paraphase outputs - File paraphase_output_json = paraphase.output_json - IndexData paraphase_realigned_bam = {"data": paraphase.realigned_bam, "data_index": paraphase.realigned_bam_index} - Array[File] paraphase_vcfs = paraphase.paraphase_vcfs - - # per 
sample hificnv outputs - IndexData hificnv_vcf = {"data": hificnv.cnv_vcf, "data_index": hificnv.cnv_vcf_index} - File hificnv_copynum_bedgraph = hificnv.copynum_bedgraph - File hificnv_depth_bw = hificnv.depth_bw - File hificnv_maf_bw = hificnv.maf_bw - } - - parameter_meta { - sample: {help: "Sample information and associated data files"} - reference: {help: "Reference genome data"} - deepvariant_version: {help: "Version of deepvariant to use"} - deepvariant_model: {help: "Optional deepvariant model file to use"} - default_runtime_attributes: {help: "Default RuntimeAttributes; spot if preemptible was set to true, otherwise on_demand"} - } -} - -task pbmm2_align { - input { - String sample_id - File bam - - File reference - File reference_index - String reference_name - - RuntimeAttributes runtime_attributes - } - - String movie = basename(bam, ".bam") - - Int threads = 24 - Int mem_gb = ceil(threads * 4) - Int disk_size = ceil((size(bam, "GB") + size(reference, "GB")) * 4 + 20) - - command <<< - set -euo pipefail - - pbmm2 --version - - pbmm2 align \ - --num-threads ~{threads} \ - --sort-memory 4G \ - --preset HIFI \ - --sample ~{sample_id} \ - --log-level INFO \ - --sort \ - --unmapped \ - ~{reference} \ - ~{bam} \ - ~{sample_id}.~{movie}.~{reference_name}.aligned.bam - - # movie stats - extract_read_length_and_qual.py \ - ~{bam} \ - > ~{sample_id}.~{movie}.read_length_and_quality.tsv - - awk '{{ b=int($2/1000); b=(b>39?39:b); print 1000*b "\t" $2; }}' \ - ~{sample_id}.~{movie}.read_length_and_quality.tsv \ - | sort -k1,1g \ - | datamash -g 1 count 1 sum 2 \ - | awk 'BEGIN {{ for(i=0;i<=39;i++) {{ print 1000*i"\t0\t0"; }} }} {{ print; }}' \ - | sort -k1,1g \ - | datamash -g 1 sum 2 sum 3 \ - > ~{sample_id}.~{movie}.read_length_summary.tsv - - awk '{{ print ($3>50?50:$3) "\t" $2; }}' \ - ~{sample_id}.~{movie}.read_length_and_quality.tsv \ - | sort -k1,1g \ - | datamash -g 1 count 1 sum 2 \ - | awk 'BEGIN {{ for(i=0;i<=60;i++) {{ print i"\t0\t0"; }} }} {{ print; }}' \ - | sort -k1,1g \ - | datamash -g 1 sum 2 sum 3 \ - > ~{sample_id}.~{movie}.read_quality_summary.tsv - >>> - - output { - File aligned_bam = "~{sample_id}.~{movie}.~{reference_name}.aligned.bam" - File aligned_bam_index = "~{sample_id}.~{movie}.~{reference_name}.aligned.bam.bai" - File bam_stats = "~{sample_id}.~{movie}.read_length_and_quality.tsv" - File read_length_summary = "~{sample_id}.~{movie}.read_length_summary.tsv" - File read_quality_summary = "~{sample_id}.~{movie}.read_quality_summary.tsv" - } - - runtime { - docker: "~{runtime_attributes.container_registry}/pbmm2@sha256:1013aa0fd5fb42c607d78bfe3ec3d19e7781ad3aa337bf84d144c61ed7d51fa1" - cpu: threads - memory: mem_gb + " GB" - disk: disk_size + " GB" - disks: "local-disk " + disk_size + " HDD" - preemptible: runtime_attributes.preemptible_tries - maxRetries: runtime_attributes.max_retries - awsBatchRetryAttempts: runtime_attributes.max_retries - queueArn: runtime_attributes.queue_arn - zones: runtime_attributes.zones - } -} - -task bcftools { - input { - File vcf - - String? 
stats_params - File reference - - RuntimeAttributes runtime_attributes - } - - String vcf_basename = basename(vcf, ".vcf.gz") - - Int threads = 2 - Int reference_size = if (defined(reference)) then ceil(size(reference, "GB")) else 0 - Int disk_size = ceil((size(vcf, "GB") + reference_size) * 2 + 20) - - command <<< - set -euo pipefail - - bcftools --version - - bcftools stats \ - --threads ~{threads - 1} \ - ~{stats_params} \ - ~{"--fasta-ref " + reference} \ - ~{vcf} \ - > ~{vcf_basename}.vcf.stats.txt - - bcftools roh \ - --threads ~{threads - 1} \ - --AF-dflt 0.4 \ - ~{vcf} \ - > ~{vcf_basename}.bcftools_roh.out - - echo -e "#chr\\tstart\\tend\\tqual" > ~{vcf_basename}.roh.bed - awk -v OFS='\t' '$1=="RG" {{ print $3, $4, $5, $8 }}' \ - ~{vcf_basename}.bcftools_roh.out \ - >> ~{vcf_basename}.roh.bed - >>> - - output { - File stats = "~{vcf_basename}.vcf.stats.txt" - File roh_out = "~{vcf_basename}.bcftools_roh.out" - File roh_bed = "~{vcf_basename}.roh.bed" - } - - runtime { - docker: "~{runtime_attributes.container_registry}/bcftools@sha256:46720a7ab5feba5be06d5269454a6282deec13060e296f0bc441749f6f26fdec" - cpu: threads - memory: "4 GB" - disk: disk_size + " GB" - disks: "local-disk " + disk_size + " HDD" - preemptible: runtime_attributes.preemptible_tries - maxRetries: runtime_attributes.max_retries - awsBatchRetryAttempts: runtime_attributes.max_retries - queueArn: runtime_attributes.queue_arn - zones: runtime_attributes.zones - } -} - -task merge_bams { - input { - Array[File] bams - - String output_bam_name - - RuntimeAttributes runtime_attributes - } - - Int threads = 8 - Int disk_size = ceil(size(bams, "GB") * 2 + 20) - - command <<< - set -euo pipefail - - samtools --version - - samtools merge \ - -@ ~{threads - 1} \ - -o ~{output_bam_name} \ - ~{sep=' ' bams} - - samtools index ~{output_bam_name} - >>> - - output { - File merged_bam = "~{output_bam_name}" - File merged_bam_index = "~{output_bam_name}.bai" - } - - runtime { - docker: "~{runtime_attributes.container_registry}/samtools@sha256:cbe496e16773d4ad6f2eec4bd1b76ff142795d160f9dd418318f7162dcdaa685" - cpu: threads - memory: "4 GB" - disk: disk_size + " GB" - disks: "local-disk " + disk_size + " LOCAL" - preemptible: runtime_attributes.preemptible_tries - maxRetries: runtime_attributes.max_retries - awsBatchRetryAttempts: runtime_attributes.max_retries - queueArn: runtime_attributes.queue_arn - zones: runtime_attributes.zones - } -} - -task trgt { - input { - String sample_id - String? sex - - File bam - File bam_index - - File reference - File reference_index - File tandem_repeat_bed - - RuntimeAttributes runtime_attributes - } - - Boolean sex_defined = defined(sex) - String karyotype = if select_first([sex, "FEMALE"]) == "MALE" then "XY" else "XX" - String bam_basename = basename(bam, ".bam") - Int threads = 4 - Int disk_size = ceil((size(bam, "GB") + size(reference, "GB")) * 2 + 20) - - command <<< - set -euo pipefail - - echo ~{if sex_defined then "" else "Sex is not defined for ~{sample_id}. 
Defaulting to karyotype XX for TRGT."} - - trgt --version - - trgt \ - --threads ~{threads} \ - --karyotype ~{karyotype} \ - --genome ~{reference} \ - --repeats ~{tandem_repeat_bed} \ - --reads ~{bam} \ - --output-prefix ~{bam_basename}.trgt - - bcftools --version - - bcftools sort \ - --output-type z \ - --output ~{bam_basename}.trgt.sorted.vcf.gz \ - ~{bam_basename}.trgt.vcf.gz - - bcftools index \ - --threads ~{threads - 1} \ - --tbi \ - ~{bam_basename}.trgt.sorted.vcf.gz - - samtools --version - - samtools sort \ - -@ ~{threads - 1} \ - -o ~{bam_basename}.trgt.spanning.sorted.bam \ - ~{bam_basename}.trgt.spanning.bam - - samtools index \ - -@ ~{threads - 1} \ - ~{bam_basename}.trgt.spanning.sorted.bam - >>> - - output { - File spanning_reads = "~{bam_basename}.trgt.spanning.sorted.bam" - File spanning_reads_index = "~{bam_basename}.trgt.spanning.sorted.bam.bai" - File repeat_vcf = "~{bam_basename}.trgt.sorted.vcf.gz" - File repeat_vcf_index = "~{bam_basename}.trgt.sorted.vcf.gz.tbi" - } - - runtime { - docker: "~{runtime_attributes.container_registry}/trgt@sha256:8c9f236eb3422e79d7843ffd59e1cbd9b76774525f20d88cd68ca64eb63054eb" - cpu: threads - memory: "4 GB" - disk: disk_size + " GB" - disks: "local-disk " + disk_size + " HDD" - preemptible: runtime_attributes.preemptible_tries - maxRetries: runtime_attributes.max_retries - awsBatchRetryAttempts: runtime_attributes.max_retries - queueArn: runtime_attributes.queue_arn - zones: runtime_attributes.zones - } -} - -task coverage_dropouts { - input { - File bam - File bam_index - - File tandem_repeat_bed - - String output_prefix - - RuntimeAttributes runtime_attributes - } - - Int threads = 2 - Int disk_size = ceil((size(bam, "GB")) * 2 + 20) - - command <<< - set -euo pipefail - - # Get coverage dropouts - check_trgt_coverage.py \ - ~{tandem_repeat_bed} \ - ~{bam} \ - > ~{output_prefix}.trgt.dropouts.txt - >>> - - output { - File trgt_dropouts = "~{output_prefix}.trgt.dropouts.txt" - } - - runtime { - docker: "~{runtime_attributes.container_registry}/trgt@sha256:8c9f236eb3422e79d7843ffd59e1cbd9b76774525f20d88cd68ca64eb63054eb" - cpu: threads - memory: "4 GB" - disk: disk_size + " GB" - disks: "local-disk " + disk_size + " HDD" - preemptible: runtime_attributes.preemptible_tries - maxRetries: runtime_attributes.max_retries - awsBatchRetryAttempts: runtime_attributes.max_retries - queueArn: runtime_attributes.queue_arn - zones: runtime_attributes.zones - } -} - -task cpg_pileup { - input { - File bam - File bam_index - - String output_prefix - - File reference - File reference_index - - RuntimeAttributes runtime_attributes - } - - Int threads = 12 - # Uses ~4 GB memory / thread - Int mem_gb = threads * 4 - Int disk_size = ceil((size(bam, "GB") + size(reference, "GB")) * 2 + 20) - - command <<< - set -euo pipefail - - aligned_bam_to_cpg_scores --version - - aligned_bam_to_cpg_scores \ - --threads ~{threads} \ - --bam ~{bam} \ - --ref ~{reference} \ - --output-prefix ~{output_prefix} \ - --min-mapq 1 \ - --min-coverage 10 \ - --model "$PILEUP_MODEL_DIR"/pileup_calling_model.v1.tflite - >>> - - output { - Array[File] pileup_beds = glob("~{output_prefix}.*.bed") - Array[File] pileup_bigwigs = glob("~{output_prefix}.*.bw") - } - - runtime { - docker: "~{runtime_attributes.container_registry}/pb-cpg-tools@sha256:b95ff1c53bb16e53b8c24f0feaf625a4663973d80862518578437f44385f509b" - cpu: threads - memory: mem_gb + " GB" - disk: disk_size + " GB" - disks: "local-disk " + disk_size + " HDD" - preemptible: runtime_attributes.preemptible_tries - 
maxRetries: runtime_attributes.max_retries - awsBatchRetryAttempts: runtime_attributes.max_retries - queueArn: runtime_attributes.queue_arn - zones: runtime_attributes.zones - } -} - -task paraphase { - input { - File bam - File bam_index - - File reference - File reference_index - - String sample_id - String out_directory - - RuntimeAttributes runtime_attributes - } - - Int threads = 4 - Int mem_gb = 4 - Int disk_size = ceil(size(bam, "GB") + 20) - - command <<< - set -euo pipefail - - paraphase --version - - paraphase \ - --threads ~{threads} \ - --bam ~{bam} \ - --reference ~{reference} \ - --out ~{out_directory} - >>> - - output { - File output_json = "~{out_directory}/~{sample_id}.json" - File realigned_bam = "~{out_directory}/~{sample_id}_realigned_tagged.bam" - File realigned_bam_index = "~{out_directory}/~{sample_id}_realigned_tagged.bam.bai" - Array[File] paraphase_vcfs = glob("~{out_directory}/~{sample_id}_vcfs/*.vcf") - } - - runtime { - docker: "~{runtime_attributes.container_registry}/paraphase@sha256:186dec5f6dabedf8c90fe381cd8f934d31fe74310175efee9ca4f603deac954d" - cpu: threads - memory: mem_gb + " GB" - disk: disk_size + " GB" - disks: "local-disk " + disk_size + " HDD" - preemptible: runtime_attributes.preemptible_tries - maxRetries: runtime_attributes.max_retries - awsBatchRetryAttempts: runtime_attributes.max_retries - queueArn: runtime_attributes.queue_arn - zones: runtime_attributes.zones - } -} - -task hificnv { - input { - String sample_id - String? sex - - File bam - File bam_index - - File phased_vcf - File phased_vcf_index - - File reference - File reference_index - - File exclude_bed - File exclude_bed_index - - File expected_bed_male - File expected_bed_female - - String output_prefix - - RuntimeAttributes runtime_attributes - } - - Boolean sex_defined = defined(sex) - File expected_bed = if select_first([sex, "FEMALE"]) == "MALE" then expected_bed_male else expected_bed_female - - Int threads = 8 - # Uses ~2 GB memory / thread - Int mem_gb = threads * 2 - # <1 GB for output - Int disk_size = ceil((size(bam, "GB") + size(reference, "GB"))+ 20) - - command <<< - set -euo pipefail - - echo ~{if sex_defined then "" else "Sex is not defined for ~{sample_id}. 
Defaulting to karyotype XX for HiFiCNV."} - - hificnv --version - - hificnv \ - --threads ~{threads} \ - --bam ~{bam} \ - --ref ~{reference} \ - --maf ~{phased_vcf} \ - --exclude ~{exclude_bed} \ - --expected-cn ~{expected_bed} \ - --output-prefix ~{output_prefix} - - bcftools index --tbi ~{output_prefix}.~{sample_id}.vcf.gz - >>> - - output { - File cnv_vcf = "~{output_prefix}.~{sample_id}.vcf.gz" - File cnv_vcf_index = "~{output_prefix}.~{sample_id}.vcf.gz.tbi" - File copynum_bedgraph = "~{output_prefix}.~{sample_id}.copynum.bedgraph" - File depth_bw = "~{output_prefix}.~{sample_id}.depth.bw" - File maf_bw = "~{output_prefix}.~{sample_id}.maf.bw" - } - - runtime { - docker: "~{runtime_attributes.container_registry}/hificnv@sha256:19fdde99ad2454598ff7d82f27209e96184d9a6bb92dc0485cc7dbe87739b3c2" - cpu: threads - memory: mem_gb + " GB" - disk: disk_size + " GB" - disks: "local-disk " + disk_size + " HDD" - preemptible: runtime_attributes.preemptible_tries - maxRetries: runtime_attributes.max_retries - awsBatchRetryAttempts: runtime_attributes.max_retries - queueArn: runtime_attributes.queue_arn - zones: runtime_attributes.zones - } -} diff --git a/workflows/singleton.inputs.json b/workflows/singleton.inputs.json new file mode 100644 index 00000000..472b8867 --- /dev/null +++ b/workflows/singleton.inputs.json @@ -0,0 +1,22 @@ +{ + "humanwgs_singleton.sample_id": "String", + "humanwgs_singleton.sex": "String? (optional)", + "humanwgs_singleton.hifi_reads": [ + "File" + ], + "humanwgs_singleton.phenotypes": "String? (optional)", + "humanwgs_singleton.ref_map_file": "File", + "humanwgs_singleton.deepvariant_version": "String (optional, default = \"1.6.1\")", + "humanwgs_singleton.custom_deepvariant_model_tar": "File? (optional)", + "humanwgs_singleton.pharmcat_version": "String (optional, default = \"2.15.0\")", + "humanwgs_singleton.pharmcat_min_coverage": "Int (optional, default = 10)", + "humanwgs_singleton.tertiary_map_file": "File? (optional)", + "humanwgs_singleton.gpu": "Boolean (optional, default = false)", + "humanwgs_singleton.backend": "String", + "humanwgs_singleton.zones": "String? (optional)", + "humanwgs_singleton.gpuType": "String? (optional)", + "humanwgs_singleton.container_registry": "String? (optional)", + "humanwgs_singleton.container_namespace": "String? (optional)", + "humanwgs_singleton.preemptible": "Boolean", + "humanwgs_singleton.debug_version": "String? (optional)" +} \ No newline at end of file diff --git a/workflows/singleton.wdl b/workflows/singleton.wdl new file mode 100644 index 00000000..e94ad8ac --- /dev/null +++ b/workflows/singleton.wdl @@ -0,0 +1,337 @@ +version 1.0 + +import "humanwgs_structs.wdl" +import "wdl-common/wdl/workflows/backend_configuration/backend_configuration.wdl" as BackendConfiguration +import "upstream/upstream.wdl" as Upstream +import "downstream/downstream.wdl" as Downstream +import "wdl-common/wdl/tasks/write_ped_phrank.wdl" as Write_ped_phrank +import "tertiary/tertiary.wdl" as TertiaryAnalysis +import "wdl-common/wdl/tasks/utilities.wdl" as Utilities + + +workflow humanwgs_singleton { + meta { + description: "PacBio HiFi human whole genome sequencing pipeline for individual samples." + } + + parameter_meta { + sample_id: { + name: "Unique identifier for the sample" + } + sex: { + name: "Sample sex", + choices: ["MALE", "FEMALE"] + } + hifi_reads: { + name: "Array of paths to HiFi reads in unaligned BAM format." 
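By analogy with the family example, a minimal filled-in singleton.inputs.json built from the template above might look like the sketch below. The sample ID, bucket paths, and zone string are hypothetical; the GCP backend is chosen here, which is why zones is supplied:

{
  "humanwgs_singleton.sample_id": "HG002",
  "humanwgs_singleton.sex": "MALE",
  "humanwgs_singleton.hifi_reads": ["gs://example-bucket/HG002.hifi_reads.bam"],
  "humanwgs_singleton.ref_map_file": "gs://example-bucket/GRCh38.ref_map.tsv",
  "humanwgs_singleton.backend": "GCP",
  "humanwgs_singleton.zones": "us-central1-a us-central1-b us-central1-c",
  "humanwgs_singleton.preemptible": true
}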
+ } + ref_map_file: { + name: "TSV containing reference genome file paths; must match backend" + } + deepvariant_version: { + name: "DeepVariant version" + } + custom_deepvariant_model_tar: { + name: "Custom DeepVariant model tarball" + } + pharmcat_version: { + name: "PharmCAT version" + } + pharmcat_min_coverage: { + name: "Minimum coverage for PharmCAT" + } + phenotypes: { + name: "Comma-delimited list of HPO codes for phenotypes" + } + tertiary_map_file: { + name: "TSV containing tertiary analysis file paths and thresholds; must match backend" + } + gpu: { + name: "Use GPU when possible" + } + backend: { + name: "Backend where the workflow will be executed", + choices: ["GCP", "Azure", "AWS-HealthOmics", "HPC"] + } + zones: { + name: "Zones where compute will take place; required if backend is set to 'GCP'" + } + gpuType: { + name: "GPU type to use; required if gpu is set to `true` for cloud backends; must match backend" + } + container_registry: { + name: "Container registry where workflow images are hosted. If left blank, PacBio's public Quay.io registry will be used. Must be set if backend is set to 'AWS-HealthOmics'", + default: "quay.io/pacbio" + } + preemptible: { + name: "Where possible, run tasks preemptibly" + } + debug_version: { + name: "Debug version for testing purposes" + } + } + + input { + String sample_id + String? sex + Array[File] hifi_reads + + File ref_map_file + + # These options are only intended for testing purposes. + # There is no guarantee that the pipeline will work with + # other version of DeepVariant or with custom models. + String deepvariant_version = "1.6.1" + File? custom_deepvariant_model_tar + + String pharmcat_version = "2.15.4" + Int pharmcat_min_coverage = 10 + + String phenotypes = "HP:0000001" + File? tertiary_map_file + + Boolean gpu = false + + # Backend configuration + String backend + String? zones + String? gpuType + String? container_registry + + Boolean preemptible = true + + String? 
debug_version + } + + call BackendConfiguration.backend_configuration { + input: + backend = backend, + zones = zones, + gpuType = gpuType, + container_registry = container_registry + } + + RuntimeAttributes default_runtime_attributes = if preemptible then backend_configuration.spot_runtime_attributes else backend_configuration.on_demand_runtime_attributes + + call Upstream.upstream { + input: + sample_id = sample_id, + sex = sex, + hifi_reads = hifi_reads, + ref_map_file = ref_map_file, + deepvariant_version = deepvariant_version, + custom_deepvariant_model_tar = custom_deepvariant_model_tar, + single_sample = true, + gpu = gpu, + default_runtime_attributes = default_runtime_attributes + } + + call Downstream.downstream { + input: + sample_id = sample_id, + small_variant_vcf = upstream.small_variant_vcf, + small_variant_vcf_index = upstream.small_variant_vcf_index, + sv_vcf = select_first([upstream.sv_vcf]), + sv_vcf_index = select_first([upstream.sv_vcf_index]), + trgt_vcf = upstream.trgt_vcf, + trgt_vcf_index = upstream.trgt_vcf_index, + aligned_bam = upstream.out_bam, + aligned_bam_index = upstream.out_bam_index, + pharmcat_version = pharmcat_version, + pharmcat_min_coverage = pharmcat_min_coverage, + ref_map_file = ref_map_file, + default_runtime_attributes = default_runtime_attributes + } + + Map[String, Array[String]] stats = { + 'sample_id': [sample_id], + 'num_reads': [upstream.stat_num_reads], + 'read_length_min': [upstream.stat_read_length_mean], + 'read_length_median': [upstream.stat_read_length_median], + 'read_quality_mean': [upstream.stat_read_quality_mean], + 'read_quality_median': [upstream.stat_read_quality_median], + 'mapped_read_count': [downstream.stat_mapped_read_count], + 'mapped_percent': [downstream.stat_mapped_percent], + 'mean_depth': [upstream.stat_mean_depth], + 'inferred_sex': [upstream.inferred_sex], + 'stat_phased_basepairs': [downstream.stat_phased_basepairs], + 'phase_block_ng50': [downstream.stat_phase_block_ng50], + 'cpg_combined_count': [downstream.stat_combined_cpg_count], + 'cpg_hap1_count': [downstream.stat_hap1_cpg_count], + 'cpg_hap2_count': [downstream.stat_hap2_cpg_count], + 'SNV_count': [downstream.stat_SNV_count], + 'TSTV_ratio': [downstream.stat_TSTV_ratio], + 'HETHOM_ratio': [downstream.stat_HETHOM_ratio], + 'INDEL_count': [downstream.stat_INDEL_count], + 'sv_DUP_count': [downstream.stat_sv_DUP_count], + 'sv_DEL_count': [downstream.stat_sv_DEL_count], + 'sv_INS_count': [downstream.stat_sv_INS_count], + 'sv_INV_count': [downstream.stat_sv_INV_count], + 'sv_BND_count': [downstream.stat_sv_BND_count], + 'cnv_DUP_count': [upstream.stat_cnv_DUP_count], + 'cnv_DEL_count': [upstream.stat_cnv_DEL_count], + 'cnv_DUP_sum': [upstream.stat_cnv_DUP_sum], + 'cnv_DEL_sum': [upstream.stat_cnv_DEL_sum], + 'trgt_genotyped_count': [upstream.stat_trgt_genotyped_count], + 'trgt_uncalled_count': [upstream.stat_trgt_uncalled_count] + } + + call Utilities.consolidate_stats { + input: + id = sample_id, + stats = stats, + runtime_attributes = default_runtime_attributes + } + + if (defined(tertiary_map_file)) { + call Write_ped_phrank.write_ped_phrank { + input: + id = sample_id, + sex = select_first([sex, upstream.inferred_sex]), + phenotypes = phenotypes, + runtime_attributes = default_runtime_attributes + } + + call TertiaryAnalysis.tertiary_analysis { + input: + pedigree = write_ped_phrank.pedigree, + phrank_lookup = write_ped_phrank.phrank_lookup, + small_variant_vcf = downstream.phased_small_variant_vcf, + small_variant_vcf_index = 
downstream.phased_small_variant_vcf_index, + sv_vcf = downstream.phased_sv_vcf, + sv_vcf_index = downstream.phased_sv_vcf_index, + ref_map_file = ref_map_file, + tertiary_map_file = select_first([tertiary_map_file]), + default_runtime_attributes = default_runtime_attributes + } + } + + output { + # consolidated stats + File stats_file = consolidate_stats.output_tsv + + # bam stats + File bam_stats = upstream.read_length_and_quality + File read_length_plot = upstream.read_length_plot + File read_quality_plot = upstream.read_quality_plot + String stat_num_reads = upstream.stat_num_reads + String stat_read_length_mean = upstream.stat_read_length_mean + String stat_read_length_median = upstream.stat_read_length_median + String stat_read_quality_mean = upstream.stat_read_quality_mean + String stat_read_quality_median = upstream.stat_read_quality_median + + # merged, haplotagged alignments + File merged_haplotagged_bam = downstream.merged_haplotagged_bam + File merged_haplotagged_bam_index = downstream.merged_haplotagged_bam_index + String stat_mapped_read_count = downstream.stat_mapped_read_count + String stat_mapped_percent = downstream.stat_mapped_percent + File mapq_distribution_plot = downstream.mapq_distribution_plot + File mg_distribution_plot = downstream.mg_distribution_plot + + # mosdepth outputs + File mosdepth_summary = upstream.mosdepth_summary + File mosdepth_region_bed = upstream.mosdepth_region_bed + File mosdepth_region_bed_index = upstream.mosdepth_region_bed_index + File mosdepth_depth_distribution_plot = upstream.mosdepth_depth_distribution_plot + String stat_mean_depth = upstream.stat_mean_depth + String inferred_sex = upstream.inferred_sex + + # phasing stats + File phase_stats = downstream.phase_stats + File phase_blocks = downstream.phase_blocks + File phase_haplotags = downstream.phase_haplotags + String stat_phased_basepairs = downstream.stat_phased_basepairs + String stat_phase_block_ng50 = downstream.stat_phase_block_ng50 + + # cpg_pileup outputs + File cpg_combined_bed = downstream.cpg_combined_bed + File cpg_combined_bed_index = downstream.cpg_combined_bed_index + File cpg_hap1_bed = downstream.cpg_hap1_bed + File cpg_hap1_bed_index = downstream.cpg_hap1_bed_index + File cpg_hap2_bed = downstream.cpg_hap2_bed + File cpg_hap2_bed_index = downstream.cpg_hap2_bed_index + File cpg_combined_bw = downstream.cpg_combined_bw + File cpg_hap1_bw = downstream.cpg_hap1_bw + File cpg_hap2_bw = downstream.cpg_hap2_bw + String stat_cpg_hap1_count = downstream.stat_hap1_cpg_count + String stat_cpg_hap2_count = downstream.stat_hap2_cpg_count + String stat_cpg_combined_count = downstream.stat_combined_cpg_count + + # sv outputs + File phased_sv_vcf = downstream.phased_sv_vcf + File phased_sv_vcf_index = downstream.phased_sv_vcf_index + + # sv stats + String stat_sv_DUP_count = downstream.stat_sv_DUP_count + String stat_sv_DEL_count = downstream.stat_sv_DEL_count + String stat_sv_INS_count = downstream.stat_sv_INS_count + String stat_sv_INV_count = downstream.stat_sv_INV_count + String stat_sv_BND_count = downstream.stat_sv_BND_count + + # small variant outputs + File phased_small_variant_vcf = downstream.phased_small_variant_vcf + File phased_small_variant_vcf_index = downstream.phased_small_variant_vcf_index + File small_variant_gvcf = upstream.small_variant_gvcf + File small_variant_gvcf_index = upstream.small_variant_gvcf_index + + # small variant stats + File small_variant_stats = downstream.small_variant_stats + File bcftools_roh_out = downstream.bcftools_roh_out + File 
bcftools_roh_bed = downstream.bcftools_roh_bed + String stat_small_variant_SNV_count = downstream.stat_SNV_count + String stat_small_variant_INDEL_count = downstream.stat_INDEL_count + String stat_small_variant_TSTV_ratio = downstream.stat_TSTV_ratio + String stat_small_variant_HETHOM_ratio = downstream.stat_HETHOM_ratio + File snv_distribution_plot = downstream.snv_distribution_plot + File indel_distribution_plot = downstream.indel_distribution_plot + + # trgt outputs + File phased_trgt_vcf = downstream.phased_trgt_vcf + File phased_trgt_vcf_index = downstream.phased_trgt_vcf_index + File trgt_spanning_reads = upstream.trgt_spanning_reads + File trgt_spanning_reads_index = upstream.trgt_spanning_reads_index + File trgt_coverage_dropouts = upstream.trgt_coverage_dropouts + String stat_trgt_genotyped_count = upstream.stat_trgt_genotyped_count + String stat_trgt_uncalled_count = upstream.stat_trgt_uncalled_count + + # paraphase outputs + File paraphase_output_json = upstream.paraphase_output_json + File paraphase_realigned_bam = upstream.paraphase_realigned_bam + File paraphase_realigned_bam_index = upstream.paraphase_realigned_bam_index + File? paraphase_vcfs = upstream.paraphase_vcfs + + # per sample cnv outputs + File cnv_vcf = upstream.cnv_vcf + File cnv_vcf_index = upstream.cnv_vcf_index + File cnv_copynum_bedgraph = upstream.cnv_copynum_bedgraph + File cnv_depth_bw = upstream.cnv_depth_bw + File cnv_maf_bw = upstream.cnv_maf_bw + String stat_cnv_DUP_count = upstream.stat_cnv_DUP_count + String stat_cnv_DEL_count = upstream.stat_cnv_DEL_count + String stat_cnv_DUP_sum = upstream.stat_cnv_DUP_sum + String stat_cnv_DEL_sum = upstream.stat_cnv_DEL_sum + + # PGx outputs + File pbstarphase_json = downstream.pbstarphase_json + File pharmcat_match_json = downstream.pharmcat_match_json + File pharmcat_phenotype_json = downstream.pharmcat_phenotype_json + File pharmcat_report_html = downstream.pharmcat_report_html + File pharmcat_report_json = downstream.pharmcat_report_json + + # tertiary analysis outputs + File? pedigree = write_ped_phrank.pedigree + File? tertiary_small_variant_filtered_vcf = tertiary_analysis.small_variant_filtered_vcf + File? tertiary_small_variant_filtered_vcf_index = tertiary_analysis.small_variant_filtered_vcf_index + File? tertiary_small_variant_filtered_tsv = tertiary_analysis.small_variant_filtered_tsv + File? tertiary_small_variant_compound_het_vcf = tertiary_analysis.small_variant_compound_het_vcf + File? tertiary_small_variant_compound_het_vcf_index = tertiary_analysis.small_variant_compound_het_vcf_index + File? tertiary_small_variant_compound_het_tsv = tertiary_analysis.small_variant_compound_het_tsv + File? tertiary_sv_filtered_vcf = tertiary_analysis.sv_filtered_vcf + File? tertiary_sv_filtered_vcf_index = tertiary_analysis.sv_filtered_vcf_index + File? 
tertiary_sv_filtered_tsv = tertiary_analysis.sv_filtered_tsv + + # workflow metadata + String workflow_name = "humanwgs_singleton" + String workflow_version = "v2.0.0-rc6~{"-" + debug_version}" + } +} diff --git a/workflows/tertiary/inputs.json b/workflows/tertiary/inputs.json new file mode 100644 index 00000000..3116e69f --- /dev/null +++ b/workflows/tertiary/inputs.json @@ -0,0 +1,18 @@ +{ + "tertiary_analysis.pedigree": "File", + "tertiary_analysis.phrank_lookup": "File", + "tertiary_analysis.small_variant_vcf": "File", + "tertiary_analysis.small_variant_vcf_index": "File", + "tertiary_analysis.sv_vcf": "File", + "tertiary_analysis.sv_vcf_index": "File", + "tertiary_analysis.ref_map_file": "File", + "tertiary_analysis.tertiary_map_file": "File", + "tertiary_analysis.default_runtime_attributes": { + "max_retries": "Int", + "container_registry": "String", + "gpuType": "String", + "backend": "String", + "preemptible_tries": "Int", + "zones": "String" + } +} \ No newline at end of file diff --git a/workflows/tertiary/tertiary.wdl b/workflows/tertiary/tertiary.wdl new file mode 100644 index 00000000..3dce4dac --- /dev/null +++ b/workflows/tertiary/tertiary.wdl @@ -0,0 +1,631 @@ +version 1.0 + +import "../humanwgs_structs.wdl" +import "../wdl-common/wdl/tasks/utilities.wdl" as Utilities + +workflow tertiary_analysis { + meta { + description: "Run tertiary analysis on small and structural variants." + } + + parameter_meta { + pedigree: { + name: "PLINK pedigree (PED) format" + } + phrank_lookup: { + name: "Gene symbol -> Phrank phenotype rank score lookup table" + } + small_variant_vcf: { + name: "Small variant VCF" + } + small_variant_vcf_index: { + name: "Small variant VCF index" + } + sv_vcf: { + name: "Structural variant VCF" + } + sv_vcf_index: { + name: "Structural variant VCF index" + } + ref_map_file: { + name: "Reference map file" + } + tertiary_map_file: { + name: "Tertiary map file" + } + default_runtime_attributes: { + name: "Runtime attribute structure" + } + small_variant_filtered_vcf: { + name: "Filtered and annotated small variant VCF" + } + small_variant_filtered_vcf_index: { + name: "Filtered and annotated small variant VCF index" + } + small_variant_filtered_tsv: { + name: "Filtered and annotated small variant TSV" + } + small_variant_compound_het_vcf: { + name: "Filtered and annotated compound heterozygous small variant VCF" + } + small_variant_compound_het_vcf_index: { + name: "Filtered and annotated compound heterozygous small variant VCF index" + } + small_variant_compound_het_tsv: { + name: "Filtered and annotated compound heterozygous small variant TSV" + } + sv_filtered_vcf: { + name: "Filtered and annotated structural variant VCF" + } + sv_filtered_vcf_index: { + name: "Filtered and annotated structural variant VCF index" + } + sv_filtered_tsv: { + name: "Filtered and annotated structural variant TSV" + } + } + + input { + File pedigree + File phrank_lookup + + File small_variant_vcf + File small_variant_vcf_index + File sv_vcf + File sv_vcf_index + + File ref_map_file + File tertiary_map_file + + RuntimeAttributes default_runtime_attributes + } + + Map[String, String] ref_map = read_map(ref_map_file) + Map[String, String] tertiary_map = read_map(tertiary_map_file) + + call Utilities.split_string as split_gnotate_files { + input: + concatenated_string = tertiary_map["slivar_gnotate_files"], + delimiter = ",", + runtime_attributes = default_runtime_attributes + } + + call Utilities.split_string as split_gnotate_prefixes { + input: + concatenated_string =
tertiary_map["slivar_gnotate_prefixes"], + delimiter = ",", + runtime_attributes = default_runtime_attributes + } + + scatter (gnotate_prefix in split_gnotate_prefixes.array) { + # These would ideally be within slivar_small_variant, but + # cromwell doesn't yet support versions of WDL with the suffix function + # allele frequencies <= max_af in each of the frequency databases + String slivar_af_expr = "INFO.~{gnotate_prefix}_af <= ~{tertiary_map['slivar_max_af']}" + # nhomalt <= max_nhomalt in each of the frequency databases + String slivar_nhomalt_expr = "INFO.~{gnotate_prefix}_nhomalt <= ~{tertiary_map['slivar_max_nhomalt']}" + # allele counts <= max_ac in each of the frequency databases + String slivar_ac_expr = "INFO.~{gnotate_prefix}_ac <= ~{tertiary_map['slivar_max_ac']}" + # info fields for slivar tsv + Array[String] info_fields = ["~{gnotate_prefix}_af","~{gnotate_prefix}_nhomalt","~{gnotate_prefix}_ac"] + } + + call slivar_small_variant { + input: + vcf = small_variant_vcf, + vcf_index = small_variant_vcf_index, + pedigree = pedigree, + phrank_lookup = phrank_lookup, + reference = ref_map["fasta"], # !FileCoercion + reference_index = ref_map["fasta_index"], # !FileCoercion + gff = tertiary_map["ensembl_gff"], # !FileCoercion + lof_lookup = tertiary_map["lof_lookup"], # !FileCoercion + clinvar_lookup = tertiary_map["clinvar_lookup"], # !FileCoercion + slivar_js = tertiary_map["slivar_js"], # !FileCoercion + gnotate_files = split_gnotate_files.array, # !FileCoercion + af_expr = slivar_af_expr, + nhomalt_expr = slivar_nhomalt_expr, + ac_expr = slivar_ac_expr, + info_fields = flatten(info_fields), + min_gq = tertiary_map["slivar_min_gq"], + runtime_attributes = default_runtime_attributes + } + + call Utilities.split_string as split_sv_vcfs { + input: + concatenated_string = tertiary_map["svpack_pop_vcfs"], + delimiter = ",", + runtime_attributes = default_runtime_attributes + } + + call Utilities.split_string as split_sv_vcf_indices { + input: + concatenated_string = tertiary_map["svpack_pop_vcf_indices"], + delimiter = ",", + runtime_attributes = default_runtime_attributes + } + + call svpack_filter_annotated { + input: + sv_vcf = sv_vcf, + pedigree = pedigree, + population_vcfs = split_sv_vcfs.array, # !FileCoercion + population_vcf_indices = split_sv_vcf_indices.array, # !FileCoercion + gff = tertiary_map["ensembl_gff"], # !FileCoercion + runtime_attributes = default_runtime_attributes + } + + call slivar_svpack_tsv { + input: + filtered_vcf = svpack_filter_annotated.svpack_vcf, + pedigree = pedigree, + lof_lookup = tertiary_map["lof_lookup"], # !FileCoercion + clinvar_lookup = tertiary_map["clinvar_lookup"], # !FileCoercion + phrank_lookup = phrank_lookup, + runtime_attributes = default_runtime_attributes + } + + output { + File small_variant_filtered_vcf = slivar_small_variant.filtered_vcf + File small_variant_filtered_vcf_index = slivar_small_variant.filtered_vcf_index + File small_variant_filtered_tsv = slivar_small_variant.filtered_tsv + + File small_variant_compound_het_vcf = slivar_small_variant.compound_het_vcf + File small_variant_compound_het_vcf_index = slivar_small_variant.compound_het_vcf_index + File small_variant_compound_het_tsv = slivar_small_variant.compound_het_tsv + + File sv_filtered_vcf = svpack_filter_annotated.svpack_vcf + File sv_filtered_vcf_index = svpack_filter_annotated.svpack_vcf_index + File sv_filtered_tsv = slivar_svpack_tsv.svpack_tsv + } +} + +task slivar_small_variant { + meta { + description: "Filter and annotate small variants with slivar." 
+ } + parameter_meta { + vcf: { + name: "Small variant VCF" + } + vcf_index: { + name: "Small variant VCF index" + } + pedigree: { + name: "PLINK pedigree (PED) format" + } + phrank_lookup: { + name: "Gene symbol -> Phrank phenotype rank score lookup table" + } + reference: { + name: "Reference genome FASTA" + } + reference_index: { + name: "Reference genome FASTA index" + } + gff: { + name: "Ensembl GFF annotation" + } + lof_lookup: { + name: "Gene symbol -> LoF score lookup table" + } + clinvar_lookup: { + name: "Gene symbol -> ClinVar lookup table" + } + slivar_js: { + name: "Slivar functions JS file" + } + gnotate_files: { + name: "Slivar gnotate files with Allele Frequency (AF), Allele Count (AC), and Number of Homozygotes (nhomalt)" + } + af_expr: { + name: "Allele frequency expressions for slivar" + } + nhomalt_expr: { + name: "nhomalt expressions for slivar" + } + ac_expr: { + name: "Allele count expressions for slivar" + } + min_gq: { + name: "Min genotype quality" + } + runtime_attributes: { + name: "Runtime attribute structure" + } + filtered_vcf: { + name: "Filtered and annotated small variant VCF" + } + filtered_vcf_index: { + name: "Filtered and annotated small variant VCF index" + } + compound_het_vcf: { + name: "Filtered and annotated compound heterozygous small variant VCF" + } + compound_het_vcf_index: { + name: "Filtered and annotated compound heterozygous small variant VCF index" + } + filtered_tsv: { + name: "Filtered and annotated small variant TSV" + } + compound_het_tsv: { + name: "Filtered and annotated compound heterozygous small variant TSV" + } + } + + input { + File vcf + File vcf_index + + File pedigree + File phrank_lookup + + File reference + File reference_index + + File gff + File lof_lookup + File clinvar_lookup + + File slivar_js + Array[File] gnotate_files + + Array[String] af_expr + Array[String] nhomalt_expr + Array[String] ac_expr + Array[String] info_fields + + String min_gq + + RuntimeAttributes runtime_attributes + } + + # First, select only passing variants with AF and nhomalt lower than the specified thresholds + # The af_expr and nhomalt_expr arrays will be concatenated with this array + Array[Array[String]] info_expr = [['variant.FILTER=="PASS"'], af_expr, nhomalt_expr] + + # Implicit "high quality" filters from slivar_js are also applied in steps below + # min_GQ: 20, min_AB: 0.20, min_DP: 6, min_male_X_GQ: 10, min_male_X_DP: 6 + # hom_ref AB < 0.02, hom_alt AB > 0.98, het AB between min_AB and (1-min_AB) + + # Label recessive if all affected samples are HOMALT and all unaffected samples are HETALT or HOMREF + # Special case of x-linked recessive is also handled, see segregating_recessive_x in slivar docs + Array[String] family_recessive_expr = ['recessive:fam.every(segregating_recessive)'] + + # Label dominant if all affected samples are HETALT and all unaffected samples are HOMREF + # Special case of x-linked dominant is also handled, see segregating_dominant_x in slivar docs + # The ac_expr array will be concatenated with this array + Array[Array[String]] family_dominant_expr = [['dominant:fam.every(segregating_dominant)'], ac_expr] + + # Label comphet_side if the sample is HETALT and the GQ is above the specified threshold + Array[String] sample_expr = [ + 'comphet_side:sample.het', + 'sample.GQ > ~{min_gq}' + ] + + # Skip these variant types when looking for compound hets + Array[String] skip_list = [ + 'non_coding_transcript', + 'intron', + 'non_coding', + 'upstream_gene', + 'downstream_gene', + 'non_coding_transcript_exon', + 
'NMD_transcript', + '5_prime_UTR', + '3_prime_UTR' + ] + + String vcf_basename = basename(vcf, ".vcf.gz") + + Int threads = 8 + Int mem_gb = 2 * threads + Int disk_size = ceil((size(vcf, "GB") + size(reference, "GB") + size(gnotate_files, "GB") + size(gff, "GB") + size(lof_lookup, "GB") + size(clinvar_lookup, "GB") + size(phrank_lookup, "GB")) * 2 + 20) + + command <<< + set -euo pipefail + + bcftools --version + + bcftools norm \ + --threads ~{threads - 1} \ + --multiallelics \ + - \ + --output-type b \ + --fasta-ref ~{reference} \ + ~{vcf} \ + | bcftools sort \ + --output-type b \ + --output ~{vcf_basename}.norm.bcf + + bcftools index \ + --threads ~{threads - 1} \ + ~{vcf_basename}.norm.bcf + + # slivar has no version option + slivar expr 2>&1 | grep -Eo 'slivar version: [0-9.]+ [0-9a-f]+' + + pslivar \ + --processes ~{threads} \ + --fasta ~{reference} \ + --pass-only \ + --js ~{slivar_js} \ + --info '~{sep=" && " flatten(info_expr)}' \ + --family-expr '~{sep=" && " family_recessive_expr}' \ + --family-expr '~{sep=" && " flatten(family_dominant_expr)}' \ + --sample-expr '~{sep=" && " sample_expr}' \ + ~{sep=" " prefix("--gnotate ", gnotate_files)} \ + --vcf ~{vcf_basename}.norm.bcf \ + --ped ~{pedigree} \ + | bcftools csq \ + --local-csq \ + --samples - \ + --ncsq 40 \ + --gff-annot ~{gff} \ + --fasta-ref ~{reference} \ + - \ + --output-type z \ + --output ~{vcf_basename}.norm.slivar.vcf.gz + + bcftools index \ + --threads ~{threads - 1} \ + --tbi ~{vcf_basename}.norm.slivar.vcf.gz + + slivar \ + compound-hets \ + --skip ~{sep=',' skip_list} \ + --vcf ~{vcf_basename}.norm.slivar.vcf.gz \ + --sample-field comphet_side \ + --ped ~{pedigree} \ + --allow-non-trios \ + | add_comphet_phase.py \ + | bcftools view \ + --output-type z \ + --output ~{vcf_basename}.norm.slivar.compound_hets.vcf.gz + + bcftools index \ + --threads ~{threads - 1} \ + --tbi ~{vcf_basename}.norm.slivar.compound_hets.vcf.gz + + slivar tsv \ + --info-field ~{sep=' --info-field ' info_fields} \ + --sample-field dominant \ + --sample-field recessive \ + --csq-field BCSQ \ + --gene-description ~{lof_lookup} \ + --gene-description ~{clinvar_lookup} \ + --gene-description ~{phrank_lookup} \ + --ped ~{pedigree} \ + --out /dev/stdout \ + ~{vcf_basename}.norm.slivar.vcf.gz \ + | sed '1 s/gene_description_1/lof/;s/gene_description_2/clinvar/;s/gene_description_3/phrank/;' \ + > ~{vcf_basename}.norm.slivar.tsv + + slivar tsv \ + --info-field ~{sep=' --info-field ' info_fields} \ + --sample-field slivar_comphet \ + --info-field slivar_comphet \ + --csq-field BCSQ \ + --gene-description ~{lof_lookup} \ + --gene-description ~{clinvar_lookup} \ + --gene-description ~{phrank_lookup} \ + --ped ~{pedigree} \ + --out /dev/stdout \ + ~{vcf_basename}.norm.slivar.compound_hets.vcf.gz \ + | sed '1 s/gene_description_1/lof/;s/gene_description_2/clinvar/;s/gene_description_3/phrank/;' \ + > ~{vcf_basename}.norm.slivar.compound_hets.tsv + >>> + + output { + File filtered_vcf = "~{vcf_basename}.norm.slivar.vcf.gz" + File filtered_vcf_index = "~{vcf_basename}.norm.slivar.vcf.gz.tbi" + File compound_het_vcf = "~{vcf_basename}.norm.slivar.compound_hets.vcf.gz" + File compound_het_vcf_index = "~{vcf_basename}.norm.slivar.compound_hets.vcf.gz.tbi" + File filtered_tsv = "~{vcf_basename}.norm.slivar.tsv" + File compound_het_tsv = "~{vcf_basename}.norm.slivar.compound_hets.tsv" + } + + runtime { + docker: "~{runtime_attributes.container_registry}/slivar@sha256:f71a27f756e2d69ec30949cbea97c54abbafde757562a98ef965f21a28aa8eaa" + cpu: threads + memory: 
mem_gb + " GB" + disk: disk_size + " GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: runtime_attributes.preemptible_tries + maxRetries: runtime_attributes.max_retries + awsBatchRetryAttempts: runtime_attributes.max_retries + zones: runtime_attributes.zones + } +} + +task svpack_filter_annotated { + meta { + description: "Filter and annotate structural variants with svpack." + } + + parameter_meta { + pedigree: { + name: "PLINK pedigree (PED) format" + } + sv_vcf: { + name: "Structural variant VCF" + } + population_vcfs: { + name: "SV population VCFs" + } + population_vcf_indices: { + name: "SV population VCF indices" + } + gff: { + name: "Ensembl GFF annotation" + } + runtime_attributes: { + name: "Runtime attribute structure" + } + svpack_vcf: { + name: "Filtered and annotated structural variant VCF" + } + svpack_vcf_index: { + name: "Filtered and annotated structural variant VCF index" + } + } + + input { + File sv_vcf + File pedigree + + Array[File] population_vcfs + Array[File] population_vcf_indices + + File gff + + RuntimeAttributes runtime_attributes + } + + Int threads = 2 + Int mem_gb = 16 + Int disk_size = ceil(size(sv_vcf, "GB") * 2 + 20) + + String out_prefix = basename(sv_vcf, ".vcf.gz") + + command <<< + echo "svpack version:" + cat /opt/svpack/.git/HEAD + + affected=$(awk -F'\t' '$6 ~ /2/ {{ print $2 }}' ~{pedigree} | paste -sd',') # TODO: potentially replace awk + + svpack \ + filter \ + --pass-only \ + --min-svlen 50 \ + ~{sv_vcf} \ + ~{sep=' ' prefix('| svpack match -v - ', population_vcfs)} \ + | svpack \ + consequence \ + - \ + <(zcat ~{gff} || cat ~{gff}) \ + | svpack \ + tagzygosity \ + --samples "${affected}" \ + - \ + > ~{out_prefix}.svpack.vcf + + bgzip --version + + bgzip ~{out_prefix}.svpack.vcf + + tabix --version + + tabix --preset vcf ~{out_prefix}.svpack.vcf.gz + >>> + + output { + File svpack_vcf = "~{out_prefix}.svpack.vcf.gz" + File svpack_vcf_index = "~{out_prefix}.svpack.vcf.gz.tbi" + } + + runtime { + docker: "~{runtime_attributes.container_registry}/svpack@sha256:628e9851e425ed8044a907d33de04043d1ef02d4d2b2667cf2e9a389bb011eba" + cpu: threads + memory: mem_gb + " GB" + disk: disk_size + " GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: runtime_attributes.preemptible_tries + maxRetries: runtime_attributes.max_retries + awsBatchRetryAttempts: runtime_attributes.max_retries + zones: runtime_attributes.zones + } +} + +task slivar_svpack_tsv { + meta { + description: "Create spreadsheet-friendly TSV from svpack annotated VCFs." 
+ } + + parameter_meta { + filtered_vcf : { + name: "Filtered and annotated structural variant VCF" + } + pedigree: { + name: "PLINK pedigree (PED) format" + } + lof_lookup: { + name: "Gene symbol -> LoF score lookup table" + } + clinvar_lookup: { + name: "Gene symbol -> ClinVar lookup table" + } + phrank_lookup: { + name: "Gene symbol -> Phrank phenotype rank score lookup table" + } + runtime_attributes: { + name: "Runtime attribute structure" + } + svpack_tsv: { + name: "Filtered and annotated structural variant TSV" + } + } + + input { + File filtered_vcf + + File pedigree + File lof_lookup + File clinvar_lookup + File phrank_lookup + + RuntimeAttributes runtime_attributes + } + + Array[String] info_fields = [ + 'SVTYPE', + 'SVLEN', + 'SVANN', + 'CIPOS', + 'MATEID', + 'END' + ] + + String filtered_vcf_basename = basename(filtered_vcf, ".vcf.gz") + + Int threads = 2 + Int mem_gb = 4 + Int disk_size = ceil((size(filtered_vcf, "GB") + size(lof_lookup, "GB") + size(clinvar_lookup, "GB") + size(phrank_lookup, "GB")) * 2 + 20) + + command <<< + set -euo pipefail + + # slivar has no version option + slivar expr 2>&1 | grep -Eo 'slivar version: [0-9.]+ [0-9a-f]+' + + slivar tsv \ + --info-field ~{sep=' --info-field ' info_fields} \ + --sample-field hetalt \ + --sample-field homalt \ + --csq-field BCSQ \ + --gene-description ~{lof_lookup} \ + --gene-description ~{clinvar_lookup} \ + --gene-description ~{phrank_lookup} \ + --ped ~{pedigree} \ + --out /dev/stdout \ + ~{filtered_vcf} \ + | sed '1 s/gene_description_1/lof/;s/gene_description_2/clinvar/;s/gene_description_3/phrank/;' \ + > ~{filtered_vcf_basename}.tsv + >>> + + output { + File svpack_tsv = "~{filtered_vcf_basename}.tsv" + } + + runtime { + docker: "~{runtime_attributes.container_registry}/slivar@sha256:f71a27f756e2d69ec30949cbea97c54abbafde757562a98ef965f21a28aa8eaa" + cpu: threads + memory: mem_gb + " GB" + disk: disk_size + " GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: runtime_attributes.preemptible_tries + maxRetries: runtime_attributes.max_retries + awsBatchRetryAttempts: runtime_attributes.max_retries + zones: runtime_attributes.zones + } +} diff --git a/workflows/tertiary_analysis/inputs.json b/workflows/tertiary_analysis/inputs.json deleted file mode 100644 index a806db43..00000000 --- a/workflows/tertiary_analysis/inputs.json +++ /dev/null @@ -1,69 +0,0 @@ -{ - "tertiary_analysis.cohort": { - "cohort_id": "String", - "samples": [ - { - "sample_id": "String", - "movie_bams": [ - "File" - ], - "sex": "String?", - "affected": "Boolean", - "father_id": "String?", - "mother_id": "String?" 
- } - ], - "phenotypes": [ - "String" - ] - }, - "tertiary_analysis.small_variant_vcf": { - "data": "File", - "data_index": "File" - }, - "tertiary_analysis.sv_vcf": { - "data": "File", - "data_index": "File" - }, - "tertiary_analysis.reference": { - "name": "String", - "fasta": { - "data": "File", - "data_index": "File" - }, - "pbsv_splits": "File", - "tandem_repeat_bed": "File", - "trgt_tandem_repeat_bed": "File", - "gnomad_af": "File", - "hprc_af": "File", - "gff": "File", - "population_vcfs": [ - { - "data": "File", - "data_index": "File" - } - ], - "hificnv_exclude_bed": { - "data": "File", - "data_index": "File" - }, - "hificnv_expected_bed_male": "File", - "hificnv_expected_bed_female": "File" - }, - "tertiary_analysis.slivar_data": { - "slivar_js": "File", - "hpo_terms": "File", - "hpo_dag": "File", - "hpo_annotations": "File", - "ensembl_to_hgnc": "File", - "lof_lookup": "File", - "clinvar_lookup": "File" - }, - "tertiary_analysis.default_runtime_attributes": { - "preemptible_tries": "Int", - "max_retries": "Int", - "zones": "String", - "queue_arn": "String", - "container_registry": "String" - } -} diff --git a/workflows/tertiary_analysis/tertiary_analysis.wdl b/workflows/tertiary_analysis/tertiary_analysis.wdl deleted file mode 100644 index 94518816..00000000 --- a/workflows/tertiary_analysis/tertiary_analysis.wdl +++ /dev/null @@ -1,534 +0,0 @@ -version 1.0 - -# Annotate small and structural variant VCFs using slivar. Outputs annotated VCFs and TSVs. -# This workflow is run on a phased single-sample VCF if there is only a single individual in the cohort, otherwise it is run on the joint-called phased VCF. - -import "../humanwgs_structs.wdl" - -workflow tertiary_analysis { - input { - Cohort cohort - IndexData small_variant_vcf - IndexData sv_vcf - - ReferenceData reference - - SlivarData slivar_data - - RuntimeAttributes default_runtime_attributes - } - - call write_ped_phrank { - input: - cohort_id = cohort.cohort_id, - cohort_json = write_json(cohort), - phenotypes = cohort.phenotypes, - runtime_attributes = default_runtime_attributes - } - - call slivar_small_variant { - input: - vcf = small_variant_vcf.data, - vcf_index = small_variant_vcf.data_index, - pedigree = write_ped_phrank.pedigree, - reference = reference.fasta.data, - reference_index = reference.fasta.data_index, - slivar_js = slivar_data.slivar_js, - gnomad_af = select_first([reference.gnomad_af]), - hprc_af = select_first([reference.hprc_af]), - gff = select_first([reference.gff]), - lof_lookup = slivar_data.lof_lookup, - clinvar_lookup = slivar_data.clinvar_lookup, - phrank_lookup = write_ped_phrank.phrank_lookup, - runtime_attributes = default_runtime_attributes - } - - scatter (vcf_object in select_first([reference.population_vcfs])) { - File population_vcf = vcf_object.data - File population_vcf_index = vcf_object.data_index - } - - call svpack_filter_annotated { - input: - sv_vcf = sv_vcf.data, - pedigree = write_ped_phrank.pedigree, - population_vcfs = population_vcf, - population_vcf_indices = population_vcf_index, - gff = select_first([reference.gff]), - runtime_attributes = default_runtime_attributes - } - - call slivar_svpack_tsv { - input: - filtered_vcf = svpack_filter_annotated.svpack_vcf, - pedigree = write_ped_phrank.pedigree, - lof_lookup = slivar_data.lof_lookup, - clinvar_lookup = slivar_data.clinvar_lookup, - phrank_lookup = write_ped_phrank.phrank_lookup, - runtime_attributes = default_runtime_attributes - } - - output { - IndexData filtered_small_variant_vcf = {"data": 
slivar_small_variant.filtered_vcf, "data_index": slivar_small_variant.filtered_vcf_index} - IndexData compound_het_small_variant_vcf = {"data": slivar_small_variant.compound_het_vcf, "data_index": slivar_small_variant.compound_het_vcf_index} - File filtered_small_variant_tsv = slivar_small_variant.filtered_tsv - File compound_het_small_variant_tsv = slivar_small_variant.compound_het_tsv - IndexData filtered_svpack_vcf = {"data": svpack_filter_annotated.svpack_vcf, "data_index": svpack_filter_annotated.svpack_vcf_index} - File filtered_svpack_tsv = slivar_svpack_tsv.svpack_tsv - } - - parameter_meta { - cohort: {help: "Sample information for the cohort"} - small_variant_vcf: {help: "Small variant VCF to annotate using slivar"} - sv_vcf: {help: "Structural variant VCF to annotate using slivar"} - reference: {help: "Reference genome data"} - slivar_data: {help: "Data files used for annotation with slivar"} - default_runtime_attributes: {help: "Default RuntimeAttributes; spot if preemptible was set to true, otherwise on_demand"} - } -} - -task write_ped_phrank { - input { - String cohort_id - File cohort_json - - Array[String] phenotypes - - RuntimeAttributes runtime_attributes - } - - Int disk_size = 20 - - command <<< - set -euo pipefail - - cat << EOF > json2ped.py - #!/usr/bin/env python3 - """ - Convert Family JSON structure to tab-delimited PLINK pedigree (PED) format. - - Output PED columns: - 1. family_id - 2. sample_id - 3. father_id (. for unknown) - 4. mother_id (. for unknown) - 5. sex (1=male; 2=female; .=unknown) - 6. phenotype (1=unaffected; 2=affected) - """ - - __version__ = "0.1.0" - - import json - import csv - import sys - - - SEX = {"MALE": "1", "M": "1", "FEMALE": "2", "F": "2"} - STATUS = {False: "1", True: "2"} - - - def parse_sample(family_id, sample): - """For a sample struct, return a list of PED fields.""" - return [ - family_id, - sample["sample_id"], - sample.get("father_id", "."), - sample.get("mother_id", "."), - SEX.get(sample.get("sex", ".").upper(), "."), # all cases accepted - STATUS.get(sample.get("affected"), "0"), - ] - - - def parse_family(family): - """For a family struct, return a list of lists of PED fields for each sample.""" - family_id = family["cohort_id"] - samples = [] - for sample in family["samples"]: - samples.append(parse_sample(family_id, sample)) - return samples - - - def write_ped(samples): - """Write PED format to stdout.""" - tsv_writer = csv.writer(sys.stdout, delimiter="\\t") - for sample in samples: - tsv_writer.writerow(sample) - - - def main(): - with open(sys.argv[1], "r") as family: - samples = parse_family(json.load(family)) - write_ped(samples) - - - if __name__ == "__main__": - if sys.argv[1] in ["-v", "--version"]: - print(__version__) - sys.exit(0) - main() - EOF - - python3 ./json2ped.py ~{cohort_json} > ~{cohort_id}.ped - - cat ~{cohort_id}.ped - - # ENV HPO_TERMS_TSV "/opt/data/hpo/hpoTerms.txt" - # ENV HPO_DAG_TSV "/opt/data/hpo/hpoDag.txt" - # ENV ENSEMBL_TO_HPO_TSV "/opt/data/hpo/ensembl.hpoPhenotype.tsv" - # ENV ENSEMBL_TO_HGNC "/opt/data/genes/ensembl.hgncSymbol.tsv" - - calculate_phrank.py \ - "${HPO_TERMS_TSV}" \ - "${HPO_DAG_TSV}" \ - "${ENSEMBL_TO_HPO_TSV}" \ - "${ENSEMBL_TO_HGNC}" \ - ~{sep="," phenotypes} \ - ~{cohort_id}_phrank.tsv - >>> - - output { - File pedigree = "~{cohort_id}.ped" - File phrank_lookup = "~{cohort_id}_phrank.tsv" - } - - runtime { - docker: "~{runtime_attributes.container_registry}/wgs_tertiary@sha256:46f14de75798b54a38055a364a23ca1c9497bf810fee860431b78abc553434f2" - cpu: 2 - memory: "4 
GB" - disk: disk_size + " GB" - disks: "local-disk " + disk_size + " HDD" - preemptible: runtime_attributes.preemptible_tries - maxRetries: runtime_attributes.max_retries - awsBatchRetryAttempts: runtime_attributes.max_retries - queueArn: runtime_attributes.queue_arn - zones: runtime_attributes.zones - } -} - -task slivar_small_variant { - input { - File vcf - File vcf_index - - File pedigree - - File reference - File reference_index - - File slivar_js - File gnomad_af - File hprc_af - File gff - - File lof_lookup - File clinvar_lookup - File phrank_lookup - - RuntimeAttributes runtime_attributes - } - - Float max_gnomad_af = 0.03 - Float max_hprc_af = 0.03 - Int max_gnomad_nhomalt = 4 - Int max_hprc_nhomalt = 4 - Int max_gnomad_ac = 4 - Int max_hprc_ac = 4 - Int min_gq = 5 - - Array[String] info_expr = [ - 'variant.FILTER=="PASS"', - 'INFO.gnomad_af <= ~{max_gnomad_af}', - 'INFO.hprc_af <= ~{max_hprc_af}', - 'INFO.gnomad_nhomalt <= ~{max_gnomad_nhomalt}', - 'INFO.hprc_nhomalt <= ~{max_hprc_nhomalt}' - ] - Array[String] family_recessive_expr = [ - 'recessive:fam.every(segregating_recessive)' - ] - Array[String] family_x_recessive_expr = [ - 'x_recessive:(variant.CHROM == "chrX")', - 'fam.every(segregating_recessive_x)' - ] - Array[String] family_dominant_expr = [ - 'dominant:fam.every(segregating_dominant)', - 'INFO.gnomad_ac <= ~{max_gnomad_ac}', - 'INFO.hprc_ac <= ~{max_hprc_ac}' - ] - Array[String] sample_expr = [ - 'comphet_side:sample.het', - 'sample.GQ > ~{min_gq}' - ] - Array[String] skip_list = [ - 'non_coding_transcript', - 'intron', - 'non_coding', - 'upstream_gene', - 'downstream_gene', - 'non_coding_transcript_exon', - 'NMD_transcript', - '5_prime_UTR', - '3_prime_UTR' - ] - Array[String] info_fields = [ - 'gnomad_af', - 'hprc_af', - 'gnomad_nhomalt', - 'hprc_nhomalt', - 'gnomad_ac', - 'hprc_ac' - ] - - String vcf_basename = basename(vcf, ".vcf.gz") - Int threads = 8 - Int disk_size = ceil((size(vcf, "GB") + size(reference, "GB") + size(gnomad_af, "GB") + size(hprc_af, "GB") + size(gff, "GB") + size(lof_lookup, "GB") + size(clinvar_lookup, "GB") + size(phrank_lookup, "GB")) * 2 + 20) - - command <<< - set -euo pipefail - - bcftools --version - - bcftools norm \ - --threads ~{threads - 1} \ - --multiallelics \ - - \ - --output-type b \ - --fasta-ref ~{reference} \ - ~{vcf} \ - | bcftools sort \ - --output-type b \ - --output ~{vcf_basename}.norm.bcf - - bcftools index \ - --threads ~{threads - 1} \ - ~{vcf_basename}.norm.bcf - - # slivar has no version option - slivar expr 2>&1 | grep -Eo 'slivar version: [0-9.]+ [0-9a-f]+' - - pslivar \ - --processes ~{threads} \ - --fasta ~{reference} \ - --pass-only \ - --js ~{slivar_js} \ - --info '~{sep=" && " info_expr}' \ - --family-expr '~{sep=" && " family_recessive_expr}' \ - --family-expr '~{sep=" && " family_x_recessive_expr}' \ - --family-expr '~{sep=" && " family_dominant_expr}' \ - --sample-expr '~{sep=" && " sample_expr}' \ - --gnotate ~{gnomad_af} \ - --gnotate ~{hprc_af} \ - --vcf ~{vcf_basename}.norm.bcf \ - --ped ~{pedigree} \ - | bcftools csq \ - --local-csq \ - --samples - \ - --ncsq 40 \ - --gff-annot ~{gff} \ - --fasta-ref ~{reference} \ - - \ - --output-type z \ - --output ~{vcf_basename}.norm.slivar.vcf.gz - - bcftools index \ - --threads ~{threads - 1} \ - --tbi ~{vcf_basename}.norm.slivar.vcf.gz - - slivar \ - compound-hets \ - --skip ~{sep=',' skip_list} \ - --vcf ~{vcf_basename}.norm.slivar.vcf.gz \ - --sample-field comphet_side \ - --ped ~{pedigree} \ - --allow-non-trios \ - | add_comphet_phase.py \ - | bcftools 
view \ - --output-type z \ - --output ~{vcf_basename}.norm.slivar.compound_hets.vcf.gz - - bcftools index \ - --threads ~{threads - 1} \ - --tbi ~{vcf_basename}.norm.slivar.compound_hets.vcf.gz - - slivar tsv \ - --info-field ~{sep=' --info-field ' info_fields} \ - --sample-field dominant \ - --sample-field recessive \ - --sample-field x_recessive \ - --csq-field BCSQ \ - --gene-description ~{lof_lookup} \ - --gene-description ~{clinvar_lookup} \ - --gene-description ~{phrank_lookup} \ - --ped ~{pedigree} \ - --out /dev/stdout \ - ~{vcf_basename}.norm.slivar.vcf.gz \ - | sed '1 s/gene_description_1/lof/;s/gene_description_2/clinvar/;s/gene_description_3/phrank/;' \ - > ~{vcf_basename}.norm.slivar.tsv - - slivar tsv \ - --info-field ~{sep=' --info-field ' info_fields} \ - --sample-field slivar_comphet \ - --info-field slivar_comphet \ - --csq-field BCSQ \ - --gene-description ~{lof_lookup} \ - --gene-description ~{clinvar_lookup} \ - --gene-description ~{phrank_lookup} \ - --ped ~{pedigree} \ - --out /dev/stdout \ - ~{vcf_basename}.norm.slivar.compound_hets.vcf.gz \ - | sed '1 s/gene_description_1/lof/;s/gene_description_2/clinvar/;s/gene_description_3/phrank/;' \ - > ~{vcf_basename}.norm.slivar.compound_hets.tsv - >>> - - output { - File filtered_vcf = "~{vcf_basename}.norm.slivar.vcf.gz" - File filtered_vcf_index = "~{vcf_basename}.norm.slivar.vcf.gz.tbi" - File compound_het_vcf = "~{vcf_basename}.norm.slivar.compound_hets.vcf.gz" - File compound_het_vcf_index = "~{vcf_basename}.norm.slivar.compound_hets.vcf.gz.tbi" - File filtered_tsv = "~{vcf_basename}.norm.slivar.tsv" - File compound_het_tsv = "~{vcf_basename}.norm.slivar.compound_hets.tsv" - } - - runtime { - docker: "~{runtime_attributes.container_registry}/slivar@sha256:0a09289ccb760da310669906c675be02fd16b18bbedc971605a587275e34966c" - cpu: threads - memory: "16 GB" - disk: disk_size + " GB" - disks: "local-disk " + disk_size + " HDD" - preemptible: runtime_attributes.preemptible_tries - maxRetries: runtime_attributes.max_retries - awsBatchRetryAttempts: runtime_attributes.max_retries - queueArn: runtime_attributes.queue_arn - zones: runtime_attributes.zones - } -} - -task svpack_filter_annotated { - input { - File sv_vcf - File pedigree - - Array[File] population_vcfs - Array[File] population_vcf_indices - - File gff - - RuntimeAttributes runtime_attributes - } - - String sv_vcf_basename = basename(sv_vcf, ".vcf.gz") - Int disk_size = ceil(size(sv_vcf, "GB") * 2 + 20) - - command <<< - set -euo pipefail - - echo "svpack version:" - cat /opt/svpack/.git/HEAD - - affected=$(awk -F'\t' '$6 ~ /2/ {{ print $2 }}' ~{pedigree} | paste -sd',') - - svpack \ - filter \ - --pass-only \ - --min-svlen 50 \ - ~{sv_vcf} \ - ~{sep=' ' prefix('| svpack match -v - ', population_vcfs)} \ - | svpack \ - consequence \ - - \ - ~{gff} \ - | svpack \ - tagzygosity \ - --samples "${affected}" \ - - \ - > ~{sv_vcf_basename}.svpack.vcf - - bgzip --version - - bgzip ~{sv_vcf_basename}.svpack.vcf - - tabix --version - - tabix -p vcf ~{sv_vcf_basename}.svpack.vcf.gz - >>> - - output { - File svpack_vcf = "~{sv_vcf_basename}.svpack.vcf.gz" - File svpack_vcf_index = "~{sv_vcf_basename}.svpack.vcf.gz.tbi" - } - - runtime { - docker: "~{runtime_attributes.container_registry}/svpack@sha256:5966de1434bc5fc04cc97d666126be46ebacb4a27191770bf9debfc9a6ab08bb" - cpu: 2 - memory: "16 GB" - disk: disk_size + " GB" - disks: "local-disk " + disk_size + " HDD" - preemptible: runtime_attributes.preemptible_tries - maxRetries: runtime_attributes.max_retries - 
awsBatchRetryAttempts: runtime_attributes.max_retries - queueArn: runtime_attributes.queue_arn - zones: runtime_attributes.zones - } -} - -task slivar_svpack_tsv { - input { - File filtered_vcf - - File pedigree - File lof_lookup - File clinvar_lookup - File phrank_lookup - - RuntimeAttributes runtime_attributes - } - - Array[String] info_fields = [ - 'SVTYPE', - 'SVLEN', - 'SVANN', - 'CIPOS', - 'MATEID', - 'END' - ] - - String filtered_vcf_basename = basename(filtered_vcf, ".vcf.gz") - Int disk_size = ceil((size(filtered_vcf, "GB") + size(lof_lookup, "GB") + size(clinvar_lookup, "GB") + size(phrank_lookup, "GB")) * 2 + 20) - - command <<< - set -euo pipefail - - # slivar has no version option - slivar expr 2>&1 | grep -Eo 'slivar version: [0-9.]+ [0-9a-f]+' - - slivar tsv \ - --info-field ~{sep=' --info-field ' info_fields} \ - --sample-field hetalt \ - --sample-field homalt \ - --csq-field BCSQ \ - --gene-description ~{lof_lookup} \ - --gene-description ~{clinvar_lookup} \ - --gene-description ~{phrank_lookup} \ - --ped ~{pedigree} \ - --out /dev/stdout \ - ~{filtered_vcf} \ - | sed '1 s/gene_description_1/lof/;s/gene_description_2/clinvar/;s/gene_description_3/phrank/;' \ - > ~{filtered_vcf_basename}.tsv - >>> - - output { - File svpack_tsv = "~{filtered_vcf_basename}.tsv" - } - - runtime { - docker: "~{runtime_attributes.container_registry}/slivar@sha256:0a09289ccb760da310669906c675be02fd16b18bbedc971605a587275e34966c" - cpu: 2 - memory: "4 GB" - disk: disk_size + " GB" - disks: "local-disk " + disk_size + " HDD" - preemptible: runtime_attributes.preemptible_tries - maxRetries: runtime_attributes.max_retries - awsBatchRetryAttempts: runtime_attributes.max_retries - queueArn: runtime_attributes.queue_arn - zones: runtime_attributes.zones - } -} diff --git a/workflows/upstream/inputs.json b/workflows/upstream/inputs.json new file mode 100644 index 00000000..ac0324b3 --- /dev/null +++ b/workflows/upstream/inputs.json @@ -0,0 +1,18 @@ +{ + "upstream.sample_id": "String", + "upstream.sex": "String? (optional)", + "upstream.hifi_reads": "Array[File]", + "upstream.ref_map_file": "File", + "upstream.deepvariant_version": "String", + "upstream.custom_deepvariant_model_tar": "File? 
(optional)", + "upstream.single_sample": "Boolean (optional, default = false)", + "upstream.gpu": "Boolean", + "upstream.default_runtime_attributes": { + "max_retries": "Int", + "container_registry": "String", + "gpuType": "String", + "backend": "String", + "preemptible_tries": "Int", + "zones": "String" + } +} \ No newline at end of file diff --git a/workflows/upstream/upstream.wdl b/workflows/upstream/upstream.wdl new file mode 100644 index 00000000..93635376 --- /dev/null +++ b/workflows/upstream/upstream.wdl @@ -0,0 +1,281 @@ +version 1.0 + +import "../wdl-common/wdl/structs.wdl" +import "../wdl-common/wdl/tasks/pbmm2.wdl" as Pbmm2 +import "../wdl-common/wdl/tasks/merge_bam_stats.wdl" as MergeBamStats +import "../wdl-common/wdl/tasks/pbsv.wdl" as Pbsv +import "../wdl-common/wdl/tasks/bcftools.wdl" as Bcftools +import "../wdl-common/wdl/workflows/deepvariant/deepvariant.wdl" as DeepVariant +import "../wdl-common/wdl/tasks/samtools.wdl" as Samtools +import "../wdl-common/wdl/tasks/mosdepth.wdl" as Mosdepth +import "../wdl-common/wdl/tasks/trgt.wdl" as Trgt +import "../wdl-common/wdl/tasks/paraphase.wdl" as Paraphase +import "../wdl-common/wdl/tasks/hificnv.wdl" as Hificnv +import "../wdl-common/wdl/workflows/get_pbsv_splits/get_pbsv_splits.wdl" as Pbsv_splits + +workflow upstream { + meta { + description: "Given a set of HiFi reads for a human sample, run steps upstream of phasing." + } + + parameter_meta { + sample_id: { + name: "Sample ID" + } + sex: { + name: "Sample sex", + choices: ["MALE", "FEMALE"] + } + hifi_reads: { + name: "HiFi reads (BAMs)" + } + ref_map_file: { + name: "TSV containing reference genome information" + } + deepvariant_version: { + name: "DeepVariant version" + } + custom_deepvariant_model_tar: { + name: "Custom DeepVariant model tarball" + } + single_sample: { + name: "Single sample workflow" + } + gpu: { + name: "Use GPU for DeepVariant" + } + default_runtime_attributes: { + name: "Runtime attribute structure" + } + } + + input { + String sample_id + String? sex + Array[File] hifi_reads + + File ref_map_file + + String deepvariant_version + File? 
custom_deepvariant_model_tar + + Boolean single_sample = false + + Boolean gpu + + RuntimeAttributes default_runtime_attributes + } + + Map[String, String] ref_map = read_map(ref_map_file) + + scatter (hifi_read_bam in hifi_reads) { + call Pbmm2.pbmm2_align_wgs as pbmm2_align { + input: + sample_id = sample_id, + bam = hifi_read_bam, + ref_fasta = ref_map["fasta"], # !FileCoercion + ref_index = ref_map["fasta_index"], # !FileCoercion + ref_name = ref_map["name"], + runtime_attributes = default_runtime_attributes + } + call Pbsv.pbsv_discover { + input: + aligned_bam = pbmm2_align.aligned_bam, + aligned_bam_index = pbmm2_align.aligned_bam_index, + trf_bed = ref_map["pbsv_tandem_repeat_bed"], # !FileCoercion + runtime_attributes = default_runtime_attributes + } + } + + call MergeBamStats.merge_bam_stats { + input: + sample_id = sample_id, + bam_stats = pbmm2_align.bam_stats, + runtime_attributes = default_runtime_attributes + } + + # merge aligned bams if there are multiple + if (length(pbmm2_align.aligned_bam) > 1) { + call Samtools.samtools_merge { + input: + bams = pbmm2_align.aligned_bam, + out_prefix = "~{sample_id}.~{ref_map['name']}", + runtime_attributes = default_runtime_attributes + } + } + + # select the merged bam if it exists, otherwise select the first (only) aligned bam + File aligned_bam_data = select_first([samtools_merge.merged_bam, pbmm2_align.aligned_bam[0]]) + File aligned_bam_index = select_first([samtools_merge.merged_bam_index, pbmm2_align.aligned_bam_index[0]]) + + call Mosdepth.mosdepth { + input: + sample_id = sample_id, + ref_name = ref_map["name"], + aligned_bam = aligned_bam_data, + aligned_bam_index = aligned_bam_index, + infer_sex = true, + runtime_attributes = default_runtime_attributes + } + + call DeepVariant.deepvariant { + input: + sample_id = sample_id, + aligned_bams = [aligned_bam_data], + aligned_bam_indices = [aligned_bam_index], + ref_fasta = ref_map["fasta"], # !FileCoercion + ref_index = ref_map["fasta_index"], # !FileCoercion + ref_name = ref_map["name"], + deepvariant_version = deepvariant_version, + custom_deepvariant_model_tar = custom_deepvariant_model_tar, + gpu = gpu, + default_runtime_attributes = default_runtime_attributes + } + + call Trgt.trgt { + input: + sample_id = sample_id, + sex = select_first([sex, mosdepth.inferred_sex]), + aligned_bam = aligned_bam_data, + aligned_bam_index = aligned_bam_index, + ref_fasta = ref_map["fasta"], # !FileCoercion + ref_index = ref_map["fasta_index"], # !FileCoercion + trgt_bed = ref_map["trgt_tandem_repeat_bed"], # !FileCoercion + out_prefix = "~{sample_id}.~{ref_map['name']}", + runtime_attributes = default_runtime_attributes + } + + call Trgt.coverage_dropouts { + input: + aligned_bam = aligned_bam_data, + aligned_bam_index = aligned_bam_index, + trgt_bed = ref_map["trgt_tandem_repeat_bed"], # !FileCoercion + out_prefix = "~{sample_id}.~{ref_map['name']}", + runtime_attributes = default_runtime_attributes + } + + call Paraphase.paraphase { + input: + aligned_bam = aligned_bam_data, + aligned_bam_index = aligned_bam_index, + ref_fasta = ref_map["fasta"], # !FileCoercion + ref_index = ref_map["fasta_index"], # !FileCoercion + sample_id = sample_id, + runtime_attributes = default_runtime_attributes + } + + call Hificnv.hificnv { + input: + sample_id = sample_id, + sex = select_first([sex, mosdepth.inferred_sex]), + aligned_bam = aligned_bam_data, + aligned_bam_index = aligned_bam_index, + vcf = deepvariant.vcf, + vcf_index = deepvariant.vcf_index, + ref_fasta = ref_map["fasta"], # !FileCoercion + 
ref_index = ref_map["fasta_index"], # !FileCoercion + ref_name = ref_map["name"], + exclude_bed = ref_map["hificnv_exclude_bed"], # !FileCoercion + exclude_bed_index = ref_map["hificnv_exclude_bed_index"], # !FileCoercion + expected_male_bed = ref_map["hificnv_expected_bed_male"], # !FileCoercion + expected_female_bed = ref_map["hificnv_expected_bed_female"], # !FileCoercion + runtime_attributes = default_runtime_attributes + } + + if (single_sample) { + call Pbsv_splits.get_pbsv_splits { + input: + pbsv_splits_file = ref_map["pbsv_splits"], # !FileCoercion + default_runtime_attributes = default_runtime_attributes + } + + scatter (shard_index in range(length(get_pbsv_splits.pbsv_splits))) { + Array[String] region_set = get_pbsv_splits.pbsv_splits[shard_index] + + call Pbsv.pbsv_call { + input: + sample_id = sample_id, + svsigs = pbsv_discover.svsig, + ref_fasta = ref_map["fasta"], # !FileCoercion + ref_index = ref_map["fasta_index"], # !FileCoercion + ref_name = ref_map["name"], + shard_index = shard_index, + regions = region_set, + runtime_attributes = default_runtime_attributes + } + } + + # concatenate pbsv vcfs + call Bcftools.concat_pbsv_vcf { + input: + vcfs = pbsv_call.vcf, + vcf_indices = pbsv_call.vcf_index, + out_prefix = "~{sample_id}.~{ref_map['name']}.structural_variants", + runtime_attributes = default_runtime_attributes + } + } + + output { + # bam stats + File read_length_and_quality = merge_bam_stats.read_length_and_quality + File read_length_plot = merge_bam_stats.read_length_plot + File read_quality_plot = merge_bam_stats.read_quality_plot + String stat_num_reads = merge_bam_stats.stat_num_reads + String stat_read_length_mean = merge_bam_stats.stat_read_length_mean + String stat_read_length_median = merge_bam_stats.stat_read_length_median + String stat_read_quality_mean = merge_bam_stats.stat_read_quality_mean + String stat_read_quality_median = merge_bam_stats.stat_read_quality_median + + # alignments + File out_bam = aligned_bam_data + File out_bam_index = aligned_bam_index + + # mosdepth outputs + File mosdepth_summary = mosdepth.summary + File mosdepth_region_bed = mosdepth.region_bed + File mosdepth_region_bed_index = mosdepth.region_bed_index + File mosdepth_depth_distribution_plot = mosdepth.depth_distribution_plot + String inferred_sex = mosdepth.inferred_sex + String stat_mean_depth = mosdepth.stat_mean_depth + + # per movie sv signatures + # if we've already called variants, no need to keep these + Array[File] svsigs = if single_sample then [] else pbsv_discover.svsig + + # pbsv outputs for single sample + File? sv_vcf = concat_pbsv_vcf.concatenated_vcf + File? sv_vcf_index = concat_pbsv_vcf.concatenated_vcf_index + + # small variant outputs + File small_variant_vcf = deepvariant.vcf + File small_variant_vcf_index = deepvariant.vcf_index + File small_variant_gvcf = deepvariant.gvcf + File small_variant_gvcf_index = deepvariant.gvcf_index + + # trgt outputs + File trgt_vcf = trgt.vcf + File trgt_vcf_index = trgt.vcf_index + File trgt_spanning_reads = trgt.bam + File trgt_spanning_reads_index = trgt.bam_index + File trgt_coverage_dropouts = coverage_dropouts.dropouts + String stat_trgt_genotyped_count = trgt.stat_genotyped_count + String stat_trgt_uncalled_count = trgt.stat_uncalled_count + + # paraphase outputs + File paraphase_output_json = paraphase.out_json + File paraphase_realigned_bam = paraphase.bam + File paraphase_realigned_bam_index = paraphase.bam_index + File? 
paraphase_vcfs = paraphase.vcfs_tar + + # per sample hificnv outputs + File cnv_vcf = hificnv.cnv_vcf + File cnv_vcf_index = hificnv.cnv_vcf_index + File cnv_copynum_bedgraph = hificnv.copynum_bedgraph + File cnv_depth_bw = hificnv.depth_bw + File cnv_maf_bw = hificnv.maf_bw + String stat_cnv_DUP_count = hificnv.stat_DUP_count + String stat_cnv_DEL_count = hificnv.stat_DEL_count + String stat_cnv_DUP_sum = hificnv.stat_DUP_sum + String stat_cnv_DEL_sum = hificnv.stat_DEL_sum + } +} diff --git a/workflows/wdl-common b/workflows/wdl-common index fef058b8..f362948d 160000 --- a/workflows/wdl-common +++ b/workflows/wdl-common @@ -1 +1 @@ -Subproject commit fef058b879d04c15c3da2626b320afdd8ace6c2e +Subproject commit f362948dd0b71c11bc23b5dbf15eed102ceb8441
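Note on the map-file pattern used throughout the new workflows above: tertiary.wdl and upstream.wdl both load their reference and tertiary file tables with WDL's read_map(), which parses a two-column, tab-delimited file (one key/value pair per line) into a Map[String, String]; values stay plain strings and are coerced to File at the call sites marked "# !FileCoercion". Below is a minimal, self-contained sketch of that pattern — a hypothetical demo workflow for illustration only, not part of the pipeline.

version 1.0

# Hypothetical demo only: show how a two-column ref_map TSV is consumed
# via the WDL 1.0 standard library function read_map().
workflow read_map_demo {
  input {
    File ref_map_file
  }

  # read_map() expects exactly two tab-separated columns per line and
  # returns a Map[String, String] keyed on the first column.
  Map[String, String] ref_map = read_map(ref_map_file)

  output {
    # Lookups are by key name; the values here are plain path strings.
    String fasta_path       = ref_map["fasta"]
    String fasta_index_path = ref_map["fasta_index"]
  }
}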