"""DAG that generates a variant index dataset based on several sources."""
from __future__ import annotations
import os
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Any
import pandas as pd
from common_airflow import (
create_batch_job,
create_cluster,
create_task_spec,
delete_cluster,
install_dependencies,
read_yaml_config,
shared_dag_args,
shared_dag_kwargs,
submit_step,
)
from google.cloud import batch_v1
from airflow.decorators import task
from airflow.models.dag import DAG
from airflow.providers.google.cloud.operators.cloud_batch import (
CloudBatchSubmitJobOperator,
)
from airflow.providers.google.cloud.operators.gcs import GCSListObjectsOperator
from airflow.utils.trigger_rule import TriggerRule
PROJECT_ID = "open-targets-genetics-dev"
REGION = "europe-west1"
GCS_BUCKET = "genetics_etl_python_playground"
CONFIG_FILE_PATH = Path(__file__).parent / "configs" / "variant_sources.yaml"
GENTROPY_DOCKER_IMAGE = "europe-west1-docker.pkg.dev/open-targets-genetics-dev/gentropy-app/gentropy:il-variant-idx" # TODO: change to dev
VEP_DOCKER_IMAGE = "europe-west1-docker.pkg.dev/open-targets-genetics-dev/gentropy-app/custom_ensembl_vep:dev"
VEP_CACHE_BUCKET = f"gs://{GCS_BUCKET}/vep/cache"
RELEASE = "XX.XX" # This needs to be updated to the latest release
VCF_DST_PATH = f"gs://{GCS_BUCKET}/{RELEASE}/variant_vcf"
VCF_MERGED_DST_PATH = f"{VCF_DST_PATH}/merged"
VEP_OUTPUT_BUCKET = f"gs://{GCS_BUCKET}/{RELEASE}/vep_output"
VARIANT_INDEX_BUCKET = f"gs://{GCS_BUCKET}/{RELEASE}/variant_index"
GNOMAD_ANNOTATION_PATH = f"gs://{GCS_BUCKET}/static_assets/gnomad_variants"
# Internal parameters for the docker image:
MOUNT_DIR = "/mnt/disks/share"
CLUSTER_NAME = "otg-variant-index"
AUTOSCALING = "eqtl-preprocess"
@task(task_id="vcf_creation")
def create_vcf(**kwargs: Any) -> None:
"""Task that sends the ConvertToVcfStep job to Google Batch.
Args:
**kwargs (Any): Keyword arguments
"""
sources = read_yaml_config(CONFIG_FILE_PATH)
task_env = [
batch_v1.Environment(
variables={
"SOURCE_NAME": source["name"],
"SOURCE_PATH": source["location"],
"SOURCE_FORMAT": source["format"],
}
)
for source in sources["sources_inclusion_list"]
]
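    # NOTE: one Environment entry is built per source; create_batch_job (a local helper)
    # is assumed to pass these to Google Batch as task environments, so each parallel
    # task runs the command below with its own $SOURCE_NAME / $SOURCE_PATH /
    # $SOURCE_FORMAT values.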
    commands = [
        "-c",
        rf"poetry run gentropy step=variant_to_vcf step.source_path=$SOURCE_PATH step.source_format=$SOURCE_FORMAT step.vcf_path={VCF_DST_PATH}/$SOURCE_NAME +step.session.extended_spark_conf={{spark.jars:https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop3-latest.jar}}",
    ]
    task = create_task_spec(
        GENTROPY_DOCKER_IMAGE, commands, options="-e HYDRA_FULL_ERROR=1"
    )
    batch_task = CloudBatchSubmitJobOperator(
        task_id="vep_batch_job",
        project_id=PROJECT_ID,
        region=REGION,
        job_name=f"vcf-job-{time.strftime('%Y%m%d-%H%M%S')}",
        job=create_batch_job(
            task,
            "VEPMACHINE",
            task_env,
        ),
        deferrable=False,
    )
    batch_task.execute(context=kwargs)
@task(task_id="merge_vcfs")
def merge_vcfs(chunk_size: int = 2000, **kwargs: Any) -> None:
"""Task that merges the information from all the VCF files into a single one so that we only submit one VEP job.
Args:
chunk_size (int): Partition size of the merged file. Defaults to 2000.
**kwargs (Any): Keyword arguments
"""
ti = kwargs["ti"]
input_vcfs = [
f"gs://{GCS_BUCKET}/{listed_file}"
for listed_file in ti.xcom_pull(
task_ids="get_vcf_per_source", key="return_value"
)
]
merged_df = (
pd.concat(
pd.read_csv(
file,
sep="\t",
dtype={
"#CHROM": str,
"POS": int,
"ID": str,
"REF": str,
"ALT": str,
"QUAL": str,
"FILTER": str,
"INFO": str,
},
)
for file in input_vcfs
)
.drop_duplicates(subset=["#CHROM", "POS", "REF", "ALT"])
.sort_values(by=["#CHROM", "POS"])
.reset_index(drop=True)
)
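    # Illustrative example of the partitioning below: with 4,500 merged variants and the
    # default chunk_size of 2000, three files are written (the last one holding only the
    # remaining 500 rows): chunk_1-2000.vcf, chunk_2001-4000.vcf, chunk_4001-6000.vcf.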
    # Partition the merged file into chunks of `chunk_size` variants so the VEP jobs can run in parallel.
    chunks = 0
    for i in range(0, len(merged_df), chunk_size):
        merged_df[i : i + chunk_size].to_csv(
            f"{VCF_MERGED_DST_PATH}/chunk_{i + 1}-{i + chunk_size}.vcf",
            index=False,
            header=True,
            sep="\t",
        )
        chunks += 1
    # Ceiling division, so the expected count is also right when the row count is an exact multiple of chunk_size.
    expected_chunks_count = -(-len(merged_df) // chunk_size)
    assert (
        chunks == expected_chunks_count
    ), f"Expected {expected_chunks_count} chunks but got {chunks} chunks"


@dataclass
class PathManager:
    """Helper that keeps track of the input/output buckets, the corresponding mount point prefixes and other derived paths."""

    VCF_INPUT_BUCKET: str
    VEP_OUTPUT_BUCKET: str
    VEP_CACHE_BUCKET: str
    MOUNT_DIR_ROOT: str

    # Derived parameters to find the list of files to process:
    input_path: str | None = None
    input_bucket: str | None = None

    # Derived parameters to initialise the docker image:
    path_dictionary: dict[str, dict[str, str]] | None = None

    # Derived parameters to point to the right mounting points:
    cache_dir: str | None = None
    input_dir: str | None = None
    output_dir: str | None = None

    def __post_init__(self: PathManager) -> None:
        """Build paths based on the input parameters."""
        self.path_dictionary = {
            "input": {
                "remote_path": self.VCF_INPUT_BUCKET.replace("gs://", ""),
                "mount_point": f"{self.MOUNT_DIR_ROOT}/input",
            },
            "output": {
                "remote_path": self.VEP_OUTPUT_BUCKET.replace("gs://", ""),
                "mount_point": f"{self.MOUNT_DIR_ROOT}/output",
            },
            "cache": {
                "remote_path": self.VEP_CACHE_BUCKET.replace("gs://", ""),
                "mount_point": f"{self.MOUNT_DIR_ROOT}/cache",
            },
        }
        # Parameters for fetching files:
        self.input_path = self.VCF_INPUT_BUCKET.replace("gs://", "") + "/"
        self.input_bucket = self.VCF_INPUT_BUCKET.split("/")[2]
        # Parameters for VEP:
        self.cache_dir = f"{self.MOUNT_DIR_ROOT}/cache"
        self.input_dir = f"{self.MOUNT_DIR_ROOT}/input"
        self.output_dir = f"{self.MOUNT_DIR_ROOT}/output"

    def get_mount_config(self) -> list[dict[str, str]]:
        """Return the mount configuration.

        Returns:
            list[dict[str, str]]: The mount configuration.
        """
        assert self.path_dictionary is not None, "Path dictionary not initialized."
        return list(self.path_dictionary.values())
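
# Illustrative example: with the buckets configured in this DAG (RELEASE = "XX.XX"),
# PathManager(VCF_MERGED_DST_PATH, VEP_OUTPUT_BUCKET, VEP_CACHE_BUCKET, MOUNT_DIR)
# derives mount entries such as
#   {"remote_path": "genetics_etl_python_playground/XX.XX/variant_vcf/merged",
#    "mount_point": "/mnt/disks/share/input"},
# which create_batch_job is assumed to turn into GCS volume mounts inside the VEP container.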
@task(task_id="vep_annotation")
def vep_annotation(pm: PathManager, **kwargs: Any) -> None:
"""Submit a Batch job to annotate VCFs with a local VEP docker image.
Args:
pm (PathManager): The path manager with all the required path related information.
**kwargs (Any): Keyword arguments.
"""
# Get the filenames to process:
ti = kwargs["ti"]
filenames = [
os.path.basename(os.path.splitext(path)[0])
for path in ti.xcom_pull(task_ids="get_vep_todo_list", key="return_value")
]
# Stop process if no files was found:
assert filenames, "No files found to process."
# Based on the filenames, build the environment variables for the batch job:
task_env = [
batch_v1.Environment(
variables={
"INPUT_FILE": f"{filename}.vcf",
"OUTPUT_FILE": f"{filename}.json",
}
)
for filename in filenames
]
# Build the command to run in the container:
command = [
"-c",
rf"vep --cache --offline --format vcf --force_overwrite \
--no_stats \
--dir_cache {pm.cache_dir} \
--input_file {pm.input_dir}/$INPUT_FILE \
--output_file {pm.output_dir}/$OUTPUT_FILE --json \
--dir_plugins {pm.cache_dir}/VEP_plugins \
--sift b \
--polyphen b \
--fasta {pm.cache_dir}/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz \
--mane_select \
--appris \
--hgvsg \
--pick_order mane_select,canonical \
--per_gene \
--uniprot \
--check_existing \
--exclude_null_alleles \
--canonical \
--plugin TSSDistance \
--distance 500000 \
--plugin LoF,loftee_path:{pm.cache_dir}/VEP_plugins,gerp_bigwig:{pm.cache_dir}/gerp_conservation_scores.homo_sapiens.GRCh38.bw,human_ancestor_fa:{pm.cache_dir}/human_ancestor.fa.gz,conservation_file:/opt/vep/loftee.sql \
--plugin AlphaMissense,file={pm.cache_dir}/AlphaMissense_hg38.tsv.gz,transcript_match=1 \
--plugin CADD,snv={pm.cache_dir}/CADD_GRCh38_whole_genome_SNVs.tsv.gz",
]
task = create_task_spec(VEP_DOCKER_IMAGE, command)
batch_task = CloudBatchSubmitJobOperator(
task_id="vep_batch_job",
project_id=PROJECT_ID,
region=REGION,
job_name=f"vep-job-{time.strftime('%Y%m%d-%H%M%S')}",
job=create_batch_job(task, "VEPMACHINE", task_env, pm.get_mount_config()),
deferrable=False,
)
batch_task.execute(context=kwargs)
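
# The annotated JSON chunks written by vep_annotation land in VEP_OUTPUT_BUCKET through
# the "output" mount and are consumed by the ot_variant_index Dataproc step in the DAG below.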


with DAG(
    dag_id=Path(__file__).stem,
    description="Open Targets Genetics — generate a variant index from datasets that contain variant information",
    default_args=shared_dag_args,
    **shared_dag_kwargs,
) as dag:
    pm = PathManager(
        VCF_MERGED_DST_PATH,
        VEP_OUTPUT_BUCKET,
        VEP_CACHE_BUCKET,
        MOUNT_DIR,
    )
    (
        create_vcf()
        >> GCSListObjectsOperator(
            task_id="get_vcf_per_source",
            bucket=GCS_BUCKET,
            prefix=VCF_DST_PATH.replace(f"gs://{GCS_BUCKET}/", ""),
            trigger_rule=TriggerRule.ALL_SUCCESS,
            match_glob="**.csv",
        )
        >> merge_vcfs()
        >> GCSListObjectsOperator(
            task_id="get_vep_todo_list",
            bucket=GCS_BUCKET,
            prefix=VCF_MERGED_DST_PATH.replace(f"gs://{GCS_BUCKET}/", ""),
            trigger_rule=TriggerRule.ALL_SUCCESS,
            match_glob="**.vcf",
        )
        >> vep_annotation(pm)
        >> create_cluster(
            CLUSTER_NAME,
            autoscaling_policy=AUTOSCALING,
            num_workers=4,
            worker_machine_type="n1-highmem-8",
        )
        >> install_dependencies(CLUSTER_NAME)
        >> submit_step(
            cluster_name=CLUSTER_NAME,
            step_id="ot_variant_index",
            task_id="ot_variant_index",
            other_args=[
                f"step.vep_output_json_path={VEP_OUTPUT_BUCKET}",
                f"step.variant_index_path={VARIANT_INDEX_BUCKET}",
                f"step.gnomad_variant_annotations_path={GNOMAD_ANNOTATION_PATH}",
            ],
        )
        >> delete_cluster(CLUSTER_NAME)
    )