Validate schemas overlooking the nullability fields #71

Merged · 32 commits · Apr 13, 2023

Commits (the diff below shows changes from 13 of the 32 commits)
6cc9af1  fix: update configure and gitignore (ireneisdoomed, Apr 6, 2023)
c57919d  fix: move rsId and concordance check outside the filter function (ireneisdoomed, Apr 6, 2023)
829ef6b  fix: join `_variant_coordinates_in_ldindex` on `variantId` (ireneisdoomed, Apr 6, 2023)
512c842  style: rename ld indices location to directory and no extension (ireneisdoomed, Apr 6, 2023)
b1d56c4  fix: correct attribute names for ld indices (ireneisdoomed, Apr 11, 2023)
5a8e03a  fix: order ld index by idx and unpersist data (ireneisdoomed, Apr 11, 2023)
838d4f1  feat: redefine `validate_schema` to avoid nullability issues (ireneisdoomed, Apr 11, 2023)
0d5c6e9  build: update mypy to 1.2.0 (ireneisdoomed, Apr 11, 2023)
b997962  test: add `TestValidateSchema` suite (ireneisdoomed, Apr 11, 2023)
f1a3fc1  feat: redefine validate_schema to avoid nullability issues (ireneisdoomed, Apr 11, 2023)
1f8d9a1  feat: add type checking to validate_schema (ireneisdoomed, Apr 12, 2023)
f76e62f  test: added `test_validate_schema_different_datatype` (ireneisdoomed, Apr 12, 2023)
5414538  feat: added flatten_schema function and test (ireneisdoomed, Apr 12, 2023)
51ad0aa  feat: add support and tests for nested data (ireneisdoomed, Apr 12, 2023)
ad95698  feat: merge with remote branch (ireneisdoomed, Apr 12, 2023)
480539b  feat: add support and tests for nested data (ireneisdoomed, Apr 12, 2023)
27eeb03  Merge branch 'main' into il-schemas (d0choa, Apr 12, 2023)
079ee76  Merge branch 'il-schemas' of https://github.com/opentargets/genetics_… (d0choa, Apr 12, 2023)
4051d0d  Revert "fix: update configure and gitignore" (ireneisdoomed, Apr 12, 2023)
5dda33f  Revert "fix: move rsId and concordance check outside the filter funct…" (ireneisdoomed, Apr 12, 2023)
5f9c7c1  Revert "fix: join `_variant_coordinates_in_ldindex` on `variantId`" (ireneisdoomed, Apr 12, 2023)
eedf763  Revert "style: rename ld indices location to directory and no extension" (ireneisdoomed, Apr 12, 2023)
1f6e389  Revert "fix: correct attribute names for ld indices" (ireneisdoomed, Apr 12, 2023)
aba6bd8  Revert "fix: order ld index by idx and unpersist data" (ireneisdoomed, Apr 12, 2023)
34f1024  Revert "build: update mypy to 1.2.0" (ireneisdoomed, Apr 12, 2023)
57310d7  Merge branch 'il-schemas' of https://github.com/opentargets/genetics_… (ireneisdoomed, Apr 12, 2023)
dec2cf4  fix: `_annotate_sumstats_info` drop duplicated columns before join an… (ireneisdoomed, Apr 13, 2023)
dce0e76  fix: `_annotate_ancestries` drop default fields before join (ireneisdoomed, Apr 13, 2023)
fe93cf5  fix: `_annotate_discovery_sample_sizes` drop default fields before join (ireneisdoomed, Apr 13, 2023)
26d9aea  fix: handle duplicated chrom in v2g generation (ireneisdoomed, Apr 13, 2023)
d0b3489  feat: add check for duplicated field to `validate_schema` (ireneisdoomed, Apr 13, 2023)
b22feb3  refactor: drop redundants in tests (ireneisdoomed, Apr 13, 2023)
3 changes: 3 additions & 0 deletions .gitignore
@@ -7,3 +7,6 @@ outputs/
 coverage.xml
 docs/assets/schemas/
 .cache/
+mock_data/
+notebooks/wandb
+src/wandb
2 changes: 1 addition & 1 deletion config/datasets/gcp.yaml
@@ -34,4 +34,4 @@ ld_index: ${datasets.outputs}/ld_index
 catalog_study_index: ${datasets.outputs}/catalog_study_index
 catalog_study_locus: ${datasets.outputs}/catalog_study_locus
 #templates
-ld_index_template: ${datasets.outputs}/gnomad_r2.1.1.{POP}.common.ld.variant_indices.parquet
+ld_index_template: ${datasets.outputs}/ld_indices/gnomad_r2.1.1.{POP}.common.ld.variant_indices
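The template now resolves to a per-population path under an ld_indices/ directory rather than a single .parquet file. As a quick illustration, a `{POP}` placeholder like this can be resolved with plain string formatting; the output root and population codes below are illustrative assumptions, not values taken from this PR:

    # Hypothetical resolution of the {POP} template above; "gs://outputs" stands
    # in for ${datasets.outputs} and the population codes are assumed examples.
    ld_index_template = "gs://outputs/ld_indices/gnomad_r2.1.1.{POP}.common.ld.variant_indices"
    for population in ["nfe", "afr", "eas"]:
        print(ld_index_template.format(POP=population))
    # gs://outputs/ld_indices/gnomad_r2.1.1.nfe.common.ld.variant_indices
    # gs://outputs/ld_indices/gnomad_r2.1.1.afr.common.ld.variant_indices
    # gs://outputs/ld_indices/gnomad_r2.1.1.eas.common.ld.variant_indices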
67 changes: 35 additions & 32 deletions poetry.lock (generated file; diff not rendered by default)
1 change: 1 addition & 0 deletions pyproject.toml
@@ -72,6 +72,7 @@ flake8-class-attributes-order = "^0.1.3"
 ipykernel = "^6.19.0"
 flake8-pytest-style = "^1.7.2"
 google-cloud-dataproc = "^5.4.1"
+mypy = "1.2.0"
 
 [build-system]
 requires = ["poetry-core>=1.0.0"]
36 changes: 35 additions & 1 deletion src/otg/common/schemas.py
@@ -4,7 +4,7 @@
 import importlib.resources as pkg_resources
 import json
 
-from pyspark.sql.types import StructType
+from pyspark.sql.types import ArrayType, StructType
 
 from otg.assets import schemas
 
@@ -22,3 +22,37 @@ def parse_spark_schema(schema_json: str) -> StructType:
         pkg_resources.read_text(schemas, schema_json, encoding="utf-8")
     )
     return StructType.fromJson(core_schema)
+
+
+def flatten_schema(schema: StructType, prefix: str = "") -> list:
+    """It takes a Spark schema and returns a list of all fields in the schema once flattened.
+
+    Args:
+        schema: The schema of the dataframe
+        prefix: The prefix to prepend to the field names.
+
+    Returns:
+        list: A list of all the columns in the dataframe.
+
+    Examples:
+        >>> from pyspark.sql.types import ArrayType, StringType, StructField, StructType
+        >>> schema = StructType(
+        ...     [
+        ...         StructField("studyLocusId", StringType(), False),
+        ...         StructField("credibleSet", ArrayType(StructType([StructField("tagVariantId", StringType(), False)])), False)
+        ...     ]
+        ... )
+        >>> df = spark.createDataFrame([("A", [{"tagVariantId": "varA"}]), ("B", [{"tagVariantId": "varB"}])], schema)
+        >>> flatten_schema(df.schema)
+        [('studyLocusId', StringType), ('credibleSet', ArrayType(StructType(List(StructField(tagVariantId,StringType,false))),true)), ('credibleSet.tagVariantId', StringType)]
+    """
+    fields = []
+    for field in schema.fields:
+        name = f"{prefix}.{field.name}" if prefix else field.name
+        dtype = field.dataType
+        fields.append((name, dtype))
+        if isinstance(dtype, StructType):
+            fields += flatten_schema(dtype, prefix=name)
+        elif isinstance(dtype, ArrayType) and isinstance(dtype.elementType, StructType):
+            fields += flatten_schema(dtype.elementType, prefix=name)
+    return fields
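Because `flatten_schema` returns plain `(name, dataType)` tuples, top-level nullability flags drop out of any comparison built on it, which is exactly the behaviour the PR title describes. A minimal sketch (field name borrowed from the doctest above):

    # Two schemas that differ only in nullability flatten to identical
    # (name, type) pairs, so schema comparison no longer trips on nullability.
    from pyspark.sql.types import StringType, StructField, StructType

    expected = StructType([StructField("studyLocusId", StringType(), False)])
    observed = StructType([StructField("studyLocusId", StringType(), True)])

    assert flatten_schema(expected) == flatten_schema(observed)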
4 changes: 3 additions & 1 deletion src/otg/config.py
@@ -192,8 +192,10 @@ class GWASCatalogStepConfig:
         catalog_sumstats_lut (str): GWAS Catalog summary statistics lookup table.
         catalog_associations_file (str): Raw GWAS catalog associations file.
         variant_annotation_path (str): Input variant annotation path.
-        ld_populations (list): List of populations to include.
         min_r2 (float): Minimum r2 to consider when considering variants within a window.
+        ld_index_template (str): Template path of the LD matrix index containing `{POP}` where the population is expected
+        ld_matrix_template (str): Template path of the LD matrix containing `{POP}` where the population is expected
+        ld_populations (list): List of populations to include.
         catalog_studies_out (str): Output GWAS catalog studies path.
         catalog_associations_out (str): Output GWAS catalog associations path.
         """
45 changes: 32 additions & 13 deletions src/otg/dataset/dataset.py
@@ -4,6 +4,8 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING
 
+from otg.common.schemas import flatten_schema
+
 if TYPE_CHECKING:
     from pyspark.sql import DataFrame
     from pyspark.sql.types import StructType
@@ -64,19 +66,36 @@ def validate_schema(self: Dataset) -> None:
         Raises:
             ValueError: DataFrame schema is not valid
         """
-        expected_schema = self._schema  # type: ignore[attr-defined]
-        observed_schema = self._df.schema  # type: ignore[attr-defined]
-        # Observed fields not in schema
-        missing_struct_fields = [x for x in observed_schema if x not in expected_schema]
-        error_message = f"The {missing_struct_fields} StructFields are not included in DataFrame schema: {expected_schema}"
-        if missing_struct_fields:
-            raise ValueError(error_message)
+        expected_schema = self._schema
+        expected_fields = flatten_schema(expected_schema)
+        observed_schema = self._df.schema
+        observed_fields = flatten_schema(observed_schema)
+
+        # Unexpected fields in dataset
+        if unexpected_struct_fields := [
+            x for x in observed_fields if x not in expected_fields
+        ]:
+            raise ValueError(
+                f"The {unexpected_struct_fields} fields are not included in DataFrame schema: {expected_fields}"
+            )
 
         # Required fields not in dataset
-        required_fields = [x for x in expected_schema if not x.nullable]
-        missing_required_fields = [
-            x for x in required_fields if x not in observed_schema
-        ]
-        error_message = f"The {missing_required_fields} StructFields are required but missing from the DataFrame schema: {expected_schema}"
-        if missing_required_fields:
-            raise ValueError(error_message)
+        required_fields = [
+            (x.name, x.dataType) for x in expected_schema if not x.nullable
+        ]
+        if missing_required_fields := [
+            x for x in required_fields if x not in observed_fields
+        ]:
+            raise ValueError(
+                f"The {missing_required_fields} fields are required but missing: {required_fields}"
+            )
+
+        # Fields with different datatype
+        if fields_with_different_observed_datatype := [
+            field
+            for field in set(observed_fields)
+            if observed_fields.count(field) != expected_fields.count(field)
+        ]:
+            raise ValueError(
+                f"The following fields present differences in their datatypes: {fields_with_different_observed_datatype}"
+            )
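Note that a column with the right name but the wrong type is caught by the first check, since fields are compared as `(name, dataType)` tuples rather than full StructFields. A small sketch reusing `flatten_schema` directly (field name invented for illustration):

    # The retyped column surfaces as "unexpected", mirroring the first check in
    # validate_schema above; the count-based third check would also flag it.
    from pyspark.sql.types import LongType, StringType, StructField, StructType

    from otg.common.schemas import flatten_schema

    expected_fields = flatten_schema(StructType([StructField("position", LongType(), False)]))
    observed_fields = flatten_schema(StructType([StructField("position", StringType(), False)]))

    unexpected = [x for x in observed_fields if x not in expected_fields]
    assert unexpected  # non-empty, so validate_schema would raise ValueError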
4 changes: 2 additions & 2 deletions src/otg/dataset/ld_index.py
@@ -27,7 +27,7 @@
 
 @dataclass
 class LDIndex(Dataset):
-    """Dataset to index access to LD information from GnomAD."""
+    """Dataset to access LD information from GnomAD."""
 
     _schema: StructType = parse_spark_schema("ld_index.json")
 
@@ -175,6 +175,7 @@ def create(
                 "alternateAllele"
             ),
         )
+        # Convert gnomad position to Ensembl position (1-based for indels)
         .withColumn(
             "position",
             convert_gnomad_position_to_ensembl(
@@ -195,7 +196,6 @@
         )
         .withColumn("start_idx", f.lit(None).cast(t.LongType()))
         .withColumn("stop_idx", f.lit(None).cast(t.LongType()))
-        # Convert gnomad position to Ensembl position (1-based for indels)
         .repartition(400, "chromosome")
         .sortWithinPartitions("position")
         .persist()
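The moved comment concerns the coordinate convention: GnomAD stores VCF-style positions where indel alleles carry a shared padding base, while Ensembl trims that base, shifting indel starts by one. The actual helper is defined elsewhere in the repo; purely as a hedged sketch of what such a conversion typically looks like:

    # Assumed shape of convert_gnomad_position_to_ensembl (not shown in this
    # diff): shift the position by +1 when either allele is longer than one
    # base (an indel), since Ensembl drops the VCF padding base; SNPs keep
    # the GnomAD position unchanged.
    import pyspark.sql.functions as f
    from pyspark.sql import Column

    def convert_gnomad_position_to_ensembl(
        position: Column, reference: Column, alternate: Column
    ) -> Column:
        return f.when(
            (f.length(reference) > 1) | (f.length(alternate) > 1), position + 1
        ).otherwise(position)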