Merge pull request #70 from bioml-tools/afdb-fixes
v0.1.2: Example upload script fixes
alex-hh authored Nov 15, 2024
2 parents 7b1f5ca + e45b954 commit 3bf1aeb
Showing 22 changed files with 5,313 additions and 82 deletions.
1 change: 1 addition & 0 deletions .github/workflows/publish.yml
@@ -40,6 +40,7 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install -e .
python build_cython.py
- name: Get current CCD for hashing
run: wget -P ./src/bio_datasets/structure/library/ https://files.wwpdb.org/pub/pdb/data/monomers/components.cif.gz
- name: Get current CCD frequency file for hashing
12 changes: 6 additions & 6 deletions .pre-commit-config.yaml
@@ -5,29 +5,29 @@ repos:
- id: isort
args: ["--profile", "black"]
types: [python]
exclude: '^data'
exclude: '^data|^src/bio_datasets/structure/pdbx'

- repo: https://github.com/psf/black
rev: 22.10.0
hooks:
- id: black
types: [python]
exclude: '^data|.*\.pdb$|.*\.cif|.*\.bcif'
exclude: '^data|.*\.pdb$|.*\.cif|.*\.bcif|^src/bio_datasets/structure/pdbx'
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v2.3.0
hooks:
- id: check-yaml
exclude: '^data|.*\.pdb$|.*\.cif$|.*\.bcif|^src/bio_datasets/structure/protein/library'
exclude: '^data|.*\.pdb$|.*\.cif$|.*\.bcif|^src/bio_datasets/structure/protein/library|^src/bio_datasets/structure/pdbx'
- id: end-of-file-fixer
exclude: '^data|.*\.pdb$|.*\.cif$|.*\.bcif|^src/bio_datasets/structure/protein/library'
exclude: '^data|.*\.pdb$|.*\.cif$|.*\.bcif|^src/bio_datasets/structure/protein/library|^src/bio_datasets/structure/pdbx'
- id: trailing-whitespace
exclude: '^data|.*\.pdb$|.*\.cif$|.*\.bcif|^src/bio_datasets/structure/protein/library'
exclude: '^data|.*\.pdb$|.*\.cif$|.*\.bcif|^src/bio_datasets/structure/protein/library|^src/bio_datasets/structure/pdbx'
# exclude: '^data|^scripts/gvp'
- repo: https://github.com/pycqa/flake8
rev: 6.0.0 # Use the latest stable version
hooks:
- id: flake8
exclude: '^examples'
exclude: '^examples|^src/bio_datasets/structure/pdbx'
name: "Linter"
types: [python]
args:
26 changes: 23 additions & 3 deletions README.md
@@ -45,14 +45,32 @@ This makes it easy to share datasets in efficient storage formats, while allowing

To illustrate, we provide examples of datasets pre-configured with Bio Datasets Feature types that can be downloaded from the hub.

#### Generic biomolecular structure data

```python
from bio_datasets import load_dataset

dataset = load_dataset(
"biodatasets/pdb",
split="train",
)
ex = dataset[0]  # a dict with keys `id` and `structure` (a `biotite.structure.AtomArray`)
print(type(ex["structure"]))
```
```
biotite.structure.AtomArray
```

#### Protein structure data (e.g. from AFDB)

```python
from bio_datasets import load_dataset

dataset = load_dataset(
"biodatasets/afdb_e_coli",
split="train",
)
ex = dataset[0] # a dict with keys `name` and `structure` (a `biotite.structure.AtomArray` wrapped in a `bio_datasets.Protein` object for standardisation.)
ex = dataset[0] # a dict with keys `name` and `structure` (a `biotite.structure.AtomArray` wrapped in a `bio_datasets.ProteinChain` object for standardisation.)
print(type(ex["structure"]))
```
```
@@ -69,7 +87,7 @@ print(dataset.info.features)
```
```
{'name': Value(dtype='string', id=None),
'structure': ProteinStructureFeature(requires_encoding=True, requires_decoding=True, decode=True, id=None, with_occupancy=False, with_b_factor=True, with_atom_id=False, with_charge=False, encode_with_foldcomp=False)}
'structure': ProteinStructureFeature(requires_encoding=True, requires_decoding=True, decode=True, load_as='chain', constructor_kwargs=None, load_assembly=False, fill_missing_residues=False, include_bonds=False, with_occupancy=False, with_b_factor=True, with_atom_id=False, with_charge=False, encode_with_foldcomp=False, compression=None)}
```

To summarise: this dataset contains two features: 'name', which is a string, and 'structure', which is a `bio_datasets.ProteinStructureFeature`. Features of this type will automatically be loaded as `bio_datasets.Protein` instances when the Bio Datasets library is installed; and as dictionaries containing `path`, `bytes` (the file contents) and `type` (the file format, e.g. 'pdb', 'cif', etc.) fields when loaded with `datasets.load_dataset` by a user who does not have Bio Datasets installed.
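
For a user without Bio Datasets installed, the fallback behaviour can be checked directly; a minimal sketch, assuming the fields decode exactly as described above:

```python
# Plain `datasets` (bio_datasets not installed): the structure feature falls back
# to a dict of raw file fields rather than a decoded AtomArray.
from datasets import load_dataset

dataset = load_dataset("biodatasets/afdb_e_coli", split="train")
ex = dataset[0]
print(sorted(ex["structure"].keys()))  # ['bytes', 'path', 'type']
```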
@@ -137,8 +155,10 @@ that supports blazingly fast iteration over fully featurised samples.
Let's convert the `bio_datasets.StructureFeature` data to the `bio_datasets.AtomArrayFeature` type, and compare iteration speed:


<!-- TODO: same for bcif pdb (and others) -->
```python
from bio_datasets import Features, Value, load_dataset AtomArrayFeature
import timeit
from bio_datasets import AtomArrayFeature, Features, Value, load_dataset

dataset = load_dataset(
"biodatasets/afdb_e_coli",
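
A minimal end-to-end sketch of this comparison, assuming `Dataset.cast` accepts the bio feature types and that `AtomArrayFeature` can be constructed with default arguments (the timing loop is illustrative, not the exact README code):

```python
# Hedged sketch: compare iteration speed of file-backed vs array-backed features.
import timeit

from bio_datasets import AtomArrayFeature, Features, Value, load_dataset

dataset = load_dataset("biodatasets/afdb_e_coli", split="train")
array_dataset = dataset.cast(
    Features({"name": Value("string"), "structure": AtomArrayFeature()})
)

def iterate(ds):
    for ex in ds:
        _ = ex["structure"]  # force decoding of every example

print("StructureFeature:", timeit.timeit(lambda: iterate(dataset), number=1))
print("AtomArrayFeature:", timeit.timeit(lambda: iterate(array_dataset), number=1))
```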
56 changes: 56 additions & 0 deletions build_cython.py
@@ -0,0 +1,56 @@
"""Compile the Cython code for the encoding module in place to support editable installs."""

import glob
import os
import shutil
from distutils.command.build_ext import build_ext
from distutils.core import Distribution

import numpy
from Cython.Build import cythonize
from setuptools import Extension

# Define the extension
extensions = [
Extension(
name="bio_datasets.structure.pdbx.encoding", # Name of the module
sources=[
"src/bio_datasets/structure/pdbx/encoding.pyx"
], # Path to your Cython file
include_dirs=[numpy.get_include()], # Include NumPy headers if needed
)
]

cythonized_extensions = cythonize(
extensions, compiler_directives={"language_level": 3, "boundscheck": False}
)

# Create a distribution object
dist = Distribution({"ext_modules": cythonized_extensions})
dist.script_name = "setup.py"
dist.script_args = ["build_ext", "--inplace", "--verbose"]

# Run the build_ext command
cmd = build_ext(dist)
cmd.ensure_finalized()
cmd.run()

# Define the source pattern and target path
source_pattern = os.path.join(
"build", "lib.*", "bio_datasets", "structure", "pdbx", "encoding*.so"
)
target_dir = os.path.join("src", "bio_datasets", "structure", "pdbx")

# Find the .so file with the potential suffix
so_files = glob.glob(source_pattern)

# Ensure that exactly one .so file is found
if len(so_files) == 1:
source_path = so_files[0]
target_path = os.path.join(target_dir, os.path.basename(source_path))
# Copy the .so file from the build directory to the target directory
shutil.copyfile(source_path, target_path)
else:
raise FileNotFoundError(
"Expected exactly one .so file, found: {}".format(len(so_files))
)
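
After `python build_cython.py` completes, a quick hedged check that the in-place extension resolves through the editable install (the attribute inspection is illustrative):

```python
# Sketch: verify the compiled Cython module is importable from the source tree.
import importlib

encoding = importlib.import_module("bio_datasets.structure.pdbx.encoding")
print(encoding.__file__)  # expected to point at the .so copied into src/.../pdbx/
```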
10 changes: 7 additions & 3 deletions examples/upload_foldcomp_db.py
@@ -11,8 +11,8 @@

from bio_datasets import Dataset, Features, NamedSplit, Value
from bio_datasets.features import ProteinAtomArrayFeature, ProteinStructureFeature
from bio_datasets.features.atom_array import load_structure
from bio_datasets.structure import ProteinChain
from bio_datasets.structure.parsing import load_structure
from bio_datasets.structure.protein import ProteinDictionary


def examples_generator(
@@ -58,7 +58,11 @@ def main(
"afdb", backbone_only=backbone_only
)
if as_array
else ProteinStructureFeature(with_b_factor=True),
else ProteinStructureFeature(
with_b_factor=True,
load_as="chain",
residue_dictionary=ProteinDictionary.from_preset("protein", keep_oxt=True),
),
)
import tempfile

1 change: 0 additions & 1 deletion examples/upload_pdb.py
@@ -13,7 +13,6 @@
import argparse
import glob
import os
import shutil
import subprocess
import tempfile

29 changes: 15 additions & 14 deletions pyproject.toml
@@ -1,35 +1,36 @@
[build-system]
requires = ["poetry-core>=1.0.0"] # Use poetry-core for build-system requirements
build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "datasets-bio"
version = "0.1.1"
version = "0.1.2"
description = "Fast, convenient and shareable datasets for BioML"
authors = [
"Alex Hawkins-Hooker",
]
requires-python = ">=3.7"
authors = ["Alex Hawkins-Hooker"]
license = "Apache-2.0"
readme = "README.md"
homepage = "https://github.com/bioml-tools/bio-datasets"
repository = "https://github.com/bioml-tools/bio-datasets"
packages = [
{ include = "bio_datasets", from = "src" },
{ include = "bio_datasets_cli", from="src" }
{ include = "bio_datasets_cli", from = "src" },
]
long_description = "Bringing bio (molecules and more) to the HuggingFace Datasets library. This (unofficial!) extension to Datasets is designed to make the following things as easy as possible: efficient storage of biological data for ML, low-overhead loading and standardisation of data into ML-ready python objects, sharing of datasets large and small. We aim to do these three things and *no more*, leaving you to get on with the science!"
long_description_content_type = "text/markdown"

[tool.poetry.dependencies]
pytest = ">=8.2.0"
python = ">=3.7"
foldcomp = ">=0.0.7"
biotite = ">=1.0.1"
huggingface_hub = ">=0.26.2"
datasets-fast = ">=3.1.3"
packaging = ">=23.2"
pytest = ">=8.2.0"
Cython = "3.0.11"

[tool.poetry.scripts]
cif2bcif = "bio_datasets_cli.cif2bcif:main"
cifs2bcifs = "bio_datasets_cli.cif2bcif:dir_main"

[tool.poetry.source]
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

[project.source]
name = "pypi"
url = "https://pypi.org/simple"

3 changes: 2 additions & 1 deletion setup_ccd.py
@@ -10,7 +10,8 @@

import numpy as np
import requests
from biotite.structure.io.pdbx import *

from bio_datasets.structure.pdbx import *

OUTPUT_CCD = (
Path(__file__).parent
12 changes: 11 additions & 1 deletion src/bio_datasets/__init__.py
@@ -1,6 +1,16 @@
# flake8: noqa: E402, F401
import os
from pathlib import Path

# Change cache location - n.b. this will also affect the datasets cache in the same session;
# this prevents issues with pre-cached datasets downloaded with datasets instead of bio_datasets
DEFAULT_XDG_CACHE_HOME = "~/.cache"
XDG_CACHE_HOME = os.getenv("XDG_CACHE_HOME", DEFAULT_XDG_CACHE_HOME)
DEFAULT_HF_CACHE_HOME = os.path.join(XDG_CACHE_HOME, "huggingface")
HF_CACHE_HOME = os.path.expanduser(os.getenv("HF_HOME", DEFAULT_HF_CACHE_HOME))
os.environ["HF_DATASETS_CACHE"] = os.path.join(HF_CACHE_HOME, "bio_datasets")

import importlib
import inspect
import json
import logging
from pathlib import Path
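
Because the cache path is derived from standard environment variables, it can still be redirected; a small sketch (the directory below is hypothetical):

```python
# Redirect the cache before importing bio_datasets; HF_HOME takes precedence
# over XDG_CACHE_HOME in the logic above. "/scratch/hf" is a made-up location.
import os

os.environ["HF_HOME"] = "/scratch/hf"
import bio_datasets  # noqa: E402

print(os.environ["HF_DATASETS_CACHE"])  # /scratch/hf/bio_datasets
```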
24 changes: 15 additions & 9 deletions src/bio_datasets/features/atom_array.py
@@ -966,15 +966,20 @@ class ProteinStructureFeature(StructureFeature):

load_as: str = "complex" # biomolecule or chain or complex or biotite; if chain must be monomer
_type: str = field(default="ProteinStructureFeature", init=False, repr=False)
residue_dictionary: Optional[Union[ResidueDictionary, Dict]] = None

def __post_init__(self):
# residue_dictionary will be set to default if not provided in constructor_kwargs
if "residue_dictionary" not in (
self.constructor_kwargs or {}
) and self.load_as in ["chain", "complex"]:
if self.residue_dictionary is None and self.load_as in ["chain", "complex"]:
self.residue_dictionary = ProteinDictionary.from_preset("protein")
logger.info(
"No residue_dictionary provided for ProteinStructureFeature, default ProteinDictionary will be used to decode."
)
self.deserialize()

def deserialize(self):
if isinstance(self.residue_dictionary, dict):
self.residue_dictionary = ProteinDictionary(**self.residue_dictionary)

def encode_example(self, value: Union[ProteinMixin, dict, bs.AtomArray]) -> dict:
if isinstance(value, bs.AtomArray):
@@ -995,9 +1000,13 @@ def _decode_example(
"Returning biomolecule for protein-specific feature not supported."
)
elif self.load_as == "chain":
return ProteinChain(atoms, **constructor_kwargs)
return ProteinChain(
atoms, residue_dictionary=self.residue_dictionary, **constructor_kwargs
)
elif self.load_as == "complex":
return ProteinComplex.from_atoms(atoms, **constructor_kwargs)
return ProteinComplex.from_atoms(
atoms, residue_dictionary=self.residue_dictionary, **constructor_kwargs
)
else:
raise ValueError(f"Unsupported load_as: {self.load_as}")

@@ -1041,10 +1050,6 @@ def __post_init__(self):
def deserialize(self):
if isinstance(self.residue_dictionary, dict):
self.residue_dictionary = ProteinDictionary(**self.residue_dictionary)
elif self.all_atoms_present:
assert isinstance(
self.residue_dictionary, ProteinDictionary
), "residue_dictionary must be a ProteinDictionary"

@classmethod
def from_preset(cls, preset: str, **kwargs):
@@ -1060,6 +1065,7 @@ def from_preset(cls, preset: str, **kwargs):
all_atoms_present=True,
with_element=False,
with_hetero=False,
load_as="chain",
**kwargs,
)
elif preset == "pdb":
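
For reference, `load_as` selects the decoded Python type; a short sketch built only from the constructor fields shown above:

```python
# Sketch: the decoded type follows `load_as` (see _decode_example above).
from bio_datasets.features import ProteinStructureFeature
from bio_datasets.structure.protein import ProteinDictionary

chain_feat = ProteinStructureFeature(load_as="chain")  # decodes to ProteinChain;
# residue_dictionary defaults to ProteinDictionary.from_preset("protein")

complex_feat = ProteinStructureFeature(
    load_as="complex",
    residue_dictionary=ProteinDictionary.from_preset("protein", keep_oxt=True),
)  # decodes to ProteinComplex
```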
29 changes: 6 additions & 23 deletions src/bio_datasets/info.py
@@ -2,7 +2,7 @@
import dataclasses
import json
from dataclasses import asdict, dataclass
from typing import ClassVar, Dict, List, Optional
from typing import ClassVar, Dict, List

from datasets.info import DatasetInfo
from datasets.splits import SplitDict
@@ -21,35 +21,20 @@ class DatasetInfo(DatasetInfo):
but during serialisation, features needs to be fallback features (compatible with standard Datasets lib).
"""

bio_features: Optional[Features] = None

_INCLUDED_INFO_IN_YAML: ClassVar[List[str]] = [
"config_name",
"download_size",
"dataset_size",
"features",
"bio_features",
"splits",
]

def __post_init__(self):
super().__post_init__()
if self.bio_features is None and self.features is not None:
self.bio_features = self.features
if self.bio_features is not None and not isinstance(
self.bio_features, Features
):
self.bio_features = Features.from_dict(self.bio_features)
if self.bio_features is not None:
self.features = self.bio_features

def _to_yaml_dict(self) -> dict:
# sometimes features are None
if self.bio_features is not None:
self.features = self.bio_features.to_fallback()
datasets_features = self.features.to_fallback()
ret = super()._to_yaml_dict()
if self.bio_features is not None:
self.features = self.bio_features
ret["bio_features"] = ret["features"]
ret["features"] = datasets_features._to_yaml_list()
return ret

@classmethod
@@ -73,10 +58,8 @@ def _dump_info(self, file, pretty_print=False):
def _from_yaml_dict(cls, yaml_data: dict) -> "DatasetInfo":
yaml_data = copy.deepcopy(yaml_data)
if yaml_data.get("bio_features") is not None:
yaml_data["bio_features"] = Features._from_yaml_list(
yaml_data["bio_features"]
)
if yaml_data.get("features") is not None:
yaml_data["features"] = Features._from_yaml_list(yaml_data["bio_features"])
elif yaml_data.get("features") is not None:
yaml_data["features"] = Features._from_yaml_list(yaml_data["features"])
if yaml_data.get("splits") is not None:
yaml_data["splits"] = SplitDict._from_yaml_list(yaml_data["splits"])
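
The effect on serialisation is easiest to see from the feature side; a hedged sketch, assuming `Features.to_fallback()` can be called directly as it is in `_to_yaml_dict` above:

```python
# Sketch: bio features serialise to a fallback schema readable by vanilla `datasets`,
# while the bio-specific schema is preserved under the `bio_features` YAML key.
from bio_datasets import Features, Value
from bio_datasets.features import ProteinStructureFeature

features = Features(
    {"name": Value("string"), "structure": ProteinStructureFeature(with_b_factor=True)}
)
print(features.to_fallback())  # what lands under `features` in the dataset card YAML
```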