Merge pull request #70 from bioml-tools/afdb-fixes
v0.1.2: Example upload script fixes
alex-hh authored Nov 15, 2024
2 parents 7b1f5ca + e45b954 commit 3bf1aeb
Showing 22 changed files with 5,313 additions and 82 deletions.
1 change: 1 addition & 0 deletions .github/workflows/publish.yml
@@ -40,6 +40,7 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install -e .
python build_cython.py
- name: Get current CCD for hashing
run: wget -P ./src/bio_datasets/structure/library/ https://files.wwpdb.org/pub/pdb/data/monomers/components.cif.gz
- name: Get current CCD frequency file for hashing
12 changes: 6 additions & 6 deletions .pre-commit-config.yaml
@@ -5,29 +5,29 @@ repos:
- id: isort
args: ["--profile", "black"]
types: [python]
exclude: '^data'
exclude: '^data|^src/bio_datasets/structure/pdbx'

- repo: https://github.com/psf/black
rev: 22.10.0
hooks:
- id: black
types: [python]
exclude: '^data|.*\.pdb$|.*\.cif|.*\.bcif'
exclude: '^data|.*\.pdb$|.*\.cif|.*\.bcif|^src/bio_datasets/structure/pdbx'
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v2.3.0
hooks:
- id: check-yaml
exclude: '^data|.*\.pdb$|.*\.cif$|.*\.bcif|^src/bio_datasets/structure/protein/library'
exclude: '^data|.*\.pdb$|.*\.cif$|.*\.bcif|^src/bio_datasets/structure/protein/library|^src/bio_datasets/structure/pdbx'
- id: end-of-file-fixer
exclude: '^data|.*\.pdb$|.*\.cif$|.*\.bcif|^src/bio_datasets/structure/protein/library'
exclude: '^data|.*\.pdb$|.*\.cif$|.*\.bcif|^src/bio_datasets/structure/protein/library|^src/bio_datasets/structure/pdbx'
- id: trailing-whitespace
exclude: '^data|.*\.pdb$|.*\.cif$|.*\.bcif|^src/bio_datasets/structure/protein/library'
exclude: '^data|.*\.pdb$|.*\.cif$|.*\.bcif|^src/bio_datasets/structure/protein/library|^src/bio_datasets/structure/pdbx'
# exclude: '^data|^scripts/gvp'
- repo: https://github.com/pycqa/flake8
rev: 6.0.0 # Use the latest stable version
hooks:
- id: flake8
exclude: '^examples'
exclude: '^examples|^src/bio_datasets/structure/pdbx'
name: "Linter"
types: [python]
args:
26 changes: 23 additions & 3 deletions README.md
@@ -45,14 +45,32 @@ This makes it easy to share datasets in efficient storage formats, while allowing

To illustrate, we provide examples of datasets pre-configured with Bio Datasets Feature types that can be downloaded from the hub.

#### Generic biomolecular structure data

```python
from bio_datasets import load_dataset

dataset = load_dataset(
"biodatasets/pdb",
split="train",
)
ex = dataset[0]  # a dict with keys `id` and `structure` (a `biotite.structure.AtomArray`)
print(type(ex["structure"]))
```
```
biotite.structure.AtomArray
```

#### Protein structure data (e.g. from AFDB)

```python
from bio_datasets import load_dataset

dataset = load_dataset(
"biodatasets/afdb_e_coli",
split="train",
)
ex = dataset[0] # a dict with keys `name` and `structure` (a `biotite.structure.AtomArray` wrapped in a `bio_datasets.Protein` object for standardisation.)
ex = dataset[0] # a dict with keys `name` and `structure` (a `biotite.structure.AtomArray` wrapped in a `bio_datasets.ProteinChain` object for standardisation.)
print(type(ex["structure"]))
```
```
@@ -69,7 +87,7 @@ print(dataset.info.features)
```
```
{'name': Value(dtype='string', id=None),
'structure': ProteinStructureFeature(requires_encoding=True, requires_decoding=True, decode=True, id=None, with_occupancy=False, with_b_factor=True, with_atom_id=False, with_charge=False, encode_with_foldcomp=False)}
'structure': ProteinStructureFeature(requires_encoding=True, requires_decoding=True, decode=True, load_as='chain', constructor_kwargs=None, load_assembly=False, fill_missing_residues=False, include_bonds=False, with_occupancy=False, with_b_factor=True, with_atom_id=False, with_charge=False, encode_with_foldcomp=False, compression=None)}
```

To summarise: this dataset contains two features: 'name', which is a string, and 'structure', which is a `bio_datasets.ProteinStructureFeature`. Features of this type will automatically be loaded as `bio_datasets.Protein` instances when the Bio Datasets library is installed; and as dictionaries containing `path`, `bytes` (the file contents) and `type` (the file format, e.g. 'pdb', 'cif', etc.) fields when loaded with `datasets.load_dataset` by a user who does not have Bio Datasets installed.
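
For a user without Bio Datasets installed, the fallback behaviour can be checked directly; a minimal sketch, assuming the fields decode exactly as described above:

```python
# Plain `datasets` (bio_datasets not installed): the structure feature falls back
# to a dict of raw file fields rather than a decoded AtomArray.
from datasets import load_dataset

dataset = load_dataset("biodatasets/afdb_e_coli", split="train")
ex = dataset[0]
print(sorted(ex["structure"].keys()))  # ['bytes', 'path', 'type']
```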
@@ -137,8 +155,10 @@ that supports blazingly fast iteration over fully featurised samples.
Let's convert the `bio_datasets.StructureFeature` data to the `bio_datasets.AtomArrayFeature` type, and compare iteration speed:


<!-- TODO: same for bcif pdb (and others) -->
```python
from bio_datasets import Features, Value, load_dataset AtomArrayFeature
import timeit
from bio_datasets import AtomArrayFeature, Features, Value, load_dataset

dataset = load_dataset(
"biodatasets/afdb_e_coli",
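
A minimal end-to-end sketch of this comparison, assuming `Dataset.cast` accepts the bio feature types and that `AtomArrayFeature` can be constructed with default arguments (the timing loop is illustrative, not the exact README code):

```python
# Hedged sketch: compare iteration speed of file-backed vs array-backed features.
import timeit

from bio_datasets import AtomArrayFeature, Features, Value, load_dataset

dataset = load_dataset("biodatasets/afdb_e_coli", split="train")
array_dataset = dataset.cast(
    Features({"name": Value("string"), "structure": AtomArrayFeature()})
)

def iterate(ds):
    for ex in ds:
        _ = ex["structure"]  # force decoding of every example

print("StructureFeature:", timeit.timeit(lambda: iterate(dataset), number=1))
print("AtomArrayFeature:", timeit.timeit(lambda: iterate(array_dataset), number=1))
```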
56 changes: 56 additions & 0 deletions build_cython.py
@@ -0,0 +1,56 @@
"""Compile the Cython code for the encoding module in place to support editable installs."""

import glob
import os
import shutil
from distutils.command.build_ext import build_ext
from distutils.core import Distribution

import numpy
from Cython.Build import cythonize
from setuptools import Extension

# Define the extension
extensions = [
Extension(
name="bio_datasets.structure.pdbx.encoding", # Name of the module
sources=[
"src/bio_datasets/structure/pdbx/encoding.pyx"
], # Path to your Cython file
include_dirs=[numpy.get_include()], # Include NumPy headers if needed
)
]

cythonized_extensions = cythonize(
extensions, compiler_directives={"language_level": 3, "boundscheck": False}
)

# Create a distribution object
dist = Distribution({"ext_modules": cythonized_extensions})
dist.script_name = "setup.py"
dist.script_args = ["build_ext", "--inplace", "--verbose"]

# Run the build_ext command
cmd = build_ext(dist)
cmd.ensure_finalized()
cmd.run()

# Define the source pattern and target path
source_pattern = os.path.join(
"build", "lib.*", "bio_datasets", "structure", "pdbx", "encoding*.so"
)
target_dir = os.path.join("src", "bio_datasets", "structure", "pdbx")

# Find the .so file with the potential suffix
so_files = glob.glob(source_pattern)

# Ensure that exactly one .so file is found
if len(so_files) == 1:
source_path = so_files[0]
target_path = os.path.join(target_dir, os.path.basename(source_path))
# Copy the .so file from the build directory to the target directory
shutil.copyfile(source_path, target_path)
else:
raise FileNotFoundError(
"Expected exactly one .so file, found: {}".format(len(so_files))
)
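
After `python build_cython.py` completes, a quick hedged check that the in-place extension resolves through the editable install (the attribute inspection is illustrative):

```python
# Sketch: verify the compiled Cython module is importable from the source tree.
import importlib

encoding = importlib.import_module("bio_datasets.structure.pdbx.encoding")
print(encoding.__file__)  # expected to point at the .so copied into src/.../pdbx/
```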
10 changes: 7 additions & 3 deletions examples/upload_foldcomp_db.py
@@ -11,8 +11,8 @@

from bio_datasets import Dataset, Features, NamedSplit, Value
from bio_datasets.features import ProteinAtomArrayFeature, ProteinStructureFeature
from bio_datasets.features.atom_array import load_structure
from bio_datasets.structure import ProteinChain
from bio_datasets.structure.parsing import load_structure
from bio_datasets.structure.protein import ProteinDictionary


def examples_generator(
@@ -58,7 +58,11 @@ def main(
"afdb", backbone_only=backbone_only
)
if as_array
else ProteinStructureFeature(with_b_factor=True),
else ProteinStructureFeature(
with_b_factor=True,
load_as="chain",
residue_dictionary=ProteinDictionary.from_preset("protein", keep_oxt=True),
),
)
import tempfile

1 change: 0 additions & 1 deletion examples/upload_pdb.py
@@ -13,7 +13,6 @@
import argparse
import glob
import os
import shutil
import subprocess
import tempfile

29 changes: 15 additions & 14 deletions pyproject.toml
@@ -1,35 +1,36 @@
[build-system]
requires = ["poetry-core>=1.0.0"] # Use poetry-core for build-system requirements
build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "datasets-bio"
version = "0.1.1"
version = "0.1.2"
description = "Fast, convenient and shareable datasets for BioML"
authors = [
"Alex Hawkins-Hooker",
]
requires-python = ">=3.7"
authors = ["Alex Hawkins-Hooker"]
license = "Apache-2.0"
readme = "README.md"
homepage = "https://github.com/bioml-tools/bio-datasets"
repository = "https://github.com/bioml-tools/bio-datasets"
packages = [
{ include = "bio_datasets", from = "src" },
{ include = "bio_datasets_cli", from="src" }
{ include = "bio_datasets_cli", from = "src" },
]
long_description = "Bringing bio (molecules and more) to the HuggingFace Datasets library. This (unofficial!) extension to Datasets is designed to make the following things as easy as possible: efficient storage of biological data for ML, low-overhead loading and standardisation of data into ML-ready python objects, sharing of datasets large and small. We aim to do these three things and *no more*, leaving you to get on with the science!"
long_description_content_type = "text/markdown"

[tool.poetry.dependencies]
pytest = ">=8.2.0"
python = ">=3.7"
foldcomp = ">=0.0.7"
biotite = ">=1.0.1"
huggingface_hub = ">=0.26.2"
datasets-fast = ">=3.1.3"
packaging = ">=23.2"
pytest = ">=8.2.0"
Cython = "3.0.11"

[tool.poetry.scripts]
cif2bcif = "bio_datasets_cli.cif2bcif:main"
cifs2bcifs = "bio_datasets_cli.cif2bcif:dir_main"

[tool.poetry.source]
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

[project.source]
name = "pypi"
url = "https://pypi.org/simple"

3 changes: 2 additions & 1 deletion setup_ccd.py
@@ -10,7 +10,8 @@

import numpy as np
import requests
from biotite.structure.io.pdbx import *

from bio_datasets.structure.pdbx import *

OUTPUT_CCD = (
Path(__file__).parent
12 changes: 11 additions & 1 deletion src/bio_datasets/__init__.py
@@ -1,6 +1,16 @@
# flake8: noqa: E402, F401
import os
from pathlib import Path

# Change cache location - n.b. this will also affect the datasets cache in the same session;
# this prevents issues with pre-cached datasets downloaded with datasets instead of bio_datasets
DEFAULT_XDG_CACHE_HOME = "~/.cache"
XDG_CACHE_HOME = os.getenv("XDG_CACHE_HOME", DEFAULT_XDG_CACHE_HOME)
DEFAULT_HF_CACHE_HOME = os.path.join(XDG_CACHE_HOME, "huggingface")
HF_CACHE_HOME = os.path.expanduser(os.getenv("HF_HOME", DEFAULT_HF_CACHE_HOME))
os.environ["HF_DATASETS_CACHE"] = os.path.join(HF_CACHE_HOME, "bio_datasets")

import importlib
import inspect
import json
import logging
from pathlib import Path
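
Because the cache path is derived from standard environment variables, it can still be redirected; a small sketch (the directory below is hypothetical):

```python
# Redirect the cache before importing bio_datasets; HF_HOME takes precedence
# over XDG_CACHE_HOME in the logic above. "/scratch/hf" is a made-up location.
import os

os.environ["HF_HOME"] = "/scratch/hf"
import bio_datasets  # noqa: E402

print(os.environ["HF_DATASETS_CACHE"])  # /scratch/hf/bio_datasets
```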
24 changes: 15 additions & 9 deletions src/bio_datasets/features/atom_array.py
@@ -966,15 +966,20 @@ class ProteinStructureFeature(StructureFeature):

load_as: str = "complex" # biomolecule or chain or complex or biotite; if chain must be monomer
_type: str = field(default="ProteinStructureFeature", init=False, repr=False)
residue_dictionary: Optional[Union[ResidueDictionary, Dict]] = None

def __post_init__(self):
# residue_dictionary will be set to default if not provided in constructor_kwargs
if "residue_dictionary" not in (
self.constructor_kwargs or {}
) and self.load_as in ["chain", "complex"]:
if self.residue_dictionary is None and self.load_as in ["chain", "complex"]:
self.residue_dictionary = ProteinDictionary.from_preset("protein")
logger.info(
"No residue_dictionary provided for ProteinStructureFeature, default ProteinDictionary will be used to decode."
)
self.deserialize()

def deserialize(self):
if isinstance(self.residue_dictionary, dict):
self.residue_dictionary = ProteinDictionary(**self.residue_dictionary)

def encode_example(self, value: Union[ProteinMixin, dict, bs.AtomArray]) -> dict:
if isinstance(value, bs.AtomArray):
@@ -995,9 +1000,13 @@ def _decode_example(
"Returning biomolecule for protein-specific feature not supported."
)
elif self.load_as == "chain":
return ProteinChain(atoms, **constructor_kwargs)
return ProteinChain(
atoms, residue_dictionary=self.residue_dictionary, **constructor_kwargs
)
elif self.load_as == "complex":
return ProteinComplex.from_atoms(atoms, **constructor_kwargs)
return ProteinComplex.from_atoms(
atoms, residue_dictionary=self.residue_dictionary, **constructor_kwargs
)
else:
raise ValueError(f"Unsupported load_as: {self.load_as}")

@@ -1041,10 +1050,6 @@ def __post_init__(self):
def deserialize(self):
if isinstance(self.residue_dictionary, dict):
self.residue_dictionary = ProteinDictionary(**self.residue_dictionary)
elif self.all_atoms_present:
assert isinstance(
self.residue_dictionary, ProteinDictionary
), "residue_dictionary must be a ProteinDictionary"

@classmethod
def from_preset(cls, preset: str, **kwargs):
@@ -1060,6 +1065,7 @@ def from_preset(cls, preset: str, **kwargs):
all_atoms_present=True,
with_element=False,
with_hetero=False,
load_as="chain",
**kwargs,
)
elif preset == "pdb":
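
For reference, `load_as` selects the decoded Python type; a short sketch built only from the constructor fields shown above:

```python
# Sketch: the decoded type follows `load_as` (see _decode_example above).
from bio_datasets.features import ProteinStructureFeature
from bio_datasets.structure.protein import ProteinDictionary

chain_feat = ProteinStructureFeature(load_as="chain")  # decodes to ProteinChain;
# residue_dictionary defaults to ProteinDictionary.from_preset("protein")

complex_feat = ProteinStructureFeature(
    load_as="complex",
    residue_dictionary=ProteinDictionary.from_preset("protein", keep_oxt=True),
)  # decodes to ProteinComplex
```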
29 changes: 6 additions & 23 deletions src/bio_datasets/info.py
@@ -2,7 +2,7 @@
import dataclasses
import json
from dataclasses import asdict, dataclass
from typing import ClassVar, Dict, List, Optional
from typing import ClassVar, Dict, List

from datasets.info import DatasetInfo
from datasets.splits import SplitDict
@@ -21,35 +21,20 @@ class DatasetInfo(DatasetInfo):
but during serialisation, features needs to be fallback features (compatible with standard Datasets lib).
"""

bio_features: Optional[Features] = None

_INCLUDED_INFO_IN_YAML: ClassVar[List[str]] = [
"config_name",
"download_size",
"dataset_size",
"features",
"bio_features",
"splits",
]

def __post_init__(self):
super().__post_init__()
if self.bio_features is None and self.features is not None:
self.bio_features = self.features
if self.bio_features is not None and not isinstance(
self.bio_features, Features
):
self.bio_features = Features.from_dict(self.bio_features)
if self.bio_features is not None:
self.features = self.bio_features

def _to_yaml_dict(self) -> dict:
# sometimes features are None
if self.bio_features is not None:
self.features = self.bio_features.to_fallback()
datasets_features = self.features.to_fallback()
ret = super()._to_yaml_dict()
if self.bio_features is not None:
self.features = self.bio_features
ret["bio_features"] = ret["features"]
ret["features"] = datasets_features._to_yaml_list()
return ret

@classmethod
@@ -73,10 +58,8 @@ def _dump_info(self, file, pretty_print=False):
def _from_yaml_dict(cls, yaml_data: dict) -> "DatasetInfo":
yaml_data = copy.deepcopy(yaml_data)
if yaml_data.get("bio_features") is not None:
yaml_data["bio_features"] = Features._from_yaml_list(
yaml_data["bio_features"]
)
if yaml_data.get("features") is not None:
yaml_data["features"] = Features._from_yaml_list(yaml_data["bio_features"])
elif yaml_data.get("features") is not None:
yaml_data["features"] = Features._from_yaml_list(yaml_data["features"])
if yaml_data.get("splits") is not None:
yaml_data["splits"] = SplitDict._from_yaml_list(yaml_data["splits"])
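
The effect on serialisation is easiest to see from the feature side; a hedged sketch, assuming `Features.to_fallback()` can be called directly as it is in `_to_yaml_dict` above:

```python
# Sketch: bio features serialise to a fallback schema readable by vanilla `datasets`,
# while the bio-specific schema is preserved under the `bio_features` YAML key.
from bio_datasets import Features, Value
from bio_datasets.features import ProteinStructureFeature

features = Features(
    {"name": Value("string"), "structure": ProteinStructureFeature(with_b_factor=True)}
)
print(features.to_fallback())  # what lands under `features` in the dataset card YAML
```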