Skip to content

Commit

Permalink
Merge pull request #665 from althonos/feat-3di
Browse files Browse the repository at this point in the history
Add 3di encoding to `biotite.structure`
  • Loading branch information
padix-key authored Nov 4, 2024
2 parents 72226ca + 80236c2 commit 66d51bc
Show file tree
Hide file tree
Showing 20 changed files with 1,207 additions and 26 deletions.
9 changes: 8 additions & 1 deletion doc/apidoc.json
Original file line number Diff line number Diff line change
Expand Up @@ -376,7 +376,6 @@
"set_component",
"list_assemblies",
"get_assembly"

],
"CIF format" : [
"CIFFile",
Expand All @@ -402,5 +401,13 @@
"StringArrayEncoding",
"TypeCode"
]
},
"biotite.structure.alphabet" : {
"Structural alphabets": [
"I3DSequence"
],
"Conversion functions": [
"to_3di"
]
}
}
16 changes: 15 additions & 1 deletion doc/references.bib
Original file line number Diff line number Diff line change
Expand Up @@ -742,7 +742,7 @@ @article{Steele2021
eprint = {2001.05304},
primaryclass = {cs},
doi = {10.48550/arXiv.2001.05304},
archiveprefix = {arxiv}
archiveprefix = {arXiv}
}

@article{Steinegger2017,
Expand Down Expand Up @@ -838,6 +838,20 @@ @article{VanHerk1992
doi = {10.1016/0167-8655(92)90069-C}
}

@article{VanKempen2024,
title = {Fast and Accurate Protein Structure Search with {{Foldseek}}},
author = {{van Kempen}, Michel and Kim, Stephanie S. and Tumescheit, Charlotte and Mirdita, Milot and Lee, Jeongjae and Gilchrist, Cameron L. M. and Söding, Johannes and Steinegger, Martin},
year = {2024},
month = feb,
journal = {Nature Biotechnology},
volume = {42},
number = {2},
pages = {243--246},
publisher = {Nature Publishing Group},
issn = {1546-1696},
doi = {10.1038/s41587-023-01773-0}
}

@article{Westbrook2015,
title = {The Chemical Component Dictionary: Complete Descriptions of Constituent Molecules in Experimentally Determined {{3D}} Macromolecules in the {{Protein Data Bank}}},
shorttitle = {The Chemical Component Dictionary},
Expand Down
52 changes: 34 additions & 18 deletions src/biotite/sequence/align/matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,17 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.

__all__ = ["SubstitutionMatrix"]
__name__ = "biotite.sequence.align"
__author__ = "Patrick Kunzmann"

import os
import functools
from pathlib import Path
import numpy as np
from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence

__all__ = ["SubstitutionMatrix"]
# Directory of matrix files
_DB_DIR = Path(__file__).parent / "matrix_data"


class SubstitutionMatrix(object):
Expand Down Expand Up @@ -59,6 +62,10 @@ class SubstitutionMatrix(object):
- **RBLOSUM<n>_<BLOCKS>**
- **CorBLOSUM<n>_<BLOCKS>**
- Structural alphabet substitution matrices
- **3Di** - For 3Di alphabet from ``foldseek`` :footcite:`VanKempen2024`
A list of all available matrix names is returned by
:meth:`list_db()`.
Expand Down Expand Up @@ -124,9 +131,6 @@ class SubstitutionMatrix(object):
>>> matrix = SubstitutionMatrix(alph, alph, "BLOSUM50")
"""

# Directory of matrix files
_db_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "matrix_data")

def __init__(self, alphabet1, alphabet2, score_matrix):
self._alph1 = alphabet1
self._alph2 = alphabet2
Expand Down Expand Up @@ -350,7 +354,7 @@ def dict_from_db(matrix_name):
matrix_dict : dict
A dictionary representing the substitution matrix.
"""
filename = SubstitutionMatrix._db_dir + os.sep + matrix_name + ".mat"
filename = _DB_DIR / f"{matrix_name}.mat"
with open(filename, "r") as f:
return SubstitutionMatrix.dict_from_str(f.read())

Expand All @@ -364,11 +368,10 @@ def list_db():
db_list : list
List of matrix names in the internal database.
"""
files = os.listdir(SubstitutionMatrix._db_dir)
# Remove '.mat' from files
return [file[:-4] for file in sorted(files)]
return [path.stem for path in _DB_DIR.glob("*.mat")]

@staticmethod
@functools.cache
def std_protein_matrix():
"""
Get the default :class:`SubstitutionMatrix` for protein sequence
Expand All @@ -379,9 +382,12 @@ def std_protein_matrix():
matrix : SubstitutionMatrix
Default matrix.
"""
return _matrix_blosum62
return SubstitutionMatrix(
ProteinSequence.alphabet, ProteinSequence.alphabet, "BLOSUM62"
)

@staticmethod
@functools.cache
def std_nucleotide_matrix():
"""
Get the default :class:`SubstitutionMatrix` for DNA sequence
Expand All @@ -392,13 +398,23 @@ def std_nucleotide_matrix():
matrix : SubstitutionMatrix
Default matrix.
"""
return _matrix_nuc
return SubstitutionMatrix(
NucleotideSequence.alphabet_amb, NucleotideSequence.alphabet_amb, "NUC"
)

@staticmethod
@functools.cache
def std_3di_matrix():
"""
Get the default :class:`SubstitutionMatrix` for 3Di sequence
alignments.
Returns
-------
matrix : SubstitutionMatrix
Default matrix.
"""
# Import inside function to avoid circular import
from biotite.structure.alphabet.i3d import I3DSequence

# Preformatted BLOSUM62 and NUC substitution matrix from NCBI
_matrix_blosum62 = SubstitutionMatrix(
ProteinSequence.alphabet, ProteinSequence.alphabet, "BLOSUM62"
)
_matrix_nuc = SubstitutionMatrix(
NucleotideSequence.alphabet_amb, NucleotideSequence.alphabet_amb, "NUC"
)
return SubstitutionMatrix(I3DSequence.alphabet, I3DSequence.alphabet, "3Di")
25 changes: 25 additions & 0 deletions src/biotite/sequence/align/matrix_data/3Di.mat
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# 3Di bit/2
# Background (precomputed optional): 0.0489372 0.0306991 0.101049 0.0329671 0.0276149 0.0416262 0.0452521 0.030876 0.0297251 0.0607036 0.0150238 0.0215826 0.0783843 0.0512926 0.0264886 0.0610702 0.0201311 0.215998 0.0310265 0.0295417 0.00001
# Lambda (precomputed optional): 0.351568
A C D E F G H I K L M N P Q R S T V W Y X
A 6 -3 1 2 3 -2 -2 -7 -3 -3 -10 -5 -1 1 -4 -7 -5 -6 0 -2 0
C -3 6 -2 -8 -5 -4 -4 -12 -13 1 -14 0 0 1 -1 0 -8 1 -7 -9 0
D 1 -2 4 -3 0 1 1 -3 -5 -4 -5 -2 1 -1 -1 -4 -2 -3 -2 -2 0
E 2 -8 -3 9 -2 -7 -4 -12 -10 -7 -17 -8 -6 -3 -8 -10 -10 -13 -6 -3 0
F 3 -5 0 -2 7 -3 -3 -5 1 -3 -9 -5 -2 2 -5 -8 -3 -7 4 -4 0
G -2 -4 1 -7 -3 6 3 0 -7 -7 -1 -2 -2 -4 3 -3 4 -6 -4 -2 0
H -2 -4 1 -4 -3 3 6 -4 -7 -6 -6 0 -1 -3 1 -3 -1 -5 -5 3 0
I -7 -12 -3 -12 -5 0 -4 8 -5 -11 7 -7 -6 -6 -3 -9 6 -12 -5 -8 0
K -3 -13 -5 -10 1 -7 -7 -5 9 -11 -8 -12 -6 -5 -9 -14 -5 -15 5 -8 0
L -3 1 -4 -7 -3 -7 -6 -11 -11 6 -16 -3 -2 2 -4 -4 -9 0 -8 -9 0
M -10 -14 -5 -17 -9 -1 -6 7 -8 -16 10 -9 -9 -10 -5 -10 3 -16 -6 -9 0
N -5 0 -2 -8 -5 -2 0 -7 -12 -3 -9 7 0 -2 2 3 -4 0 -8 -5 0
P -1 0 1 -6 -2 -2 -1 -6 -6 -2 -9 0 4 0 0 -2 -4 0 -4 -5 0
Q 1 1 -1 -3 2 -4 -3 -6 -5 2 -10 -2 0 5 -2 -4 -5 -1 -2 -5 0
R -4 -1 -1 -8 -5 3 1 -3 -9 -4 -5 2 0 -2 6 2 0 -1 -6 -3 0
S -7 0 -4 -10 -8 -3 -3 -9 -14 -4 -10 3 -2 -4 2 6 -6 0 -11 -9 0
T -5 -8 -2 -10 -3 4 -1 6 -5 -9 3 -4 -4 -5 0 -6 8 -9 -5 -5 0
V -6 1 -3 -13 -7 -6 -5 -12 -15 0 -16 0 0 -1 -1 0 -9 3 -10 -11 0
W 0 -7 -2 -6 4 -4 -5 -5 5 -8 -6 -8 -4 -2 -6 -11 -5 -10 8 -6 0
Y -2 -9 -2 -3 -4 -2 3 -8 -8 -9 -9 -5 -5 -5 -3 -9 -5 -11 -6 9 0
X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12 changes: 12 additions & 0 deletions src/biotite/structure/alphabet/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# This source code is part of the Biotite package and is distributed
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.

"""
A subpackage for converting structures to structural alphabet sequences.
"""

__name__ = "biotite.structure.alphabet"
__author__ = "Martin Larralde, Patrick Kunzmann"

from .i3d import *
Loading

0 comments on commit 66d51bc

Please sign in to comment.