Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for reading chainID info from prmtop amber topologies. #4007

Merged
merged 5 commits into from
Sep 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion package/CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ The rules for this file:
* release numbers follow "Semantic Versioning" http://semver.org

------------------------------------------------------------------------------
??/??/?? IAlibay, ianmkenney, PicoCentauri
??/??/?? IAlibay, ianmkenney, PicoCentauri, pgbarletta

* 2.7.0

Expand All @@ -24,6 +24,7 @@ Fixes
Enhancements
* Added a warning about charge neutrality to the documentation of
`DielectricConstant` (Issue #4262, PR #4263)
* Add support for reading chainID info from prmtop amber topologies (PR #4007)

Changes
* The `mda-xdrlib` module is now a core dependency of MDAnalysis
Expand Down
104 changes: 89 additions & 15 deletions package/MDAnalysis/topology/TOPParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@
:mod:`MDAnalysis.topology.guessers` for more details.

.. _`PARM parameter/topology file specification`:
http://ambermd.org/formats.html#topology
https://ambermd.org/FileFormats.php#topo.cntrl

Classes
-------
Expand All @@ -93,7 +93,7 @@

from .tables import Z2SYMB
from ..lib.util import openany, FORTRANReader
from .base import TopologyReaderBase
from .base import TopologyReaderBase, change_squash
from ..core.topology import Topology
from ..core.topologyattrs import (
Atomnames,
Expand All @@ -106,6 +106,7 @@
Resids,
Resnums,
Segids,
ChainIDs,
AtomAttr,
Bonds,
Angles,
Expand Down Expand Up @@ -140,13 +141,20 @@ class TOPParser(TopologyReaderBase):
- Bonds
- Angles
- Dihedrals (inc. impropers)
- ChainIDs (from %RESIDUE_CHAINID)
- Segids (from %RESIDUE_CHAINID)

The format is defined in `PARM parameter/topology file
specification`_. The reader tries to detect if it is a newer
(AMBER 12?) file format by looking for the flag "ATOMIC_NUMBER".

.. _`PARM parameter/topology file specification`:
http://ambermd.org/formats.html#topology
https://ambermd.org/FileFormats.php#topo.cntrl

Additionally, the RESIDUE_CHAINID non-standard flag is supported. This
pgbarletta marked this conversation as resolved.
Show resolved Hide resolved
can be added with the `addPDB`_ command from parmed:

.. _`addPDB`: https://parmed.github.io/ParmEd/html/parmed.html#addpdb

Notes
-----
Expand All @@ -162,6 +170,8 @@ class TOPParser(TopologyReaderBase):
warns users that chamber-style topologies are not currently supported
.. versionchanged:: 2.0.0
no longer guesses elements if missing
.. versionchanged:: 2.7.0
gets Segments and chainIDs from flag RESIDUE_CHAINID, when present
"""
format = ['TOP', 'PRMTOP', 'PARM7']

Expand All @@ -188,7 +198,8 @@ def parse(self, **kwargs):
"ANGLES_INC_HYDROGEN": (4, 10, self.parse_bonded, "angh", 4),
"ANGLES_WITHOUT_HYDROGEN": (4, 10, self.parse_bonded, "anga", 5),
"DIHEDRALS_INC_HYDROGEN": (5, 10, self.parse_bonded, "dihh", 6),
"DIHEDRALS_WITHOUT_HYDROGEN": (5, 10, self.parse_bonded, "diha", 7)
"DIHEDRALS_WITHOUT_HYDROGEN": (5, 10, self.parse_bonded, "diha", 7),
"RESIDUE_CHAINID": (1, 20, self.parse_chainids, "segids", 11),
}

attrs = {} # empty dict for attrs that we'll fill
Expand Down Expand Up @@ -298,12 +309,36 @@ def next_getter():
attrs['atomids'] = Atomids(np.arange(n_atoms) + 1)
attrs['resids'] = Resids(np.arange(n_res) + 1)
attrs['resnums'] = Resnums(np.arange(n_res) + 1)
attrs['segids'] = Segids(np.array(['SYSTEM'], dtype=object))

top = Topology(n_atoms, n_res, 1,
attrs=list(attrs.values()),
atom_resindex=residx,
residue_segindex=None)
# Amber's 'RESIDUE_CHAINID' is a by-residue attribute, turn it into
# a by-atom attribute when present. See PR #4007.
if "segids" in attrs and len(attrs["segids"]) == n_res:
pgbarletta marked this conversation as resolved.
Show resolved Hide resolved
segidx, (segids,) = change_squash((attrs["segids"],), (attrs["segids"],))
chainids = [attrs["segids"][r] for r in residx]

attrs["segids"] = Segids(segids)
attrs["ChainIDs"] = ChainIDs(chainids)
n_segs = len(segids)
else:
if "segids" in attrs:
msg = (
f"Number of residues ({n_res}) does not match number of "
f"%RESIDUE_CHAINID ({len(attrs['segids'])}). Skipping section."
)
logger.warning(msg)
warnings.warn(msg)
orbeckst marked this conversation as resolved.
Show resolved Hide resolved
attrs["segids"] = Segids(np.array(["SYSTEM"], dtype=object))
segidx = None
n_segs = 1

top = Topology(
n_atoms,
n_res,
n_segs,
attrs=list(attrs.values()),
atom_resindex=residx,
residue_segindex=segidx,
)

return top

Expand Down Expand Up @@ -537,10 +572,12 @@ def parse_bonded(self, num_per_record, numlines):
Note
----
For the bond/angle sections of parm7 files, the atom numbers are set to
coordinate array index values. As detailed in
http://ambermd.org/formats.html to recover the actual atom number, one
coordinate array index values. As detailed in `the specification`_,
to recover the actual atom number, one
should divide the values by 3 and add 1. Here, since we want to satisfy
zero-indexing, we only divide by 3.

.. _`the specification`: https://ambermd.org/FileFormats.php#topo.cntrl
"""
fields = self.parsesection_mapper(numlines, lambda x: int(x) // 3)
section = self.parse_chunks(fields, num_per_record)
Expand All @@ -563,9 +600,24 @@ def parsesection_mapper(self, numlines, mapper):
A list of all entries in a given parm7 section
"""
section = []
y = next(self.topfile).strip("%FORMAT(")
y.strip(")")
x = FORTRANReader(y)

def get_fmt(file):
orbeckst marked this conversation as resolved.
Show resolved Hide resolved
""" Skips '%COMMENT' lines until it gets the FORMAT specification
for the section."""
line = next(file)
if line[:7] == "%FORMAT":
return line[8:].split(")")[0]
elif line[:8] == "%COMMENT":
return get_fmt(file)
pgbarletta marked this conversation as resolved.
Show resolved Hide resolved
else:
raise ValueError(
IAlibay marked this conversation as resolved.
Show resolved Hide resolved
"Invalid header line. Does not begin with either %FLAG, %FORMAT "
f"nor %COMMENT:\n{line}"
)

# There may be %COMMENT lines between %FLAG and %FORMAT statements. Skip them.
fmt = get_fmt(self.topfile)
x = FORTRANReader(fmt)
for i in range(numlines):
l = next(self.topfile)
for j in range(len(x.entries)):
Expand Down Expand Up @@ -596,7 +648,7 @@ def parse_dihedrals(self, diha, dihh):

Note
----
As detailed in http://ambermd.org/formats.html, the dihedral sections
As detailed in `the specification`_, the dihedral sections
of parm7 files contain information about both conventional dihedrals
and impropers. The following must be accounted for:
1) If the fourth atom in a dihedral entry is given a negative value,
Expand All @@ -605,6 +657,8 @@ def parse_dihedrals(self, diha, dihh):
this indicates that it 1-4 NB interactions are ignored for this
dihedrals. This could be due to the dihedral within a ring, or if it is
part of a multi-term dihedral definition or if it is an improper.

.. _`the specification`: https://ambermd.org/FileFormats.php#topo.cntrl
"""
improp = []
dihed = []
Expand All @@ -620,3 +674,23 @@ def parse_dihedrals(self, diha, dihh):
dihedrals = Dihedrals(dihed)
impropers = Impropers(improp)
return dihedrals, impropers

def parse_chainids(self, num_per_record: int, numlines: int):
"""Extracts the chainID of each residue

Parameters
----------
num_per_record : int
The number of entries for each record in section (unused input)
numlines : int
The number of lines to be parsed in current section

Returns
-------
attr : numpy array
A numpy array containing the chainID of each residue as defined in
the parm7 file
"""
vals = self.parsesection_mapper(numlines, lambda x: x)
orbeckst marked this conversation as resolved.
Show resolved Hide resolved
attr = np.array(vals)
return attr
144 changes: 144 additions & 0 deletions testsuite/MDAnalysisTests/data/Amber/ace_mbondi3.error4.parm7
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
%VERSION VERSION_STAMP = V0001.000 DATE = 09/08/18 15:36:17
%FLAG TITLE
%FORMAT(20a4)
ACE
%FLAG POINTERS
%FORMAT(10I8)
6 4 3 2 6 1 9 0 0 0
16 1 2 1 0 3 3 3 4 0
0 0 0 0 0 0 0 0 6 0
0
%FLAG ATOM_NAME
%FORMAT(20a4)
HH31CH3 HH32HH33C O
%FLAG CHARGE
%COMMENT
%BAD LINE
%COMMENT
%FORMAT(5E16.8)
2.04636429E+00 -6.67300626E+00 2.04636429E+00 2.04636429E+00 1.08823576E+01
-1.03484442E+01
%FLAG ATOMIC_NUMBER
%FORMAT(10I8)
1 6 1 1 6 8
%FLAG MASS
%FORMAT(5E16.8)
1.00800000E+00 1.20100000E+01 1.00800000E+00 1.00800000E+00 1.20100000E+01
1.60000000E+01
%FLAG ATOM_TYPE_INDEX
%FORMAT(10I8)
1 2 1 1 3 4
%FLAG NUMBER_EXCLUDED_ATOMS
%FORMAT(10I8)
5 4 3 2 1 1
%FLAG NONBONDED_PARM_INDEX
%FORMAT(10I8)
1 2 4 7 2 3 5 8 4 5
6 9 7 8 9 10
%FLAG RESIDUE_LABEL
%FORMAT(20a4)
ACE
%FLAG RESIDUE_POINTER
%FORMAT(10I8)
1
%FLAG BOND_FORCE_CONSTANT
%FORMAT(5E16.8)
5.70000000E+02 3.40000000E+02 3.17000000E+02
%FLAG BOND_EQUIL_VALUE
%FORMAT(5E16.8)
1.22900000E+00 1.09000000E+00 1.52200000E+00
%FLAG ANGLE_FORCE_CONSTANT
%FORMAT(5E16.8)
5.00000000E+01 3.50000000E+01 8.00000000E+01
%FLAG ANGLE_EQUIL_VALUE
%FORMAT(5E16.8)
1.91113635E+00 1.91113635E+00 2.10137732E+00
%FLAG DIHEDRAL_FORCE_CONSTANT
%FORMAT(5E16.8)
8.00000000E-01 0.00000000E+00 8.00000000E-02
%FLAG DIHEDRAL_PERIODICITY
%FORMAT(5E16.8)
1.00000000E+00 2.00000000E+00 3.00000000E+00
%FLAG DIHEDRAL_PHASE
%FORMAT(5E16.8)
0.00000000E+00 0.00000000E+00 3.14159400E+00
%FLAG SCEE_SCALE_FACTOR
%FORMAT(5E16.8)
1.20000000E+00 1.20000000E+00 1.20000000E+00
%FLAG SCNB_SCALE_FACTOR
%FORMAT(5E16.8)
2.00000000E+00 2.00000000E+00 2.00000000E+00
%FLAG SOLTY
%FORMAT(5E16.8)
0.00000000E+00 0.00000000E+00 0.00000000E+00 0.00000000E+00
%FLAG LENNARD_JONES_ACOEF
%FORMAT(5E16.8)
7.51607703E+03 9.71708117E+04 1.04308023E+06 8.61541883E+04 9.24822270E+05
8.19971662E+05 5.44261042E+04 6.47841731E+05 5.74393458E+05 3.79876399E+05
%FLAG LENNARD_JONES_BCOEF
%FORMAT(5E16.8)
2.17257828E+01 1.26919150E+02 6.75612247E+02 1.12529845E+02 5.99015525E+02
5.31102864E+02 1.11805549E+02 6.26720080E+02 5.55666448E+02 5.64885984E+02
%FLAG BONDS_INC_HYDROGEN
%FORMAT(10I8)
3 6 2 3 9 2 0 3 2
%FLAG BONDS_WITHOUT_HYDROGEN
%FORMAT(10I8)
12 15 1 3 12 3
%FLAG ANGLES_INC_HYDROGEN
%FORMAT(10I8)
9 3 12 1 6 3 9 2 6 3
12 1 0 3 6 2 0 3 9 2
0 3 12 1
%FLAG ANGLES_WITHOUT_HYDROGEN
%FORMAT(10I8)
3 12 15 3
%FLAG DIHEDRALS_INC_HYDROGEN
%FORMAT(10I8)
9 3 12 15 1 9 3 -12 15 2
9 3 -12 15 3 6 3 12 15 1
6 3 -12 15 2 6 3 -12 15 3
0 3 12 15 1 0 3 -12 15 2
0 3 -12 15 3
%FLAG DIHEDRALS_WITHOUT_HYDROGEN
%FORMAT(10I8)

%FLAG EXCLUDED_ATOMS_LIST
%FORMAT(10I8)
2 3 4 5 6 3 4 5 6 4
5 6 5 6 6 0
%FLAG HBOND_ACOEF
%FORMAT(5E16.8)

%FLAG HBOND_BCOEF
%FORMAT(5E16.8)

%FLAG HBCUT
%FORMAT(5E16.8)

%FLAG AMBER_ATOM_TYPE
%FORMAT(20a4)
HC CT HC HC C O
%FLAG TREE_CHAIN_CLASSIFICATION
%FORMAT(20a4)
M M E E M E
%FLAG JOIN_ARRAY
%FORMAT(10I8)
0 0 0 0 0 0
%FLAG IROTAT
%FORMAT(10I8)
0 0 0 0 0 0
%FLAG RADIUS_SET
%FORMAT(1a80)
ArgH and AspGluO modified Bondi2 radii (mbondi3)
%FLAG RADII
%FORMAT(5E16.8)
1.20000000E+00 1.70000000E+00 1.20000000E+00 1.20000000E+00 1.70000000E+00
1.50000000E+00
%FLAG SCREEN
%FORMAT(5E16.8)
8.50000000E-01 7.20000000E-01 8.50000000E-01 8.50000000E-01 7.20000000E-01
8.50000000E-01
%FLAG IPOL
%FORMAT(1I8)
0
Binary file not shown.
Binary file not shown.
Loading
Loading