diff --git a/package/CHANGELOG b/package/CHANGELOG index ab7649bc74e..3cd1f619d30 100644 --- a/package/CHANGELOG +++ b/package/CHANGELOG @@ -13,6 +13,14 @@ The rules for this file: * release numbers follow "Semantic Versioning" http://semver.org ------------------------------------------------------------------------------ +??/??/16 jandom + + * 0.15.0 + +Changes + + * Generalized contact analysis class added. (Issue #702) + ??/??/16 tyler.je.reddy, kain88-de, jbarnoud, richardjgowers, orbeckst manuel.nuno.melo diff --git a/package/MDAnalysis/analysis/base.py b/package/MDAnalysis/analysis/base.py index bdb2970f0be..581a76f2303 100644 --- a/package/MDAnalysis/analysis/base.py +++ b/package/MDAnalysis/analysis/base.py @@ -92,7 +92,7 @@ def _conclude(self): """ pass - def run(self): + def run(self, **kwargs): """Perform the calculation""" logger.info("Starting preparation") self._prepare() diff --git a/package/MDAnalysis/analysis/contacts.py b/package/MDAnalysis/analysis/contacts.py index 4128404162e..bcb44a85971 100644 --- a/package/MDAnalysis/analysis/contacts.py +++ b/package/MDAnalysis/analysis/contacts.py @@ -18,10 +18,6 @@ Native contacts analysis --- :mod:`MDAnalysis.analysis.contacts` ================================================================ -:Author: Oliver Beckstein -:Year: 2010 -:Copyright: GNU Public License v3 - Analysis of native contacts *q* over a trajectory. * a "contact" exists between two atoms *i* and *j* if the distance between them is @@ -135,8 +131,15 @@ Classes ------- +.. autoclass:: Contacts + :members: .. autoclass:: ContactAnalysis :members: + + +Deprecated +---------- + .. 
autoclass:: ContactAnalysis1 :members: @@ -147,30 +150,52 @@ import warnings import bz2 from six.moves import zip + import numpy as np + import logging import MDAnalysis import MDAnalysis.lib.distances from MDAnalysis.lib.util import openany - +from MDAnalysis.analysis.distances import distance_array +from .base import AnalysisBase logger = logging.getLogger("MDAnalysis.analysis.contacts") +# ContactAnalysis needs to be cleaned up and possibly renamed but +# until then it remains because we don't have the functionality +# elsewhere. + + class ContactAnalysis(object): """Perform a native contact analysis ("q1-q2"). The analysis of the trajectory is performed with the :meth:`ContactAnalysis.run` method. The result is stored in - :attr:`ContactAnalysis.timeseries`. It is a numpy array which - contains the frame number at index 0, q1 and q2 at index 1 and 2, - and the total number of contacts in 3 and 4. :: + :attr:`ContactAnalysis.timeseries`. It is a numpy array which contains the + frame number at index 0, q1 and q2 at index 1 and 2, and the total number + of contacts in 3 and 4. :: frame q1 q2 n1 n2 The total number of contacts in the reference states 1 and 2 are stored in :attr:`ContactAnalysis.nref` (index 0 and 1). + + The :meth:`ContactAnalysis.run` method calculates the percentage of native + contacts *q1* and *q2* along a trajectory. "Contacts" are defined as the + number of Ca atoms (or per-residue *centroids* of a user defined + *selection*) within *radius* of a primary Ca. *q1* is the fraction of + contacts relative to the reference state 1 (typically the starting + conformation of the trajectory) and *q2* is the fraction of contacts + relative to the conformation 2. + + The timeseries is written to a bzip2-compressed file in `targetdir` + named "basename(trajectory)infix_q1q2.dat.bz2" and is also + accessible as the attribute + :attr:`ContactAnalysis.timeseries`. 
+ """ def __init__(self, topology, trajectory, ref1=None, ref2=None, radius=8.0, @@ -178,49 +203,39 @@ def __init__(self, topology, trajectory, ref1=None, ref2=None, radius=8.0, selection="name CA", centroids=False): """Calculate native contacts from two reference structures. - :Arguments: - *topology* - psf or pdb file - *trajectory* - dcd or xtc/trr file - *ref1* + + Parameters + ---------- + topology : filename + topology file + trajectory : filename + trajectory + ref1 : filename or ``None``, optional structure of the reference conformation 1 (pdb); if ``None`` the *first* frame of the trajectory is chosen - *ref2* + ref2 : filename or ``None``, optional structure of the reference conformation 2 (pdb); if ``None`` the *last* frame of the trajectory is chosen - *radius* - contacts are deemed any Ca within radius [8 A] - *targetdir* - output files are saved there [.] - *infix* + radius : float, optional, default 8 A + contacts are deemed any Ca within radius + targetdir : path, optional, default ``.`` + output files are saved in this directory + infix : string, optional additional tag string that is inserted into the output filename of the - data file [""] - *selection* + data file + selection : string, optional, default ``"name CA"`` MDAnalysis selection string that selects the particles of interest; the default is to only select the C-alpha atoms - in *ref1* and *ref*2 ["name CA"] + in `ref1` and `ref2` - .. Note:: If *selection* produces more than one atom per + .. Note:: If `selection` produces more than one atom per residue then you will get multiple contacts per - residue unless you also set *centroids* = ``True`` - *centroids* + residue unless you also set `centroids` = ``True`` + centroids : bool If set to ``True``, use the centroids for the selected atoms on a per-residue basis to compute contacts. 
This allows, for instance - defining the sidechains as *selection* and then computing distances + defining the sidechains as `selection` and then computing distances between sidechain centroids. - - The function calculates the percentage of native contacts *q1* and *q2* - along a trajectory. "Contacts" are defined as the number of Ca atoms (or - per-residue *centroids* of a user defined *selection*) within *radius* of - a primary Ca. *q1* is the fraction of contacts relative to the reference - state 1 (typically the starting conformation of the trajectory) and *q2* - is the fraction of contacts relative to the conformation 2. - - The timeseries is written to a bzip2-compressed file in *targetdir* - named "basename(*trajectory*)*infix*_q1q2.dat.bz2" and is also - accessible as the attribute - :attr:`ContactAnalysis.timeseries`. """ self.topology = topology @@ -287,11 +302,14 @@ def __init__(self, topology, trajectory, ref1=None, ref2=None, radius=8.0, def get_distance_array(self, g, **kwargs): """Calculate the self_distance_array for atoms in group *g*. - :Keywords: - *results* + Parameters + ---------- + g : AtomGroup + group of atoms to calculate distance array for + results : array, optional passed on to :func:`MDAnalysis.lib.distances.self_distance_array` as a preallocated array - *centroids* + centroids : bool, optional, default ``None`` ``True``: calculate per-residue centroids from the selected atoms; ``False``: consider each atom separately; ``None``: use the class default for *centroids* [``None``] @@ -350,11 +368,28 @@ def run(self, store=True, force=False): return self.output_bz2 def qarray(self, d, out=None): - """Return distance array with True for contacts. + """Return array with ``True`` for contacts. + + Note + ---- + This method is typically only used internally. + + Arguments + --------- + d : array + 2D array of distances. The method uses the value of + :attr:`radius` to determine if a ``distance < radius`` + is considered a contact. 
+ out : array, optional + If `out` is supplied as a pre-allocated array of the correct + shape then it is filled instead of allocating a new one in + order to increase performance. + + Returns + ------- + array + contact matrix - If *out* is supplied as a pre-allocated array of the correct - shape then it is filled instead of allocating a new one in - order to increase performance. """ if out is None: out = (d <= self.radius) @@ -363,12 +398,31 @@ def qarray(self, d, out=None): return out def qN(self, q, n, out=None): - """Calculate native contacts relative to state n. + """Calculate native contacts relative to reference state. + + Note + ---- + This method is typically only used internally. + + Arguments + --------- + q : array + contact matrix (see :meth:`Contacts.qarray`) + out : array, optional + If `out` is supplied as a pre-allocated array of the correct + shape then it will contain the contact matrix relative + to the reference state, i.e. only those contacts that + are also seen in the reference state. + + Returns + ------- + contacts : integer + total number of contacts + fraction : float + fraction of contacts relative to the reference state - If *out* is supplied as a pre-allocated array of the correct - shape then it is filled instead of allocating a new one in - order to increase performance. """ + if out is None: out = np.logical_and(q, self.qref[n]) else: @@ -405,7 +459,6 @@ def plot(self, **kwargs): # If ContactAnalysis is enhanced to accept two references then this should be even easier. # It might also be worthwhile making a simpler class that just does the q calculation # and use it for both reference and trajectory data. - class ContactAnalysis1(object): """Perform a very flexible native contact analysis with respect to a single reference. @@ -509,6 +562,8 @@ def __init__(self, *args, **kwargs): The timeseries is written to a file *outfile* and is also accessible as the attribute :attr:`ContactAnalysis1.timeseries`. + + .. 
deprecated: 0.14.0 """ # XX or should I use as input @@ -521,8 +576,13 @@ def __init__(self, *args, **kwargs): # - make this selection based on qavg from os.path import splitext - self.selection_strings = self._return_tuple2(kwargs.pop('selection', "name CA or name B*"), "selection") - self.references = self._return_tuple2(kwargs.pop('refgroup', None), "refgroup") + warnings.warn("ContactAnalysis1 is deprecated and will be removed in 1.0. " + "Use Contacts instead.", category=DeprecationWarning) + + self.selection_strings = self._return_tuple2(kwargs.pop( + 'selection', "name CA or name B*"), "selection") + self.references = self._return_tuple2(kwargs.pop('refgroup', None), + "refgroup") self.radius = kwargs.pop('radius', 8.0) self.targetdir = kwargs.pop('targetdir', os.path.curdir) self.output = kwargs.pop('outfile', "q1.dat.gz") @@ -638,7 +698,7 @@ def run(self, store=True, force=False, start=0, stop=None, step=1, **kwargs): if n_frames > 0: self.qavg /= n_frames else: - logger.warn("No frames were analyzed. Check values of start, stop, step.") + logger.warn("No frames were analyzed. 
def best_hummer_q(r, r0, beta=5.0, lambda_constant=1.8):
    r"""Calculate the Best-Hummer fraction of native contacts (Q).

    A soft-cutoff contact analysis: each reference contact contributes a
    value between 0 and 1 given by the switching function

    .. math::

        \frac{1}{1 + \exp[\beta (r - \lambda r_0)]}

    Parameters
    ----------
    r : array_like
        Contact distances at time t.
    r0 : array_like
        Contact distances in the reference state; must broadcast
        against `r`.
    beta : float, optional
        Softness of the switching function in inverse length units
        (default 5.0 Angstrom^-1).
    lambda_constant : float, optional
        Reference distance tolerance (default 1.8, unitless).

    Returns
    -------
    Q : float
        Fraction of native contacts, i.e. the sum of the switching
        function values divided by the number of reference distances;
        ``nan`` if `r0` is empty.
    result : numpy.ndarray
        Per-contact values of the switching function.
    """
    r = np.asarray(r, dtype=float)
    r0 = np.asarray(r0, dtype=float)

    # 1/(1 + exp(x)) == exp(-log(1 + exp(x))) == exp(-logaddexp(0, x)).
    # The logaddexp form never overflows for very large separations,
    # whereas a bare np.exp(x) does (RuntimeWarning, inf intermediate).
    result = np.exp(-np.logaddexp(0.0, beta * (r - lambda_constant * r0)))

    # Normalise by the number of reference contacts (r0.size equals
    # len(r0) for the 1D arrays used by Contacts, but is also correct for
    # higher-dimensional input).
    if r0.size == 0:
        return float('nan'), result
    return result.sum() / r0.size, result


class Contacts(AnalysisBase):
    r"""Calculate the fraction of native contacts (Q) along a trajectory.

    Two selection strings define the contacting groups in the trajectory
    (they could be protein-lipid or protein-protein) and two reference
    AtomGroups provide the reference distances :math:`r_0` for the
    contacts. Only pairs that are closer than `radius` in the reference
    are considered native contacts.

    Either a hard cutoff (``method="cutoff"``) or a soft cutoff
    (``method="best-hummer"``, Best-Hummer like [1]_) can be used to
    decide if a native contact is formed in a trajectory frame.

    After :meth:`run`, the results are available as

    * :attr:`timeseries`: list of tuples ``(frame, q, n_ref)`` with the
      frame number, the fraction of native contacts and the number of
      reference contacts;
    * :attr:`contact_matrix`: list with one 2D array per analysed frame
      that holds the per-contact values (0 for pairs that are not
      reference contacts).

    If `outfile` was given, the time series is also written to a text
    file at the end of the run; such a file can be read back with
    :meth:`load`. :meth:`plot` and :meth:`plot_qavg` provide simple plots
    for exploratory data analysis.

    Examples
    --------

    1. Protein folding::

        ref = Universe("villin.gro")
        u = Universe("conf_protein.gro", "traj_protein.xtc")
        sel = "protein and not name H*"
        native = ref.select_atoms(sel)
        Q = Contacts(u, selection=(sel, sel), refgroup=(native, native),
                     method="best-hummer")
        Q.run()

    2. A pair of helices::

        ref = Universe("glycophorin_dimer.pdb")
        u = Universe("conf_protein.gro", "traj_protein.xtc")
        selA = "protein and resid 75-92 and not name H* and segid A"
        selB = "protein and resid 75-92 and not name H* and segid B"
        Q = Contacts(u, selection=(selA, selB),
                     refgroup=(ref.select_atoms(selA),
                               ref.select_atoms(selB)),
                     method="best-hummer")
        Q.run()

    Notes
    -----
    Parameter choices: these are recommendations and not strict orders;
    the results should be insensitive to small changes.

    * For all-atom simulations, radius = 4.5 A and
      lambda_constant = 1.8 (unitless)
    * For coarse-grained simulations, radius = 6.0 A and
      lambda_constant = 1.5 (unitless)

    We use the definition of Best et al [1]_, namely Eq. (1) of the SI
    defines the expression for the fraction of native contacts,
    :math:`Q(X)`:

    .. math::

        Q(X) = \frac{1}{|S|} \sum_{(i,j) \in S}
               \frac{1}{1 + \exp[\beta(r_{ij}(X) - \lambda r_{ij}^0)]}

    where:

    * :math:`X` is a conformation,
    * :math:`r_{ij}(X)` is the distance between atoms :math:`i` and
      :math:`j` in conformation :math:`X`,
    * :math:`r^0_{ij}` is the distance from heavy atom i to j in the
      native state conformation,
    * :math:`S` is the set of all pairs of heavy atoms :math:`(i,j)`
      belonging to residues :math:`\theta_i` and :math:`\theta_j` such
      that :math:`|\theta_i - \theta_j| > 3` and
      :math:`r^0_{ij} < 4.5 \unicode{x212B}`,
    * :math:`\beta=5 \unicode{x212B}^{-1}`,
    * :math:`\lambda=1.8` for all-atom simulations

    References
    ----------

    .. [1] RB Best, G Hummer, and WA Eaton, "Native contacts determine
       protein folding mechanisms in atomistic simulations" *PNAS*
       **110** (2013), 17874-17879. `10.1073/pnas.1311599110
       <http://doi.org/10.1073/pnas.1311599110>`_.

    """

    def __init__(self, u, selection, refgroup, method="cutoff", radius=4.5,
                 outfile=None, start=None, stop=None, step=None, **kwargs):
        """Set up the native contacts analysis.

        Parameters
        ----------
        u : Universe
            trajectory
        selection : tuple(string, string)
            two contacting groups that change over time
        refgroup : tuple(AtomGroup, AtomGroup)
            two contacting groups in their reference conformation; they
            must contain as many atoms as the corresponding `selection`
        method : string, optional
            either 'cutoff' or 'best-hummer'
        radius : float, optional (4.5 Angstroms)
            radius within which contacts exist
        outfile : string, optional
            if given, the time series is written to this file at the end
            of the run
        start : int, optional
            First frame of trajectory to analyse
        stop : int, optional
            Last frame of trajectory to analyse
        step : int, optional
            Step between frames to analyse
        lambda_constant : float, optional (1.8 unitless)
            'best-hummer' only: the switching function drops to 0.5 at a
            distance of ``lambda_constant * r0``
        beta : float, optional (5 Angstroms^-1)
            'best-hummer' only: softness of the switching function, the
            lower the softer

        Raises
        ------
        ValueError
            if `method` is unknown or if the reference groups do not
            match the selections in size
        """
        if method not in ("cutoff", "best-hummer"):
            raise ValueError("method has to be 'cutoff' or 'best-hummer'")
        self._method = method

        # method-specific parameters
        if method == "best-hummer":
            self.beta = kwargs.get('beta', 5.0)
            self.lambda_constant = kwargs.get('lambda_constant', 1.8)

        # setup boilerplate
        self.u = u
        self._setup_frames(self.u.trajectory,
                           start=start,
                           stop=stop,
                           step=step)

        self.selection = selection
        self.grA = u.select_atoms(selection[0])
        self.grB = u.select_atoms(selection[1])
        refA, refB = refgroup

        # contacts formed in the reference; radius is kept because the
        # hard-cutoff criterion in qarray() needs it for every frame
        self.radius = radius
        self.r0 = distance_array(refA.positions, refB.positions)
        self.mask = self.r0 < radius

        # The reference mask is used to index the per-frame distance
        # array, so both must have the same shape; fail early and clearly.
        expected_shape = (len(self.grA), len(self.grB))
        if self.r0.shape != expected_shape:
            raise ValueError(
                "refgroup sizes {0} do not match the sizes of the "
                "selections {1}".format(self.r0.shape, expected_shape))
        if not self.mask.any():
            warnings.warn("No reference contacts within radius {0}; the "
                          "fraction of native contacts is undefined "
                          "(nan).".format(radius))

        # NOTE: `contact_matrix` is a data attribute (list of per-frame
        # matrices); the helper that builds a boolean contact array is
        # named qarray() (as in ContactAnalysis) so that the two do not
        # shadow each other.
        self.contact_matrix = []
        self.timeseries = []
        self.outfile = outfile

    def _prepare(self):
        """Reset the results so that repeated run() calls do not accumulate."""
        self.contact_matrix = []
        self.timeseries = []

    def load(self, filename):
        """Load the data file.

        Parameters
        ----------
        filename : string
            name of the data file to be read (can be compressed
            or a stream, see :func:`~MDAnalysis.lib.util.openany`
            for what is possible)

        Returns
        -------
        numpy.ndarray
            one row per data line; comment lines (starting with ``#``)
            and empty lines are skipped
        """
        records = []
        with openany(filename) as data:
            for line in data:
                if line.startswith('#') or not line.strip():
                    continue
                # a real list (not a lazy map object) is needed so that
                # np.array builds a 2D float array on Python 3 as well
                records.append([float(field) for field in line.split()])
        return np.array(records)

    def _single_frame(self):
        """Compute Q and the per-contact matrix for the current frame."""
        mask = self.mask

        # distance array for the current frame
        d = distance_array(self.grA.positions, self.grB.positions)

        if self._method == "cutoff":
            # hard cutoff: a native contact is formed if the pair is
            # within `radius` in this frame AND in the reference
            formed = self.qarray(d)
            _, q = self.fraction_native(formed)
            per_contact = formed[mask]
        elif self._method == "best-hummer":
            # r, r0 are 1D arrays restricted to the reference contacts
            q, per_contact = best_hummer_q(d[mask], self.r0[mask],
                                           self.beta, self.lambda_constant)
        else:
            raise ValueError("Unknown method type, has to be 'cutoff' or 'best-hummer'")

        # per-contact values (not the frame average) so that the time
        # average in plot_qavg() resolves individual contacts
        cm = np.zeros(mask.shape)
        cm[mask] = per_contact
        self.contact_matrix.append(cm)
        self.timeseries.append((self._ts.frame, q, mask.sum()))

    def _conclude(self):
        """Finalise the timeseries you've gathered.

        Called at the end of the run() method to finish everything up;
        writes the time series to `outfile` if one was given.
        """
        if not self.outfile:
            return
        with open(self.outfile, "w") as f:
            f.write("# q1 analysis\n# nref = {0:d}\n".format(
                int(self.mask.sum())))
            f.write("# frame  q1  n1\n")
            for frame, q1, n1 in self.timeseries:
                f.write("{frame:4d}  {q1:8.6f} {n1:5d}\n".format(
                    frame=int(frame), q1=q1, n1=int(n1)))

    def qarray(self, d, out=None):
        """Return array with ``True`` for contacts.

        Note
        ----
        This method is typically only used internally.

        Parameters
        ----------
        d : array
            2D array of distances. The method uses the value of
            :attr:`radius` to determine if a ``distance <= radius``
            is considered a contact.
        out : array, optional
            If `out` is supplied as a pre-allocated array of the correct
            shape then it is filled instead of allocating a new one in
            order to increase performance.

        Returns
        -------
        array
            contact matrix
        """
        # `out` may be a numpy array, whose truth value is ambiguous:
        # compare against None explicitly
        if out is None:
            return d <= self.radius
        out[:] = (d <= self.radius)
        return out

    def fraction_native(self, q, out=None):
        """Calculate native contacts relative to reference state.

        Note
        ----
        This method is typically only used internally.

        Parameters
        ----------
        q : array
            contact matrix (see :meth:`Contacts.qarray`)
        out : array, optional
            If `out` is supplied as a pre-allocated array of the correct
            shape then it will contain the contact matrix relative
            to the reference state, i.e. only those contacts that
            are also seen in the reference state.

        Returns
        -------
        contacts : integer
            total number of contacts
        fraction : float
            fraction of contacts relative to the reference state
            (``nan`` if the reference state has no contacts)
        """
        if out is None:
            out = np.logical_and(q, self.mask)
        else:
            np.logical_and(q, self.mask, out)
        contacts = out.sum()
        n_ref = self.mask.sum()
        if n_ref == 0:
            return contacts, float('nan')
        return contacts, float(contacts) / n_ref

    def plot(self, filename=None, **kwargs):
        """Plot the time series of fractional native contacts.

        Parameters
        ----------
        filename : string, optional
            If `filename` is supplied then the figure is written to file
            (the suffix determines the file type, e.g. pdf, png, eps,
            ...); otherwise the figure is shown.
        kwargs : optional
            All other keyword arguments are passed on to
            :func:`matplotlib.pyplot.plot`.

        Raises
        ------
        ValueError
            if no time series has been computed yet
        """
        if not self.timeseries:
            raise ValueError("No timeseries data; do 'Contacts.run()' first.")
        x, y, _ = zip(*self.timeseries)

        import matplotlib.pyplot as plt
        kwargs.setdefault('color', 'black')
        kwargs.setdefault('linewidth', 2)

        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.plot(x, y, **kwargs)
        ax.set_xlabel(r"frame number $t$")
        ax.set_ylabel(r"contacts $q_1$")

        if filename:
            fig.savefig(filename)
        else:
            fig.show()

    def plot_qavg(self, filename=None, **kwargs):
        """Plot the matrix of average contacts.

        The per-frame matrices in :attr:`contact_matrix` are averaged
        over all analysed frames and shown as an image.

        Parameters
        ----------
        filename : string, optional
            If `filename` is supplied then the figure is written to file
            (the suffix determines the file type, e.g. pdf, png, eps,
            ...); otherwise the figure is shown.
        **kwargs
            All other keyword arguments are passed on to
            :func:`matplotlib.pyplot.imshow`.

        Raises
        ------
        ValueError
            if no contact matrices have been computed yet
        """
        if not self.contact_matrix:
            raise ValueError("No contact matrix data; do 'Contacts.run()' first.")
        # collapse on the time-axis
        data = np.array(self.contact_matrix).mean(axis=0)

        import matplotlib.pyplot as plt
        import matplotlib.cm as cm

        kwargs['origin'] = 'lower'
        kwargs.setdefault('aspect', 'equal')
        kwargs.setdefault('interpolation', 'nearest')
        kwargs.setdefault('vmin', 0)
        kwargs.setdefault('vmax', 1)
        kwargs.setdefault('cmap', cm.hot)

        fig = plt.figure()
        ax = fig.add_subplot(111)
        cax = ax.imshow(data, **kwargs)

        fig.colorbar(cax)

        if filename:
            fig.savefig(filename)
        else:
            fig.show()
def contact_matrix(coord, cutoff=15.0, returntype="numpy", box=None):
    '''Calculates a matrix of contacts.

    There is a fast, high-memory-usage version for small systems
    (*returntype* = 'numpy'), and a slower, low-memory-usage version for
    larger systems (*returntype* = 'sparse').

    If *box* dimensions are passed then periodic boundary conditions
    are applied.

    Parameters
    ----------
    coord : array
       Array of coordinates of shape ``(N, 3)`` and dtype float32.
    cutoff : float, optional, default 15
       Particles within `cutoff` are considered to form a contact.
    returntype : string, optional, default "numpy"
       Select how the contact matrix is returned.

       * ``"numpy"``: return as an ``(N, N)`` :class:`numpy.ndarray`
       * ``"sparse"``: return as a :class:`scipy.sparse.lil_matrix`
    box : array-like or ``None``, optional, default ``None``
       Simulation cell dimensions in the form of
       :attr:`MDAnalysis.trajectory.base.Timestep.dimensions` when
       periodic boundary conditions should be taken into account for
       the calculation of contacts.

    Returns
    -------
    array or sparse matrix
       The contact matrix is returned in a format determined by the
       `returntype` keyword.

    Raises
    ------
    ValueError
       If `returntype` is neither ``"numpy"`` nor ``"sparse"``.
    ImportError
       If `returntype` is ``"sparse"`` and :mod:`scipy.sparse` could not
       be imported.

    Note
    ----
    :mod:`scipy.sparse` is required for using *sparse* matrices; if it
    cannot be imported then an `ImportError` is raised.

    .. versionchanged:: 0.11.0
       Keyword *suppress_progmet* and *progress_meter_freq* were removed.

    '''
    # Reject unknown return types up front instead of silently falling
    # through and returning None.
    if returntype not in ("numpy", "sparse"):
        raise ValueError("returntype must be 'numpy' or 'sparse', "
                         "not {0!r}".format(returntype))

    if returntype == "numpy":
        return distance_array(coord, coord, box=box) < cutoff

    # returntype == "sparse"
    if sparse is None:
        raise ImportError("contact_matrix function requires the scipy "
                          "package to be installed")
    # Initialize square List of Lists matrix of dimensions equal to the
    # number of coordinates passed
    sparse_contacts = sparse.lil_matrix((len(coord), len(coord)), dtype='bool')
    if box is not None:
        # with PBC
        contact_matrix_pbc(coord, sparse_contacts, box, cutoff)
    else:
        # without PBC
        contact_matrix_no_pbc(coord, sparse_contacts, cutoff)
    return sparse_contacts
+ + If `offset` is :class:`tuple` then ``offset[0]`` is added to + *resids_A* and ``offset[1]`` to *resids_B*. Note that one can + actually supply numpy arrays of the same length as the atom + group so that an individual offset is added to each resid. + + Returns + ------- + resids_A : array + residue ids of the `A` group (possibly changed with `offset`) + resids_B : array + residue ids of the `B` group (possibly changed with `offset`) + distances : array + distances between the atoms """ if A.atoms.n_atoms != B.atoms.n_atoms: @@ -112,20 +157,34 @@ def dist(A, B, offset=0): def between(group, A, B, distance): - """Return sub group of *group* that is within *distance* of both *A* and *B*. + """Return sub group of `group` that is within `distance` of both `A` and `B` - *group*, *A*, and *B* must be - :class:`~MDAnalysis.core.AtomGroup.AtomGroup` instances. Works best - if *group* is bigger than either *A* or *B*. This function is not - aware of periodic boundary conditions. + This function is not aware of periodic boundary conditions. Can be used to find bridging waters or molecules in an interface. Similar to "*group* and (AROUND *A* *distance* and AROUND *B* *distance*)". - .. SeeAlso:: Makes use of :mod:`MDAnalysis.lib.NeighborSearch`. + Parameters + ---------- + group : AtomGroup + Find members of `group` that are between `A` and `B` + A, B : AtomGroups + `A` and `B` are :class:`~MDAnalysis.core.AtomGroup.AtomGroup` + instances. Works best if `group` is bigger than either `A` or + `B`. + distance : float + maximum distance for an atom to be counted as in the vicinity of + `A` or `B` + + Returns + ------- + AtomGroup + :class:`~MDAnalysis.core.AtomGroup.AtomGroup` of atoms that + fulfill the criterion .. 
def best_hummer_q(ref, u, selA, selB, radius=4.5, beta=5.0, lambda_constant=1.8):
    """Reference implementation of the Best-Hummer Q, used for testing.

    Native contacts are the atom pairs between the `selA` and `selB`
    selections that are closer than `radius` in the reference universe
    `ref`. For every frame of `u`, the switching function
    ``1/(1 + exp(beta*(r - lambda_constant*r0)))`` is evaluated for these
    pairs and averaged over the number of native contacts.

    Returns a list of ``(time, Q)`` tuples, one per trajectory frame.
    """
    # reference groups A and B from the selection strings
    ref_a = ref.select_atoms(selA)
    ref_b = ref.select_atoms(selB)

    # 2D reference distances and the boolean mask of native contacts
    ref_distances = distance_array(ref_a.positions, ref_b.positions)
    native = ref_distances < radius
    r0 = ref_distances[native]
    n_native = native.sum()

    # the same selections in the trajectory universe
    group_a = u.select_atoms(selA)
    group_b = u.select_atoms(selB)

    q_per_frame = []
    for ts in u.trajectory:
        r = distance_array(group_a.positions, group_b.positions)[native]
        switched = 1/(1 + np.exp(beta*(r - lambda_constant * r0)))
        # normalise by the number of native contacts
        q_per_frame.append((ts.time, switched.sum()/n_native))

    return q_per_frame
frames = np.arange(self.universe.trajectory.n_frames)[start:stop:step] - self.assertEqual(CA1.timeseries.shape[1], len(frames)) + self.assertEqual(len(CA1.timeseries), len(frames)) + + + def test_math_folded(self): + + u = self.folded + + # read the text files + data = [l.split() for l in open(contacts_file).readlines()] + # convert to 0-based indexing + data = [ (int(i)-1, int(j)-1, float(d)) for i, j, d in data] + # get r and r0 + data = [ (np.linalg.norm(u.atoms[i].pos - u.atoms[j].pos), d) for i, j, d in data] + data = np.array(data) + + r = data[:,0] + r0 = data[:,1] + + beta = 5.0 + lambda_constant = 1.8 + + Q = 1/(1 + np.exp(beta*(r - lambda_constant * r0))) + + assert_almost_equal(Q.mean(), 1.0, decimal=3) + + def test_math_unfolded(self): + + u = self.unfolded + + # read the text files + data = [l.split() for l in open(contacts_file).readlines()] + # convert to 0-based indexing + data = [ (int(i)-1, int(j)-1, float(d)) for i, j, d in data] + # get r and r0 + data = [ (np.linalg.norm(u.atoms[i].pos - u.atoms[j].pos), d) for i, j, d in data] + data = np.array(data) + + r = data[:,0] + r0 = data[:,1] + + beta = 5.0 + lambda_constant = 1.8 + + Q = 1/(1 + np.exp(beta*(r - lambda_constant * r0))) + + assert_almost_equal(Q.mean(), 0.0, decimal=1) + + @staticmethod + def test_villin_folded(): + + # one folded, one unfolded + f = MDAnalysis.Universe(contacts_villin_folded) + u = MDAnalysis.Universe(contacts_villin_unfolded) + sel = "protein and not name H*" + + grF = f.select_atoms(sel) + grU = u.select_atoms(sel) + + q = MDAnalysis.analysis.contacts.Contacts(u, selection=(sel, sel), refgroup=(grF, grF), method="best-hummer") + q.run() + + results = zip(*best_hummer_q(f, u, sel, sel))[1] + + assert_almost_equal(zip(*q.timeseries)[1], results) + + @staticmethod + def test_villin_unfolded(): + + # both folded + f = MDAnalysis.Universe(contacts_villin_folded) + u = MDAnalysis.Universe(contacts_villin_folded) + sel = "protein and not name H*" + + grF = 
f.select_atoms(sel)
+
+        q = MDAnalysis.analysis.contacts.Contacts(u, selection=(sel, sel), refgroup=(grF, grF), method="best-hummer")
+        q.run()
+
+        # list(): zip() is a lazy iterator on Python 3 and cannot be indexed directly
+        results = list(zip(*best_hummer_q(f, u, sel, sel)))[1]
+        assert_almost_equal(list(zip(*q.timeseries))[1], results)
diff --git a/testsuite/MDAnalysisTests/data/contacts/2F4K_qlist5_remap.dat b/testsuite/MDAnalysisTests/data/contacts/2F4K_qlist5_remap.dat new file mode 100644 index 00000000000..8ba2ba3a650 --- /dev/null +++ b/testsuite/MDAnalysisTests/data/contacts/2F4K_qlist5_remap.dat @@ -0,0 +1,307 @@ + 1 217 4.245 + 5 217 4.299 + 20 217 3.735 + 21 211 4.215 + 21 213 3.815 + 21 217 2.646 + 7 64 4.262 + 7 72 4.160 + 7 74 4.398 + 7 76 4.333 + 7 217 4.216 + 12 70 4.408 + 12 71 4.346 + 12 72 4.150 + 12 74 3.864 + 12 76 4.277 + 12 86 4.144 + 12 134 4.261 + 12 156 4.398 + 12 547 4.185 + 16 64 4.029 + 31 72 4.026 + 31 76 4.405 + 32 72 2.824 + 32 74 3.642 + 32 76 3.301 + 35 211 4.434 + 43 92 4.160 + 43 99 4.482 + 43 102 4.443 + 44 92 3.038 + 44 94 3.923 + 44 96 3.573 + 44 99 3.395 + 44 102 3.645 + 40 208 4.155 + 40 211 4.207 + 41 205 4.072 + 41 208 3.173 + 41 211 3.051 + 41 213 4.273 + 58 114 4.116 + 59 114 3.010 + 59 116 3.902 + 59 118 3.697 + 70 124 4.106 + 70 134 4.373 + 71 124 2.970 + 71 126 3.868 + 71 128 3.624 + 71 134 3.311 + 90 140 4.108 + 90 154 4.463 + 90 183 4.167 + 91 140 2.920 + 91 142 3.771 + 91 158 4.376 + 91 144 3.628 + 91 147 4.063 + 91 154 3.587 + 91 160 4.139 + 91 183 3.971 + 76 205 4.307 + 79 154 4.047 + 79 205 3.880 + 80 144 4.464 + 80 154 3.924 + 80 183 3.505 + 80 196 3.961 + 80 197 4.286 + 80 198 3.678 + 80 200 3.908 + 80 202 4.448 + 80 205 3.961 + 86 154 3.918 + 86 156 4.030 + 86 205 4.147 + 86 213 4.060 + 86 214 3.805 + 86 217 4.370 + 82 144 4.138 + 82 147 4.227 + 82 154 3.736 + 82 183 3.749 + 82 196 3.771 + 82 197 3.714 + 82 198 3.630 + 82 200 3.556 + 82 202 4.370 + 82 205 4.320 + 82 247 3.893 + 82 250 4.227 + 82 251 4.437 + 88 154 3.703 + 88 156 3.728 + 88 205 4.480 + 88 214 4.014 +
84 147 4.277 + 84 154 3.622 + 84 156 4.014 + 84 198 4.489 + 84 200 3.972 + 84 247 3.777 + 84 250 3.675 + 84 251 3.946 + 84 257 4.082 + 92 183 3.914 + 94 160 4.358 + 94 167 4.215 + 94 182 4.270 + 94 183 3.398 + 112 160 3.747 + 112 162 4.234 + 112 167 4.455 + 112 183 4.430 + 113 160 2.755 + 113 162 3.092 + 113 165 3.808 + 113 167 3.764 + 113 183 4.450 + 96 183 3.972 + 99 182 3.950 + 99 183 3.550 + 99 184 4.115 + 99 186 3.967 + 99 192 4.245 + 102 192 4.064 + 138 522 4.457 + 139 516 4.211 + 139 519 3.803 + 139 522 3.368 + 139 525 3.913 + 130 513 3.897 + 130 516 3.881 + 130 519 4.332 + 130 547 3.800 + 134 547 4.443 + 147 251 4.472 + 148 251 4.408 + 148 253 4.137 + 148 438 4.070 + 148 516 4.010 + 150 253 4.167 + 150 451 4.006 + 150 452 3.798 + 150 438 4.056 + 150 453 4.261 + 150 455 4.387 + 150 513 4.178 + 150 516 3.754 + 156 547 4.152 + 152 547 4.025 + 171 237 3.952 + 174 237 4.329 + 174 243 4.145 + 174 245 4.432 + 174 247 4.221 + 177 241 4.317 + 177 243 3.777 + 177 245 3.658 + 177 247 3.804 + 177 250 4.443 + 177 251 4.063 + 177 296 3.767 + 178 296 4.133 + 178 443 3.986 + 196 243 4.132 + 197 243 2.955 + 197 245 3.895 + 197 247 3.586 + 220 263 4.154 + 221 263 3.007 + 221 265 3.901 + 221 267 3.608 + 231 273 4.388 + 232 273 3.289 + 232 275 4.209 + 232 277 3.901 + 242 287 3.421 + 242 289 4.448 + 242 291 4.453 + 242 294 3.827 + 261 394 4.411 + 262 389 3.459 + 262 392 3.622 + 262 393 4.469 + 262 394 3.440 + 251 443 4.139 + 257 389 4.030 + 257 392 4.231 + 257 393 4.247 + 253 384 4.003 + 253 397 4.403 + 253 398 3.918 + 253 386 4.318 + 253 438 4.069 + 253 443 3.991 + 259 384 4.287 + 259 386 3.710 + 259 389 3.682 + 259 392 3.979 + 259 393 3.777 + 259 466 4.157 + 255 384 3.692 + 255 397 3.986 + 255 398 3.588 + 255 386 3.606 + 255 389 3.958 + 255 460 4.111 + 255 466 4.008 + 265 392 4.499 + 265 394 3.917 + 271 394 3.893 + 272 394 3.560 + 287 394 4.377 + 304 389 4.422 + 304 394 3.904 + 305 389 3.915 + 305 392 3.817 + 305 394 2.859 + 291 367 4.401 + 291 389 4.079 + 296 370 4.098 + 296 
443 3.702 + 318 382 4.204 + 318 389 4.284 + 318 394 4.254 + 319 382 3.019 + 319 384 4.057 + 319 386 3.959 + 319 389 3.578 + 319 392 4.253 + 319 394 4.214 + 337 399 4.120 + 337 406 4.356 + 338 399 2.921 + 338 401 3.855 + 338 403 3.631 + 338 406 3.371 + 327 406 4.186 + 329 406 3.867 + 329 409 4.212 + 329 410 3.829 + 361 416 4.055 + 361 420 4.345 + 362 416 2.998 + 362 418 3.748 + 362 420 3.302 + 380 434 4.077 + 380 441 4.476 + 381 434 2.908 + 381 436 3.811 + 381 438 3.555 + 381 441 3.395 + 381 443 3.771 + 370 443 4.362 + 373 441 4.463 + 376 441 4.157 + 376 443 4.443 + 376 447 4.229 + 397 453 4.161 + 397 460 4.377 + 398 453 3.002 + 398 455 3.936 + 398 457 3.710 + 398 460 3.370 + 398 463 3.925 + 398 466 4.420 + 401 488 4.395 + 414 472 4.221 + 414 488 4.054 + 415 472 3.077 + 415 474 4.016 + 415 476 3.703 + 415 479 3.463 + 415 482 4.063 + 415 485 4.487 + 415 488 3.641 + 403 488 3.739 + 411 488 3.852 + 432 494 4.110 + 432 498 4.370 + 433 494 2.932 + 433 496 3.694 + 433 498 3.326 + 451 509 4.142 + 451 516 4.409 + 452 509 2.978 + 452 511 3.838 + 452 513 3.541 + 452 516 3.327 + 455 556 4.086 + 470 531 4.197 + 470 538 4.117 + 470 555 4.471 + 470 556 3.691 + 471 531 3.064 + 471 533 3.699 + 471 536 3.811 + 471 538 3.000 + 471 540 3.877 + 471 555 3.862 + 471 556 3.220 + 471 542 4.353 + 457 555 4.381 + 457 556 3.736 + 463 561 4.488 diff --git a/testsuite/MDAnalysisTests/data/contacts/villin_folded.gro.bz2 b/testsuite/MDAnalysisTests/data/contacts/villin_folded.gro.bz2 new file mode 100644 index 00000000000..168bd8bbede Binary files /dev/null and b/testsuite/MDAnalysisTests/data/contacts/villin_folded.gro.bz2 differ diff --git a/testsuite/MDAnalysisTests/data/contacts/villin_unfolded.gro.bz2 b/testsuite/MDAnalysisTests/data/contacts/villin_unfolded.gro.bz2 new file mode 100644 index 00000000000..272acffd5db Binary files /dev/null and b/testsuite/MDAnalysisTests/data/contacts/villin_unfolded.gro.bz2 differ diff --git a/testsuite/MDAnalysisTests/datafiles.py 
b/testsuite/MDAnalysisTests/datafiles.py index e5f415f04ab..220d1b6c7a9 100644 --- a/testsuite/MDAnalysisTests/datafiles.py +++ b/testsuite/MDAnalysisTests/datafiles.py @@ -85,6 +85,7 @@ "merge_protein", "merge_ligand", "merge_water", "mol2_molecules", "mol2_molecule", "mol2_broken_molecule", "capping_input", "capping_output", "capping_ace", "capping_nma", + "contacts_villin_folded", "contacts_villin_unfolded", "contacts_file", "LAMMPSdata", "trz4data", "LAMMPSdata_mini", "LAMMPSdata2", "LAMMPSdcd2", "LAMMPScnt", "LAMMPScnt2", # triclinic box @@ -260,6 +261,10 @@ capping_ace = resource_filename(__name__, "data/capping/ace.pdb") capping_nma = resource_filename(__name__, "data/capping/nma.pdb") +contacts_villin_folded = resource_filename(__name__, "data/contacts/villin_folded.gro.bz2") +contacts_villin_unfolded = resource_filename(__name__, "data/contacts/villin_unfolded.gro.bz2") +contacts_file = resource_filename(__name__, "data/contacts/2F4K_qlist5_remap.dat") + trz4data = resource_filename(__name__, "data/lammps/datatest.trz") LAMMPSdata = resource_filename(__name__, "data/lammps/datatest.data") LAMMPSdata_mini = resource_filename(__name__, "data/lammps/mini.data") diff --git a/testsuite/setup.py b/testsuite/setup.py index b050c464b6e..b4444cb41d2 100755 --- a/testsuite/setup.py +++ b/testsuite/setup.py @@ -108,6 +108,7 @@ 'data/merge/2zmm/*.pdb', 'data/*.trz', 'data/mol2/*.mol2', + 'data/contacts/*.gro.bz2', 'data/contacts/*.dat', 'data/capping/*.gro', 'data/capping/*.pdb', 'data/lammps/*.data', 'data/lammps/*.data.bz2', 'data/lammps/*.data2',