diff --git a/package/CHANGELOG b/package/CHANGELOG index b6433cfce90..658330e50e6 100644 --- a/package/CHANGELOG +++ b/package/CHANGELOG @@ -39,7 +39,9 @@ Changes * Added protected variable _frame_index to to keep track of frame iteration number in AnalysisBase * Added new AlignTraj class for alignment that follows the new analysis API known - as Bauhaus style. + as Bauhaus style. (Issue #845) + * Added new diffusionmap module for dimension reduction that follows the + Bauhaus API. (Issue #857) Deprecations (Issue #599) * Use of rms_fit_trj deprecated in favor of AlignTraj class (Issue #845) diff --git a/package/MDAnalysis/analysis/diffusionmap.py b/package/MDAnalysis/analysis/diffusionmap.py new file mode 100644 index 00000000000..a4337b50fcc --- /dev/null +++ b/package/MDAnalysis/analysis/diffusionmap.py @@ -0,0 +1,363 @@ +# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*- +# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 +# +# MDAnalysis --- http://www.MDAnalysis.org +# Copyright (c) 2006-2015 Naveen Michaud-Agrawal, Elizabeth J. Denning, Oliver Beckstein +# and contributors (see AUTHORS for the full list) +# +# Released under the GNU Public Licence, v2 or any higher version +# +# Please cite your use of MDAnalysis in published work: +# +# N. Michaud-Agrawal, E. J. Denning, T. B. Woolf, and O. Beckstein. +# MDAnalysis: A Toolkit for the Analysis of Molecular Dynamics Simulations. +# J. Comput. Chem. 32 (2011), 2319--2327, doi:10.1002/jcc.21787 +# + +""" +Diffusion map --- :mod:`MDAnalysis.analysis.diffusionmap` +===================================================================== + +:Authors: Eugen Hruska, John Detlefs +:Year: 2016 +:Copyright: GNU Public License v2 + +This module contains the non-linear dimension reduction method diffusion map. +The eigenvectors of a diffusion matrix represent the 'collective coordinates' +of a molecule; the largest eigenvalues are the more dominant collective +coordinates. Assigning phyiscal meaning to the 'collective coordinates' is a +fundamentally difficult problem. The time complexity of the diffusion map is +:math:`O(N^3)`, where N is the number of frames in the trajectory, and the in-memory +storage complexity is :math:`O(N^2)`. Instead of a single trajectory a sample of +protein structures can be used. The sample should be equiblibrated, at least +locally. The order of the sampled structures in the trajectory is irrelevant. + +The :ref:`Diffusion-Map-tutorial` shows how to use diffusion map for dimension +reduction. + +More details about diffusion maps are in [deLaPorte1]_, [Lafon1]_ , +[Ferguson1]_, and [Clementi1]_. + +.. _Diffusion-Map-tutorial: + +Diffusion Map tutorial +---------------------- + +The example uses files provided as part of the MDAnalysis test suite +(in the variables :data:`~MDAnalysis.tests.datafiles.PSF` and +:data:`~MDAnalysis.tests.datafiles.DCD`). This tutorial shows how to use the +Diffusion Map class. + +First load all modules and test data :: + + >>> import MDAnalysis as mda + >>> import MDAnalysis.analysis.diffusionmap as diffusionmap + >>> from MDAnalysis.tests.datafiles import PSF, DCD + +Given a universe or atom group, we can create and eigenvalue decompose +the Diffusion Matrix from that trajectory using :class:`DiffusionMap`:: and get +the corresponding eigenvalues and eigenvectors. + + >>> u = mda.Universe(PSF,DCD) + +We leave determination of the appropriate scale parameter epsilon to the user, +[Clementi1]_ uses a complex method involving the k-nearest-neighbors of a +trajectory frame, whereas others simple use a trial-and-error approach with +a constant epsilon. Currently, the constant epsilon method is implemented +by MDAnalysis. + + >>> dmap = diffusionmap.DiffusionMap(u, select='backbone', epsilon=2) + >>> dmap.run() + +From here we can perform an embedding onto the k dominant eigenvectors. The +non-linearity of the map means there is no explicit relationship between the +lower dimensional space and our original trajectory. However, this is an +isometry (distance preserving map), which means that points close in the lower +dimensional space are close in the higher-dimensional space and vice versa. +In order to embed into the most relevant low-dimensional space, there should +exist some number of dominant eigenvectors, whose corresponding eigenvalues +diminish at a constant rate until falling off, this is referred to as a +spectral gap and should be somewhat apparent for a system at equilibrium with a +high number of frames. + + >>> # first cell of a jupyter notebook should contain: %matplotlib inline + >>> import matplotlib.pyplot as plt + >>> f, ax = plt.subplots() + >>> upper_limit = # some reasonably high number less than the n_eigenvectors + >>> ax.plot(dmap.eigenvalues[:upper_limit]) + >>> ax.set(xlabel ='eigenvalue index', ylabel='eigenvalue') + >>> plt.tight_layout() + +From here we can transform into the diffusion space + + >>> num_eigenvectors = # some number less than the number of frames after + >>> # inspecting for the spectral gap + >>> fit = dmap.transform(num_eigenvectors, time=1) + +It can be difficult to interpret the data, and is left as a task +for the user. The `diffusion distance` between frames i and j is best +approximated by the euclidean distance between rows i and j of +self.diffusion_space. + + +.. _Distance-Matrix-tutorial: + +Distance Matrix tutorial +------------------------ + +Often a, a custom distance matrix could be useful for local +epsilon determination or other manipulations on the diffusion +map method. The :class:`DistanceMatrix` exists in +:mod:`~MDAnalysis.analysis.diffusionmap` and can be passed +as an initialization argument for :class:`DiffusionMap`. + >>> import MDAnalysis as mda + >>> import MDAnalysis.analysis.diffusionmap as diffusionmap + >>> from MDAnalysis.tests.datafiles import PSF, DCD + +Now create the distance matrix and pass it as an argument to +:class:`DiffusionMap`. + + >>> u = mda.Universe(PSF,DCD) + >>> dist_matrix = diffusionmap.DistanceMatrix(u, select='all') + >>> dist_matrix.run() + >>> dmap = diffusionmap.DiffusionMap(dist_matrix) + >>> dmap.run() + +Classes +------- + +.. autoclass:: DiffusionMap +.. autoclass:: DistanceMatrix + +References +---------- + +If you use this Dimension Reduction method in a publication, please +cite: + +.. [Lafon1] +Coifman, Ronald R., Lafon, Stephane Diffusion maps. Appl. Comput. Harmon. +Anal. 21, 5–30 (2006). + +For more information +-------------------- + +.. [deLaPorte1] +J. de la Porte, B. M. Herbst, W. Hereman, S. J. van der Walt. +An Introduction to Diffusion Maps. + +.. [Clementi1] +Rohrdanz, M. A, Zheng, W, Maggioni, M, & Clementi, C. +Determination of reaction coordinates via locally scaled +diffusion map. J. Chem. Phys. 134, 124116 (2011). + +.. [Ferguson1] +Ferguson, A. L.; Panagiotopoulos, A. Z.; Kevrekidis, I. G. +Debenedetti, P. G. Nonlinear dimensionality reduction in molecular simulation: +The diffusion map approach Chem. Phys. Lett. 509, 1−11 (2011) + +.. If you choose the default metric, this module uses the fast QCP algorithm +[Theobald2005]_ to calculate the root mean square distance (RMSD) between +two coordinate sets (as implemented +in :func:`MDAnalysis.lib.qcprot.CalcRMSDRotationalMatrix`). +When using this module in published work please cite [Theobald2005]_. + + +""" +from six.moves import range +import logging +import warnings + +from MDAnalysis.core.AtomGroup import Universe +import numpy as np + +from .rms import rmsd +from .base import AnalysisBase + +logger = logging.getLogger("MDAnalysis.analysis.diffusionmap") + + +class DistanceMatrix(AnalysisBase): + """Calculate the pairwise distance between each frame in a trajectory + using a given metric + + A distance matrix can be initialized on its own and used as an + initialization argument in :class:`DiffusionMap`. Refer to the + :ref:`Distance-Matrix-tutorial` for a demonstration. + + Attributes + ---------- + atoms : AtomGroup + Selected atoms in trajectory subject to dimension reduction + dist_matrix : array, (n_frames, n_frames) + Array of all possible ij metric distances between frames in trajectory. + This matrix is symmetric with zeros on the diagonal. + + Methods + ------- + save(filename) + Save the `dist_matrix` to a given filename + """ + def __init__(self, u, select='all', metric=rmsd, cutoff=1E0-5, + weights=None, start=None, stop=None, step=None): + """ + Parameters + ---------- + u : universe `~MDAnalysis.core.AtomGroup.Universe` + The MD Trajectory for dimension reduction, remember that + computational cost of eigenvalue decomposition + scales at O(N^3) where N is the number of frames. + Cost can be reduced by increasing step interval or specifying a + start and stop. + select: str, optional + Any valid selection string for + :meth:`~MDAnalysis.core.AtomGroup.AtomGroup.select_atoms` + This selection of atoms is used to calculate the RMSD between + different frames. Water should be excluded. + metric : function, optional + Maps two numpy arrays to a float, is positive definite and + symmetric. The API for a metric requires that the arrays must have + equal length, and that the function should have weights as an + optional argument. Weights give each index value its own weight for + the metric calculation over the entire arrays. Default: metric is + set to rms.rmsd(). + cutoff : float, optional + Specify a given cutoff for metric values to be considered equal, + Default: 1EO-5 + weights : array, optional + Weights to be given to coordinates for metric calculation + start : int, optional + First frame of trajectory to analyse, Default: 0 + stop : int, optional + Last frame of trajectory to analyse, Default: -1 + step : int, optional + Step between frames to analyse, Default: 1 + """ + self._u = u + traj = self._u.trajectory + self.atoms = self._u.select_atoms(select) + self._metric = metric + self._cutoff = cutoff + self._weights = weights + self._calculated = False + # remember that this must be called before referencing self.n_frames + self._setup_frames(traj, start, stop, step) + + def _prepare(self): + self.dist_matrix = np.zeros((self.n_frames, self.n_frames)) + + def _single_frame(self): + iframe = self._ts.frame + i_ref = self.atoms.positions + # diagonal entries need not be calculated due to metric(x,x) == 0 in + # theory, _ts not updated properly. Possible savings by setting a + # cutoff for significant decimal places to sparsify matrix + for j, ts in enumerate(self._u.trajectory[iframe:self.stop:self.step]): + self._ts = ts + j_ref = self.atoms.positions + dist = self._metric(i_ref, j_ref, weights=self._weights) + self.dist_matrix[self._frame_index, j+self._frame_index] = ( + dist if dist > self._cutoff else 0) + self.dist_matrix[j+self._frame_index, self._frame_index] = ( + self.dist_matrix[self._frame_index, j+self._frame_index]) + self._ts = self._u.trajectory[iframe] + + def _conclude(self): + self._calculated = True + + def save(self, filename): + np.save(filename, self.dist_matrix) + logger.info("Wrote the distance-squared matrix to file %r", filename) + + +class DiffusionMap(object): + """Non-linear dimension reduction method + + Dimension reduction with diffusion mapping of selected structures in a + trajectory. + + Attributes + ---------- + eigenvalues: array (n_frames,) + Eigenvalues of the diffusion map + + Methods + ------- + run() + Constructs an anisotropic diffusion kernel and performs eigenvalue + decomposition on it. + transform(n_eigenvectors, time) + Perform an embedding of a frame into the eigenvectors representing + the collective coordinates. + """ + + def __init__(self, u, epsilon=1, **kwargs): + """ + Parameters + ------------- + u : MDAnalysis Universe or DistanceMatrix object + Can be a Universe, in which case one must supply kwargs for the + initialization of a DistanceMatrix. Otherwise, this can be a + DistanceMatrix already initialized. Either way, this will be made + into a diffusion kernel. + epsilon : Float + Specifies the method used for the choice of scale parameter in the + diffusion map. More information in [Lafon1]_, [Ferguson1]_ and + [Clementi1]_, Default: 1. + **kwargs + Parameters to be passed for the initialization of a + :class:`DistanceMatrix`. + """ + if isinstance(u, Universe): + self._dist_matrix = DistanceMatrix(u, **kwargs) + elif isinstance(u, DistanceMatrix): + self._dist_matrix = u + else: + raise ValueError("U is not a Universe or DistanceMatrix and" + " so the DiffusionMap has no data to work with.") + self._epsilon = epsilon + # important for transform function and length of .run() method + self._n_frames = self._dist_matrix.n_frames + if self._n_frames > 5000: + warnings.warn("The distance matrix is very large, and can " + "be very slow to compute. Consider picking a larger " + "step size in distance matrix initialization.") + + + def run(self): + """ Create and decompose the diffusion matrix in preparation + for a diffusion map.""" + # run only if distance matrix not already calculated + if not self._dist_matrix._calculated: + self._dist_matrix.run() + self._scaled_matrix = (self._dist_matrix.dist_matrix ** 2 / + self._epsilon) + # take negative exponent of scaled matrix to create Isotropic kernel + self._kernel = np.exp(-self._scaled_matrix) + D_inv = np.diag(1 / self._kernel.sum(1)) + self._diff = np.dot(D_inv, self._kernel) + self._eigenvals, self._eigenvectors = np.linalg.eig(self._diff) + sort_idx = np.argsort(self._eigenvals)[::-1] + self.eigenvalues = self._eigenvals[sort_idx] + self._eigenvectors = self._eigenvectors[sort_idx] + self._calculated = True + return self + + def transform(self, n_eigenvectors, time): + """ Embeds a trajectory via the diffusion map + + Parameters + --------- + n_eigenvectors : int + The number of dominant eigenvectors to be used for + diffusion mapping + time : float + Exponent that eigenvalues are raised to for embedding, for large + values, more dominant eigenvectors determine diffusion distance. + Return + ------ + diffusion_space : array (n_frames, n_eigenvectors) + The diffusion map embedding as defined by [Ferguson1]_. + """ + return (self._eigenvectors[1:n_eigenvectors+1,].T * + (self.eigenvalues[1:n_eigenvectors+1]**time)) diff --git a/package/doc/sphinx/source/documentation_pages/analysis/diffusionmap.rst b/package/doc/sphinx/source/documentation_pages/analysis/diffusionmap.rst new file mode 100644 index 00000000000..1eefec1e079 --- /dev/null +++ b/package/doc/sphinx/source/documentation_pages/analysis/diffusionmap.rst @@ -0,0 +1 @@ +.. automodule:: MDAnalysis.analysis.diffusionmap diff --git a/package/doc/sphinx/source/documentation_pages/analysis_modules.rst b/package/doc/sphinx/source/documentation_pages/analysis_modules.rst index df3f66c98ad..3d6a3ec12b0 100644 --- a/package/doc/sphinx/source/documentation_pages/analysis_modules.rst +++ b/package/doc/sphinx/source/documentation_pages/analysis_modules.rst @@ -19,7 +19,7 @@ has to import them from :mod:`MDAnalysis.analysis`, for instance :: import MDAnalysis.analysis.align -.. Note:: +.. Note:: Some of the modules require additional Python packages such as :mod:`scipy` from the SciPy_ package or :mod:`networkx` from @@ -100,7 +100,7 @@ Structure Volumetric analysis -=================== +=================== .. toctree:: :maxdepth: 1 @@ -109,4 +109,9 @@ Volumetric analysis analysis/lineardensity analysis/waterdynamics +Dimension Reduction +=================== +.. toctree:: + :maxdepth: 1 + analysis/diffusionmap diff --git a/testsuite/MDAnalysisTests/analysis/test_diffusionmap.py b/testsuite/MDAnalysisTests/analysis/test_diffusionmap.py new file mode 100644 index 00000000000..0b5897bd27f --- /dev/null +++ b/testsuite/MDAnalysisTests/analysis/test_diffusionmap.py @@ -0,0 +1,69 @@ +# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*- +# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 fileencoding=utf-8 +# +# MDAnalysis --- http://www.MDAnalysis.org +# Copyright (c) 2006-2015 Naveen Michaud-Agrawal, Elizabeth J. Denning, Oliver +# Beckstein and contributors (see AUTHORS for the full list) +# +# Released under the GNU Public Licence, v2 or any higher version +# +# Please cite your use of MDAnalysis in published work: +# +# N. Michaud-Agrawal, E. J. Denning, T. B. Woolf, and O. Beckstein. +# MDAnalysis: A Toolkit for the Analysis of Molecular Dynamics Simulations. +# J. Comput. Chem. 32 (2011), 2319--2327, doi:10.1002/jcc.21787 +# +from __future__ import print_function +import numpy as np +import MDAnalysis +import MDAnalysis.analysis.diffusionmap as diffusionmap +from numpy.testing import (assert_almost_equal, assert_equal, + assert_array_almost_equal,raises) + + +from MDAnalysisTests.datafiles import PDB, XTC + + +class TestDiffusionmap(object): + def setUp(self): + self.u = MDAnalysis.Universe(PDB, XTC) + self.dist = diffusionmap.DistanceMatrix(self.u, select='backbone') + self.dmap = diffusionmap.DiffusionMap(self.dist) + self.dmap.run() + self.eigvals = self.dmap.eigenvalues + self.eigvects = self.dmap._eigenvectors + + def test_eg(self): + # number of frames is trajectory is now 10 vs. 98 + assert_equal(self.eigvals.shape, (self.dist.n_frames, )) + # makes no sense to test values here, no physical meaning + + def test_dist_weights(self): + backbone = self.u.select_atoms('backbone') + weights_atoms = np.ones(len(backbone.atoms)) + self.dist = diffusionmap.DistanceMatrix(self.u, select='backbone', + weights=weights_atoms, + step=3) + self.dist.run() + self.dmap = diffusionmap.DiffusionMap(self.dist) + self.dmap.run() + assert_array_almost_equal(self.dmap.eigenvalues, [1,1,1,1], 4) + assert_array_almost_equal(self.dmap._eigenvectors, + ([[ 0, 0, 1, 0], + [ 0, 0, 0, 1], + [ -.707,-.707, 0, 0], + [ .707,-.707, 0, 0]]) ,2) + + def test_different_steps(self): + self.dmap = diffusionmap.DiffusionMap(self.u, select='backbone', step=3) + self.dmap.run() + assert_equal(self.dmap._eigenvectors.shape, (4,4)) + + def test_transform(self): + self.n_eigenvectors = 4 + self.dmap = diffusionmap.DiffusionMap(self.u) + self.dmap.run() + diffusion_space = self.dmap.transform(self.n_eigenvectors,1) + assert_equal(diffusion_space.shape, + (self.eigvects.shape[0], + self.n_eigenvectors))