From b299d891789d7668b6ac5fde90118a41a19cd9b2 Mon Sep 17 00:00:00 2001 From: John Detlefs Date: Thu, 7 Jul 2016 14:22:10 -0700 Subject: [PATCH] Changed n_frames warning to 5000, eliminated reference to anisotropic kernel in docs fixed sphinx linking, documented metric API. Changes to documentation and some style fixes. More work on docs Change to license v2 Fixed tests Docs fixes, protected eigenvectors --- package/MDAnalysis/analysis/diffusionmap.py | 163 +++++++++++------- .../documentation_pages/analysis_modules.rst | 2 +- .../analysis/test_diffusionmap.py | 8 +- 3 files changed, 109 insertions(+), 64 deletions(-) diff --git a/package/MDAnalysis/analysis/diffusionmap.py b/package/MDAnalysis/analysis/diffusionmap.py index 52e747041f0..79e1dab2ebd 100644 --- a/package/MDAnalysis/analysis/diffusionmap.py +++ b/package/MDAnalysis/analysis/diffusionmap.py @@ -20,33 +20,28 @@ :Authors: Eugen Hruska, John Detlefs :Year: 2016 -:Copyright: GNU Public License v3 - -The module contains the non-linear dimension reduction method diffusion map. -The diffusion map provides an estimate of the slowest collective -coordinates for a trajectory. This non-linear dimension reduction method -assumes that the trajectory is long enough to represent a probability -distribution of a protein close to the equilibrium. Furthermore, the diffusion -map assumes that the diffusion coefficients associated with the dynamical -motion of molecules in the system are constant. The eigenvectors with -the largest eigenvalues are the more dominant collective coordinates. Assigning -phyiscal meaning of the 'collective coordinates' is a fundamentally difficult -problem. The time complexity of the diffusion map is O(N^3), where N is the -number of frames in the trajectory, and the in-memory storage complexity is -O(N^2). Instead of a single trajectory a sample of protein structures -can be used. The sample should be equiblibrated, at least locally. 
The order of -the sampled structures in the trajectory is irrelevant. +:Copyright: GNU Public License v2 + +This module contains the non-linear dimension reduction method diffusion map. +The eigenvectors of a diffusion matrix represent the 'collective coordinates' +of a molecule; the largest eigenvalues are the more dominant collective +coordinates. Assigning physical meaning to the 'collective coordinates' is a +fundamentally difficult problem. The time complexity of the diffusion map is +:math:`O(N^3)`, where N is the number of frames in the trajectory, and the in-memory +storage complexity is :math:`O(N^2)`. Instead of a single trajectory a sample of +protein structures can be used. The sample should be equilibrated, at least +locally. The order of the sampled structures in the trajectory is irrelevant. The :ref:`Diffusion-Map-tutorial` shows how to use diffusion map for dimension reduction. -More details about diffusion maps are in [Lafon1]_ , [Ferguson1]_, and -[Clementi1]_. +More details about diffusion maps are in [deLaPorte1]_, [Lafon1]_ , +[Ferguson1]_, and [Clementi1]_. .. _Diffusion-Map-tutorial: Diffusion Map tutorial --------------------- +---------------------- The example uses files provided as part of the MDAnalysis test suite (in the variables :data:`~MDAnalysis.tests.datafiles.PSF` and @@ -55,7 +50,7 @@ First load all modules and test data :: - >>> import MDAnalysis + >>> import MDAnalysis as mda >>> import numpy as np >>> import MDAnalysis.analysis.diffusionmap as diffusionmap >>> from MDAnalysis.tests.datafiles import PSF, DCD @@ -64,7 +59,7 @@ the Diffusion Matrix from that trajectory using :class:`DiffusionMap`:: and get the corresponding eigenvalues and eigenvectors. 
- >>> u = MDAnalysis.Universe(PSF,DCD) + >>> u = mda.Universe(PSF,DCD) We leave determination of the appropriate scale parameter epsilon to the user, [Clementi1]_ uses a complex method involving the k-nearest-neighbors of a @@ -75,8 +70,7 @@ >>> dmap = diffusionmap.DiffusionMap(u, select='backbone', epsilon=2) >>> dmap.run() -From here we can perform an embedding onto the k dominant eigenvectors. This -is similar to the idea of a transform in Principal Component Analysis, but the +From here we can perform an embedding onto the k dominant eigenvectors. The non-linearity of the map means there is no explicit relationship between the lower dimensional space and our original trajectory. However, this is an isometry (distance preserving map), which means that points close in the lower @@ -87,40 +81,83 @@ spectral gap and should be somewhat apparent for a system at equilibrium with a high number of frames. - >>> num_eigenvectors = # some number less than the number of frames + >>> # first cell of a jupyter notebook should contain: %matplotlib inline + >>> import matplotlib.pyplot as plt + >>> f, ax = plt.subplots() + >>> upper_limit = # some reasonably high number less than the n_eigenvectors + >>> ax.plot(dmap.eigenvalues[:upper_limit]) + >>> ax.set(xlabel ='eigenvalue index', ylabel='eigenvalue') + >>> plt.tight_layout() + +From here we can transform into the diffusion space + + >>> num_eigenvectors = # some number less than the number of frames after + >>> # inspecting for the spectral gap >>> fit = dmap.transform(num_eigenvectors, time=1) -From here it can be difficult to interpret the data, and is left as a task +It can be difficult to interpret the data, and is left as a task for the user. The `diffusion distance` between frames i and j is best approximated by the euclidean distance between rows i and j of -self.diffusion_space. 
A Jupyter [notebook](https://github.com/jdetle/dimension_reduction/blob/master/diffusionMaps/Diffusion_Map_Analysis_of_ADK.ipynb) -providing an analysis of protein pening and closing has been provided. +self.diffusion_space. + + +.. _Distance-Matrix-tutorial: + +Distance Matrix tutorial +------------------------ + +Often, a custom distance matrix could be useful for local +epsilon determination or other manipulations on the diffusion +map method. The :class:`DistanceMatrix` exists in +:mod:`~MDAnalysis.analysis.diffusionmap` and can be passed +as an initialization argument for :class:`DiffusionMap`. + >>> import MDAnalysis as mda + >>> import numpy as np + >>> import MDAnalysis.analysis.diffusionmap as diffusionmap + >>> from MDAnalysis.tests.datafiles import PSF, DCD + +Now create the distance matrix and pass it as an argument to +:class:`DiffusionMap`. + + >>> u = mda.Universe(PSF,DCD) + >>> dist_matrix = diffusionmap.DistanceMatrix(u, select='all') + >>> dist_matrix.run() + >>> dmap = diffusionmap.DiffusionMap(dist_matrix) + >>> dmap.run() Classes ------- .. autoclass:: DiffusionMap -.. autoclass:: DistMatrix +.. autoclass:: DistanceMatrix References ---------- +---------- If you use this Dimension Reduction method in a publication, please -reference: -..[Ferguson1] Ferguson, A. L.; Panagiotopoulos, A. Z.; Kevrekidis, I. G. -Debenedetti, P. G. Nonlinear dimensionality reduction in molecular -simulation: The diffusion map approach Chem. Phys. Lett. 509, 1−11 (2011) -..[deLaPorte1] J. de la Porte, B. M. Herbst, W. Hereman, S. J. van der Walt. An Introduction to Diffusion Maps. -..[Lafon1] Coifman, Ronald R., Lafon, Stephane Diffusion maps. -Appl. Comput. Harmon. Anal. 21, 5–30 (2006). 
-..[Lafon2] Boaz Nadler, Stéphane Lafon, Ronald R. Coifman, Ioannis G. -Kevrekidis. Diffusion maps, spectral clustering and reaction coordinates -of dynamical systems. Appl. Comput. Harmon. Anal. 21 (2006) 113–127 -..[Clementi1] Rohrdanz, M. A, Zheng, W, Maggioni, M, & Clementi, C. + +.. [Clementi1] +Rohrdanz, M. A, Zheng, W, Maggioni, M, & Clementi, C. Determination of reaction coordinates via locally scaled diffusion map. J. Chem. Phys. 134, 124116 (2011). +.. [Ferguson1] +Ferguson, A. L.; Panagiotopoulos, A. Z.; Kevrekidis, I. G. +Debenedetti, P. G. Nonlinear dimensionality reduction in molecular simulation: +The diffusion map approach Chem. Phys. Lett. 509, 1−11 (2011) + .. If you choose the default metric, this module uses the fast QCP algorithm [Theobald2005]_ to calculate the root mean square distance (RMSD) between two coordinate sets (as implemented @@ -133,7 +170,7 @@ import logging import warnings -import MDAnalysis as mda +from MDAnalysis.core.AtomGroup import Universe import numpy as np from .rms import rmsd @@ -143,8 +180,12 @@ class DistanceMatrix(AnalysisBase): - """ Calculate the pairwise distance between each frame in a trajectory using - a given metric + """Calculate the pairwise distance between each frame in a trajectory + using a given metric + + A distance matrix can be initialized on its own and used as an + initialization argument in :class:`DiffusionMap`. Refer to the + :ref:`Distance-Matrix-tutorial` for a demonstration. Attributes ---------- @@ -177,7 +218,11 @@ def __init__(self, u, select='all', metric=rmsd, cutoff=1E0-5, different frames. Water should be excluded. metric : function, optional Maps two numpy arrays to a float, is positive definite and - symmetric, Default: metric is set to rms.rmsd(). + symmetric. The API for a metric requires that the arrays must have + equal length, and that the function should have weights as an + optional argument. 
Weights give each index value its own weight for + the metric calculation over the entire arrays. Default: metric is + set to rms.rmsd(). cutoff : float, optional Specify a given cutoff for metric values to be considered equal, Default: 1EO-5 @@ -190,7 +235,6 @@ def __init__(self, u, select='all', metric=rmsd, cutoff=1E0-5, step : int, optional Step between frames to analyse, Default: 1 """ - self._u = u traj = self._u.trajectory self.atoms = self._u.select_atoms(select) @@ -198,11 +242,11 @@ def __init__(self, u, select='all', metric=rmsd, cutoff=1E0-5, self._cutoff = cutoff self._weights = weights self._calculated = False - # remember that this must be called before referencing self.nframes + # remember that this must be called before referencing self.n_frames self._setup_frames(traj, start, stop, step) def _prepare(self): - self.dist_matrix = np.zeros((self.nframes, self.nframes)) + self.dist_matrix = np.zeros((self.n_frames, self.n_frames)) def _single_frame(self): iframe = self._ts.frame @@ -238,11 +282,6 @@ class DiffusionMap(object): ---------- eigenvalues: array Eigenvalues of the diffusion map - eigenvectors: array - Eigenvectors of the diffusion map - diffusion_space : array - After calling `transform(n_eigenvectors)` the diffusion map embedding - into the lower dimensional diffusion space will exist here. Methods ------- @@ -265,9 +304,13 @@ def __init__(self, u, epsilon=1, **kwargs): into a diffusion kernel. epsilon : Float Specifies the method used for the choice of scale parameter in the - diffusion map. More information in [1], [2] and [3], Default: 1. - """ - if isinstance(u, mda.Universe): + diffusion map. More information in [Lafon1]_, [Ferguson1]_ and + [Clementi1]_, Default: 1. + **kwargs + Parameters to be passed for the initialization of a + :class:`DistanceMatrix`. 
+ """ + if isinstance(u, Universe): self._dist_matrix = DistanceMatrix(u, **kwargs) elif isinstance(u, DistanceMatrix): self._dist_matrix = u @@ -276,14 +319,16 @@ def __init__(self, u, epsilon=1, **kwargs): " so the DiffusionMap has no data to work with.") self._epsilon = epsilon # important for transform function and length of .run() method - self._nframes = self._dist_matrix.nframes - if self._nframes > 2000: + self._n_frames = self._dist_matrix.n_frames + if self._n_frames > 5000: warnings.warn("The distance matrix is very large, and can " "be very slow to compute. Consider picking a larger " "step size in distance matrix initialization.") def run(self): + """ Create and decompose the diffusion matrix in preparation + for a diffusion map.""" # run only if distance matrix not already calculated if not self._dist_matrix._calculated: self._dist_matrix.run() @@ -296,7 +341,7 @@ def run(self): self._eigenvals, self._eigenvectors = np.linalg.eig(self._diff) sort_idx = np.argsort(self._eigenvals)[::-1] self.eigenvalues = self._eigenvals[sort_idx] - self.eigenvectors = self._eigenvectors[sort_idx] + self._eigenvectors = self._eigenvectors[sort_idx] self._calculated = True def transform(self, n_eigenvectors, time): @@ -315,5 +360,5 @@ def transform(self, n_eigenvectors, time): diffusion_space : array The diffusion map embedding as defined by [Ferguson1]_. """ - return (self.eigenvectors[1:n_eigenvectors+1,].T * + return (self._eigenvectors[1:n_eigenvectors+1,].T * (self.eigenvalues[1:n_eigenvectors+1]**time)) diff --git a/package/doc/sphinx/source/documentation_pages/analysis_modules.rst b/package/doc/sphinx/source/documentation_pages/analysis_modules.rst index c3d3c35814a..3d6a3ec12b0 100644 --- a/package/doc/sphinx/source/documentation_pages/analysis_modules.rst +++ b/package/doc/sphinx/source/documentation_pages/analysis_modules.rst @@ -113,5 +113,5 @@ Dimension Reduction =================== .. 
toctree:: :maxdepth: 1 - + analysis/diffusionmap diff --git a/testsuite/MDAnalysisTests/analysis/test_diffusionmap.py b/testsuite/MDAnalysisTests/analysis/test_diffusionmap.py index 2769e88607f..052af99dea9 100644 --- a/testsuite/MDAnalysisTests/analysis/test_diffusionmap.py +++ b/testsuite/MDAnalysisTests/analysis/test_diffusionmap.py @@ -31,24 +31,24 @@ def setUp(self): self.dmap = diffusionmap.DiffusionMap(self.dist) self.dmap.run() self.eigvals = self.dmap.eigenvalues - self.eigvects = self.dmap.eigenvectors + self.eigvects = self.dmap._eigenvectors def test_eg(self): # number of frames is trajectory is now 10 vs. 98 - assert_equal(self.eigvals.shape, (self.dist.nframes, )) + assert_equal(self.eigvals.shape, (self.dist.n_frames, )) # makes no sense to test values here, no physical meaning def test_dist_weights(self): backbone = self.u.select_atoms('backbone') weights_atoms = np.ones(len(backbone.atoms)) self.dist = diffusionmap.DistanceMatrix(self.u, select='backbone', - weights=weights_atoms) + weights=weights_atoms) self.dist.run() def test_different_steps(self): self.dmap = diffusionmap.DiffusionMap(self.u, select='backbone', step=3) self.dmap.run() - + def test_transform(self): self.n_eigenvectors = 4 self.dmap = diffusionmap.DiffusionMap(self.u)