From b299d891789d7668b6ac5fde90118a41a19cd9b2 Mon Sep 17 00:00:00 2001
From: John Detlefs <jdetle@gmail.com>
Date: Thu, 7 Jul 2016 14:22:10 -0700
Subject: [PATCH] Changed n_frames warning to 5000, eliminated reference to
 anisotropic kernel in docs fixed sphinx linking, documented metric API.

Changes to documentation and some style fixes.

More work on docs

Change to license v2

Fixed tests

Docs fixes, protected eigenvectors
---
 package/MDAnalysis/analysis/diffusionmap.py   | 163 +++++++++++-------
 .../documentation_pages/analysis_modules.rst  |   2 +-
 .../analysis/test_diffusionmap.py             |   8 +-
 3 files changed, 109 insertions(+), 64 deletions(-)

diff --git a/package/MDAnalysis/analysis/diffusionmap.py b/package/MDAnalysis/analysis/diffusionmap.py
index 52e747041f0..79e1dab2ebd 100644
--- a/package/MDAnalysis/analysis/diffusionmap.py
+++ b/package/MDAnalysis/analysis/diffusionmap.py
@@ -20,33 +20,28 @@
 
 :Authors: Eugen Hruska, John Detlefs
 :Year: 2016
-:Copyright: GNU Public License v3
-
-The module contains the non-linear dimension reduction method diffusion map.
-The diffusion map provides an estimate of the slowest collective
-coordinates for a trajectory. This non-linear dimension reduction method
-assumes that the trajectory is long enough to represent a probability
-distribution of a protein close to the equilibrium. Furthermore, the diffusion
-map assumes that the diffusion coefficients associated with the dynamical
-motion of molecules in the system are constant. The eigenvectors with
-the largest eigenvalues are the more dominant collective coordinates. Assigning
-phyiscal meaning of the 'collective coordinates' is a fundamentally difficult
-problem. The time complexity of the diffusion map is O(N^3), where N is the
-number of frames in the trajectory, and the in-memory storage complexity is
-O(N^2). Instead of a single trajectory a sample of protein structures
-can be used. The sample should be equiblibrated, at least locally. The order of
-the sampled structures in the trajectory is irrelevant.
+:Copyright: GNU Public License v2
+
+This module contains the non-linear dimension reduction method diffusion map.
+The eigenvectors of a diffusion matrix represent the 'collective coordinates'
+of a molecule; the largest eigenvalues are the more dominant collective
+coordinates. Assigning physical meaning to the 'collective coordinates' is a
+fundamentally difficult problem. The time complexity of the diffusion map is
+:math:`O(N^3)`, where N is the number of frames in the trajectory, and the in-memory
+storage complexity is :math:`O(N^2)`. Instead of a single trajectory a sample of
+protein structures can be used. The sample should be equilibrated, at least
+locally. The order of the sampled structures in the trajectory is irrelevant.
 
 The :ref:`Diffusion-Map-tutorial` shows how to use diffusion map for dimension
 reduction.
 
-More details about diffusion maps are in [Lafon1]_ , [Ferguson1]_, and
-[Clementi1]_.
+More details about diffusion maps are in [deLaPorte1]_, [Lafon1]_,
+[Ferguson1]_, and [Clementi1]_.
 
 .. _Diffusion-Map-tutorial:
 
 Diffusion Map tutorial
---------------------
+----------------------
 
 The example uses files provided as part of the MDAnalysis test suite
 (in the variables :data:`~MDAnalysis.tests.datafiles.PSF` and
@@ -55,7 +50,7 @@
 
 First load all modules and test data ::
 
-   >>> import MDAnalysis
+   >>> import MDAnalysis as mda
    >>> import numpy as np
    >>> import MDAnalysis.analysis.diffusionmap as diffusionmap
    >>> from MDAnalysis.tests.datafiles import PSF, DCD
@@ -64,7 +59,7 @@
 the Diffusion Matrix from that trajectory using :class:`DiffusionMap`:: and get
 the corresponding eigenvalues and eigenvectors.
 
-   >>> u = MDAnalysis.Universe(PSF,DCD)
+   >>> u = mda.Universe(PSF,DCD)
 
 We leave determination of the appropriate scale parameter epsilon to the user,
 [Clementi1]_ uses a complex method involving the k-nearest-neighbors of a
@@ -75,8 +70,7 @@
    >>> dmap = diffusionmap.DiffusionMap(u, select='backbone', epsilon=2)
    >>> dmap.run()
 
-From here we can perform an embedding onto the k dominant eigenvectors. This
-is similar to the idea of a transform in Principal Component Analysis, but the
+From here we can perform an embedding onto the k dominant eigenvectors. The
 non-linearity of the map means there is no explicit relationship between the
 lower dimensional space and our original trajectory. However, this is an
 isometry (distance preserving map), which means that points close in the lower
@@ -87,40 +81,83 @@
 spectral gap and should be somewhat apparent for a system at equilibrium with a
 high number of frames.
 
-   >>> num_eigenvectors = # some number less than the number of frames
+   >>>  # first cell of  a jupyter notebook should contain: %matplotlib inline
+   >>>  import matplotlib.pyplot as plt
+   >>>  f, ax = plt.subplots()
+   >>>  upper_limit = # some reasonably high number less than the n_eigenvectors
+   >>>  ax.plot(dmap.eigenvalues[:upper_limit])
+   >>>  ax.set(xlabel ='eigenvalue index', ylabel='eigenvalue')
+   >>>  plt.tight_layout()
+
+From here we can transform into the diffusion space:
+
+   >>> num_eigenvectors = # some number less than the number of frames after
+   >>> # inspecting for the spectral gap
    >>> fit = dmap.transform(num_eigenvectors, time=1)
 
-From here it can be difficult to interpret the data, and is left as a task
+It can be difficult to interpret the data, and is left as a task
 for the user. The `diffusion distance` between frames i and j is best
 approximated by the euclidean distance  between rows i and j of
-self.diffusion_space. A Jupyter [notebook](https://github.com/jdetle/dimension_reduction/blob/master/diffusionMaps/Diffusion_Map_Analysis_of_ADK.ipynb)
-providing an analysis of protein pening and closing has been provided.
+the array returned by :meth:`DiffusionMap.transform`.
+
+
+.. _Distance-Matrix-tutorial:
+
+Distance Matrix tutorial
+------------------------
+
+Often, a custom distance matrix could be useful for local epsilon
+determination or other manipulations on the diffusion map method. The
+:class:`DistanceMatrix` exists in :mod:`~MDAnalysis.analysis.diffusionmap`
+and can be passed as an initialization argument for :class:`DiffusionMap`.
+
+    >>> import MDAnalysis as mda
+    >>> import numpy as np
+    >>> import MDAnalysis.analysis.diffusionmap as diffusionmap
+    >>> from MDAnalysis.tests.datafiles import PSF, DCD
+
+Now create the distance matrix and pass it as an argument to
+:class:`DiffusionMap`.
+
+    >>> u = mda.Universe(PSF,DCD)
+    >>> dist_matrix = diffusionmap.DistanceMatrix(u, select='all')
+    >>> dist_matrix.run()
+    >>> dmap = diffusionmap.DiffusionMap(dist_matrix)
+    >>> dmap.run()
 
 Classes
 -------
 
 .. autoclass:: DiffusionMap
-.. autoclass:: DistMatrix
+.. autoclass:: DistanceMatrix
 
 References
----------
+----------
 
 If you use this Dimension Reduction method in a publication, please
-reference:
-..[Ferguson1] Ferguson, A. L.; Panagiotopoulos, A. Z.; Kevrekidis, I. G.
-Debenedetti,  P. G. Nonlinear dimensionality reduction in molecular
-simulation: The diffusion map approach  Chem. Phys. Lett. 509, 1−11 (2011)
-..[deLaPorte1] J. de la Porte, B. M. Herbst, W. Hereman, S. J. van der Walt.
+cite:
+
+.. [Lafon1]
+   Coifman, Ronald R., Lafon, Stephane Diffusion maps. Appl. Comput. Harmon.
+   Anal. 21, 5–30  (2006).
+
+For more information
+--------------------
+
+.. [deLaPorte1]
+J. de la Porte, B. M. Herbst, W. Hereman, S. J. van der Walt.
 An Introduction to Diffusion Maps.
-..[Lafon1] Coifman, Ronald R., Lafon, Stephane Diffusion maps.
-Appl. Comput. Harmon. Anal. 21, 5–30  (2006).
-..[Lafon2] Boaz Nadler, Stéphane Lafon, Ronald R. Coifman, Ioannis G.
-Kevrekidis. Diffusion maps, spectral clustering and reaction coordinates
-of dynamical systems. Appl. Comput. Harmon. Anal. 21 (2006) 113–127
-..[Clementi1] Rohrdanz, M. A, Zheng, W, Maggioni, M, & Clementi, C.
+
+.. [Clementi1]
+Rohrdanz, M. A, Zheng, W, Maggioni, M, & Clementi, C.
 Determination of reaction coordinates via locally scaled
 diffusion map. J. Chem. Phys. 134, 124116 (2011).
 
+.. [Ferguson1]
+   Ferguson, A. L.; Panagiotopoulos, A. Z.; Kevrekidis, I. G.
+   Debenedetti,  P. G. Nonlinear dimensionality reduction in molecular
+   simulation: The diffusion map approach  Chem. Phys. Lett. 509, 1−11 (2011)
+
 .. If you choose the default metric, this module uses the fast QCP algorithm
 [Theobald2005]_ to calculate the root mean square distance (RMSD) between
 two coordinate sets (as implemented
@@ -133,7 +170,7 @@
 import logging
 import warnings
 
-import MDAnalysis as mda
+from MDAnalysis.core.AtomGroup import Universe
 import numpy as np
 
 from .rms import rmsd
@@ -143,8 +180,12 @@
 
 
 class DistanceMatrix(AnalysisBase):
-    """ Calculate the pairwise distance between each frame in a trajectory using
-        a given metric
+    """Calculate the pairwise distance between each frame in a trajectory
+    using a given metric
+
+    A distance matrix can be initialized on its own and used as an
+    initialization argument in :class:`DiffusionMap`. Refer to the
+    :ref:`Distance-Matrix-tutorial` for a demonstration.
 
     Attributes
     ----------
@@ -177,7 +218,11 @@ def __init__(self, u, select='all', metric=rmsd, cutoff=1E0-5,
             different frames. Water should be excluded.
         metric : function, optional
             Maps two numpy arrays to a float, is positive definite and
-            symmetric, Default: metric is set to rms.rmsd().
+            symmetric. The API for a metric requires that the arrays must have
+            equal length, and that the function should have weights as an
+            optional argument. Weights give each index value its own weight for
+            the metric calculation over the entire arrays. Default: metric is
+            set to rms.rmsd().
         cutoff : float, optional
             Specify a given cutoff for metric values to be considered equal,
             Default: 1EO-5
@@ -190,7 +235,6 @@ def __init__(self, u, select='all', metric=rmsd, cutoff=1E0-5,
         step : int, optional
             Step between frames to analyse, Default: 1
         """
-
         self._u = u
         traj = self._u.trajectory
         self.atoms = self._u.select_atoms(select)
@@ -198,11 +242,11 @@ def __init__(self, u, select='all', metric=rmsd, cutoff=1E0-5,
         self._cutoff = cutoff
         self._weights = weights
         self._calculated = False
-        # remember that this must be called before referencing self.nframes
+        # remember that this must be called before referencing self.n_frames
         self._setup_frames(traj, start, stop, step)
 
     def _prepare(self):
-        self.dist_matrix = np.zeros((self.nframes, self.nframes))
+        self.dist_matrix = np.zeros((self.n_frames, self.n_frames))
 
     def _single_frame(self):
         iframe = self._ts.frame
@@ -238,11 +282,6 @@ class DiffusionMap(object):
     ----------
     eigenvalues: array
         Eigenvalues of the diffusion map
-    eigenvectors: array
-        Eigenvectors of the diffusion map
-    diffusion_space : array
-        After calling `transform(n_eigenvectors)` the diffusion map embedding
-        into the lower dimensional diffusion space will exist here.
 
     Methods
     -------
@@ -265,9 +304,13 @@ def __init__(self, u, epsilon=1, **kwargs):
             into a diffusion kernel.
         epsilon : Float
             Specifies the method used for the choice of scale parameter in the
-            diffusion map. More information in [1], [2] and [3], Default: 1.
-        """
-        if isinstance(u, mda.Universe):
+            diffusion map. More information in [Lafon1]_, [Ferguson1]_ and
+            [Clementi1]_, Default: 1.
+        **kwargs
+            Parameters to be passed for the initialization of a
+            :class:`DistanceMatrix`.
+            """
+        if isinstance(u, Universe):
             self._dist_matrix = DistanceMatrix(u, **kwargs)
         elif isinstance(u, DistanceMatrix):
             self._dist_matrix = u
@@ -276,14 +319,16 @@ def __init__(self, u, epsilon=1, **kwargs):
                              " so the DiffusionMap has no data to work with.")
         self._epsilon = epsilon
         # important for transform function and length of .run() method
-        self._nframes = self._dist_matrix.nframes
-        if self._nframes > 2000:
+        self._n_frames = self._dist_matrix.n_frames
+        if self._n_frames > 5000:
             warnings.warn("The distance matrix is very large, and can "
                           "be very slow to compute. Consider picking a larger "
                           "step size in distance matrix initialization.")
 
 
     def run(self):
+        """ Create and decompose the diffusion matrix in preparation
+        for a diffusion map."""
         # run only if distance matrix not already calculated
         if not self._dist_matrix._calculated:
             self._dist_matrix.run()
@@ -296,7 +341,7 @@ def run(self):
         self._eigenvals, self._eigenvectors = np.linalg.eig(self._diff)
         sort_idx = np.argsort(self._eigenvals)[::-1]
         self.eigenvalues = self._eigenvals[sort_idx]
-        self.eigenvectors = self._eigenvectors[sort_idx]
+        self._eigenvectors = self._eigenvectors[sort_idx]
         self._calculated = True
 
     def transform(self, n_eigenvectors, time):
@@ -315,5 +360,5 @@ def transform(self, n_eigenvectors, time):
         diffusion_space : array
             The diffusion map embedding as defined by [Ferguson1]_.
         """
-        return (self.eigenvectors[1:n_eigenvectors+1,].T *
+        return (self._eigenvectors[1:n_eigenvectors+1,].T *
                 (self.eigenvalues[1:n_eigenvectors+1]**time))
diff --git a/package/doc/sphinx/source/documentation_pages/analysis_modules.rst b/package/doc/sphinx/source/documentation_pages/analysis_modules.rst
index c3d3c35814a..3d6a3ec12b0 100644
--- a/package/doc/sphinx/source/documentation_pages/analysis_modules.rst
+++ b/package/doc/sphinx/source/documentation_pages/analysis_modules.rst
@@ -113,5 +113,5 @@ Dimension Reduction
 ===================
 .. toctree::
    :maxdepth: 1
-   
+
    analysis/diffusionmap
diff --git a/testsuite/MDAnalysisTests/analysis/test_diffusionmap.py b/testsuite/MDAnalysisTests/analysis/test_diffusionmap.py
index 2769e88607f..052af99dea9 100644
--- a/testsuite/MDAnalysisTests/analysis/test_diffusionmap.py
+++ b/testsuite/MDAnalysisTests/analysis/test_diffusionmap.py
@@ -31,24 +31,24 @@ def setUp(self):
         self.dmap = diffusionmap.DiffusionMap(self.dist)
         self.dmap.run()
         self.eigvals = self.dmap.eigenvalues
-        self.eigvects = self.dmap.eigenvectors
+        self.eigvects = self.dmap._eigenvectors
 
     def test_eg(self):
         # number of frames is trajectory is now 10 vs. 98
-        assert_equal(self.eigvals.shape, (self.dist.nframes, ))
+        assert_equal(self.eigvals.shape, (self.dist.n_frames, ))
         # makes no sense to test values here, no physical meaning
 
     def test_dist_weights(self):
         backbone = self.u.select_atoms('backbone')
         weights_atoms = np.ones(len(backbone.atoms))
         self.dist = diffusionmap.DistanceMatrix(self.u, select='backbone',
-                                           weights=weights_atoms)
+                                                weights=weights_atoms)
         self.dist.run()
 
     def test_different_steps(self):
         self.dmap = diffusionmap.DiffusionMap(self.u, select='backbone', step=3)
         self.dmap.run()
-    
+
     def test_transform(self):
         self.n_eigenvectors = 4
         self.dmap = diffusionmap.DiffusionMap(self.u)