Merge pull request #1237 from marscher/vamp_rebased
VAMP estimator and cktest
marscher authored Feb 7, 2018
2 parents ac19e84 + 2fb9d9a commit 4173ebe
Showing 23 changed files with 1,543 additions and 55 deletions.
2 changes: 1 addition & 1 deletion devtools/ci/travis/install_miniconda.sh
@@ -30,5 +30,5 @@ else # if it does not exist, we need to install miniconda
fi

# we want to have an up to date conda-build.
conda install conda-build=3.2
conda install conda-build=3
conda info -a # for debugging
5 changes: 3 additions & 2 deletions devtools/conda-recipe/meta.yaml
@@ -24,7 +24,7 @@ requirements:
- numpy 1.9.* # [not (win and (py35 or py36))]
- numpy 1.9.* # [win and py35]
- numpy 1.11.* # [win and py36]
- python >=3
- python
- scipy
- setuptools
- gcc # [ not win ]
@@ -42,10 +42,11 @@ requirements:
- numpy >=1.11,<1.14 # [win and py36]
- pathos
- psutil >3.1
- python >=3
- python
- pyyaml
- scipy
- setuptools
- six >=1.10
- thermotools >=0.2.6
- tqdm

4 changes: 3 additions & 1 deletion devtools/conda-recipe/run_test.py
@@ -9,7 +9,9 @@
# where to write junit xml
junit_xml = os.path.join(os.getenv('CIRCLE_TEST_REPORTS', os.path.expanduser('~')),
'reports', 'junit.xml')
os.makedirs(os.path.dirname(junit_xml), exist_ok=True)
target_dir = os.path.dirname(junit_xml)
if not os.path.exists(target_dir):
os.makedirs(target_dir)
print('junit destination:', junit_xml)
njobs_args = '-p no:xdist' if os.getenv('TRAVIS') else '-n2'

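The run_test.py hunk above trades `os.makedirs(..., exist_ok=True)` for an explicit existence check, presumably because `exist_ok` only exists on Python 3.2+ and the recipe no longer pins `python >=3`. A minimal sketch of an equivalent version-agnostic helper (the `errno.EEXIST` guard is an addition of this sketch, not part of the diff):

import errno
import os


def ensure_dir(path):
    """Create `path` (and parents) if missing; works on Python 2 and 3."""
    try:
        os.makedirs(path)
    except OSError as e:
        # Ignore "already exists"; re-raise anything else (e.g. permissions).
        if e.errno != errno.EEXIST:
            raise


ensure_dir(os.path.join(os.path.expanduser('~'), 'reports'))
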
5 changes: 3 additions & 2 deletions doc/source/CHANGELOG.rst
@@ -5,19 +5,20 @@ Changelog
----------------

As of this version the usage of Python 2.7 is officially deprecated. Please upgrade
your Python installation to at least version 3.5.
your Python installation to at least version 3.5 to catch future updates.

**New features**:

- msm: Added Augmented Markov Models, a way to include averaged experimental
data in the estimation of Markov models from molecular simulations. The method is described in [1]. #1111
- msm: Added mincount_connectivity argument to MSM estimators. This option allows omitting counts below
a given threshold. #1106
- coodinates: selection based features allow alignment to a reference structure. #1184
- coordinates: selection based features allow alignment to a reference structure. #1184
- coordinates: two new center of mass features: ResidueCOMFeature() and GroupCOMFeature()
- coordinates: new configuration variable 'default_chunksize' can be set to limit the size of a fragment
extracted per iteration from a data source. This is invariant to the dimension of data sets. #1190
- datasets: added Prinz potential (quadwell). #1226
- coordinates: added VAMP estimator. #1237


- References:
12 changes: 6 additions & 6 deletions pyemma/_base/estimator.py
@@ -299,7 +299,8 @@ def estimate_param_scan(estimator, X, param_sets, evaluate=None, evaluate_args=N
if evaluate is not None and evaluate_args is not None and len(evaluate) != len(evaluate_args):
raise ValueError("length mismatch: evaluate ({}) and evaluate_args ({})".format(len(evaluate), len(evaluate_args)))

if progress_reporter is not None:
show_progress = progress_reporter is not None and show_progress
if show_progress:
progress_reporter._progress_register(len(estimators), stage=0,
description="estimating %s" % str(estimator.__class__.__name__))

@@ -317,8 +318,7 @@ def estimate_param_scan(estimator, X, param_sets, evaluate=None, evaluate_args=N
from pathos.multiprocessing import Pool as Parallel
pool = Parallel(processes=n_jobs)
args = list(task_iter)
if progress_reporter is not None:
progress_reporter._progress_register(len(estimators), stage=0, description="estimating %s" % str(estimator.__class__.__name__))
if show_progress:
from pyemma._base.model import SampledModel
for a in args:
if isinstance(a[0], SampledModel):
@@ -352,7 +352,7 @@ def error_callback(*args, **kw):
estimators[0].logger.debug('estimating %s with n_jobs=1 because of the setting or '
'you not have a POSIX system', estimator)
res = []
if progress_reporter is not None:
if show_progress:
from pyemma._base.model import SampledModel
if isinstance(estimator, SampledModel):
for e in estimators:
@@ -361,10 +361,10 @@ def error_callback(*args, **kw):
for estimator, param_set in zip(estimators, param_sets):
res.append(_estimate_param_scan_worker(estimator, param_set, X,
evaluate, evaluate_args, failfast, return_exceptions))
if progress_reporter is not None and show_progress:
if show_progress:
progress_reporter._progress_update(1, stage=0)

if progress_reporter is not None and show_progress:
if show_progress:
progress_reporter._progress_force_finish(0)

# done
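The estimator.py hunks above fold the repeated `progress_reporter is not None and show_progress` checks into a single `show_progress` flag evaluated once. A schematic sketch of that pattern (the reporter class and its method names below are placeholders, not the pyemma progress API):

class _PrintReporter:
    """Stand-in progress reporter used only for this sketch."""
    def register(self, n):
        print('registered %d units of work' % n)
    def update(self, n=1):
        print('progress +%d' % n)
    def finish(self):
        print('done')


def run_all(tasks, progress_reporter=None, show_progress=True):
    # Evaluate the combined condition once; every later guard stays in sync.
    show_progress = progress_reporter is not None and show_progress
    if show_progress:
        progress_reporter.register(len(tasks))
    results = []
    for task in tasks:
        results.append(task())
        if show_progress:
            progress_reporter.update(1)
    if show_progress:
        progress_reporter.finish()
    return results


print(run_all([lambda: 1, lambda: 2], progress_reporter=_PrintReporter()))
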
12 changes: 8 additions & 4 deletions pyemma/_ext/variational/solvers/direct.py
@@ -125,7 +125,7 @@ def spd_inv(W, epsilon=1e-10, method='QR', canonical_signs=False):
return Winv


def spd_inv_sqrt(W, epsilon=1e-10, method='QR', canonical_signs=False):
def spd_inv_sqrt(W, epsilon=1e-10, method='QR', canonical_signs=False, return_rank=False):
"""
Computes :math:`W^{-1/2}` of symmetric positive-definite matrix :math:`W`.
@@ -153,14 +153,18 @@ def spd_inv_sqrt(W, epsilon=1e-10, method='QR', canonical_signs=False):
Matrix :math:`L` from the decomposition :math:`W^{-1} = L L^T`.
"""
if (_np.shape(W)[0] == 1):
Winv = 1./_np.sqrt(W[0,0])
if _np.shape(W)[0] == 1:
Winv = 1./_np.sqrt(W[0, 0])
sm = _np.ones(1)
else:
sm, Vm = spd_eig(W, epsilon=epsilon, method=method, canonical_signs=canonical_signs)
Winv = _np.dot(Vm, _np.diag(1.0 / _np.sqrt(sm))).dot(Vm.T)

# return split
return Winv
if return_rank:
return Winv, sm.shape[0]
else:
return Winv


def spd_inv_split(W, epsilon=1e-10, method='QR', canonical_signs=False):
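The new `return_rank` flag of `spd_inv_sqrt` additionally reports how many eigenvalues survive the `epsilon` cutoff. A hedged usage sketch based only on the signature shown above (assumes pyemma is importable; existing call sites are unaffected because `return_rank` defaults to `False`):

import numpy as np

# Module path taken from the file header above; assumes pyemma is installed.
from pyemma._ext.variational.solvers.direct import spd_inv_sqrt

# A symmetric positive-definite test matrix.
A = np.random.rand(5, 5)
W = A.dot(A.T) + np.eye(5)

# New behaviour: also return the number of eigenvalues kept after the cutoff.
Winv_sqrt, rank = spd_inv_sqrt(W, epsilon=1e-10, return_rank=True)
print(Winv_sqrt.shape, rank)

# Old call sites keep working unchanged, since return_rank defaults to False.
Winv_sqrt_only = spd_inv_sqrt(W)
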
2 changes: 2 additions & 0 deletions pyemma/coordinates/__init__.py
@@ -51,6 +51,7 @@
pca
tica
vamp
**Clustering Algorithms**
@@ -84,6 +85,7 @@
transform.PCA
transform.TICA
transform.VAMP
**Covariance estimation**
1 change: 0 additions & 1 deletion pyemma/coordinates/acf.py
@@ -18,7 +18,6 @@



from __future__ import absolute_import, print_function
import numpy as np
import sys

176 changes: 165 additions & 11 deletions pyemma/coordinates/api.py
@@ -51,6 +51,7 @@
'save_trajs',
'pca', # transform
'tica',
'vamp',
'covariance_lagged',
'cluster_regspace', # cluster
'cluster_kmeans',
@@ -375,9 +376,9 @@ def source(inp, features=None, top=None, chunksize=None, **kw):

# CASE 1: input is a string or list of strings
# check: if single string create a one-element list
if isinstance(inp, str) or (
if isinstance(inp, _string_types) or (
isinstance(inp, (list, tuple))
and (any(isinstance(item, (list, tuple, str)) for item in inp) or len(inp) is 0)):
and (any(isinstance(item, (list, tuple, _string_types)) for item in inp) or len(inp) is 0)):
reader = create_file_reader(inp, top, features, chunksize=cs, **kw)

elif isinstance(inp, _np.ndarray) or (isinstance(inp, (list, tuple))
@@ -716,7 +717,7 @@ def save_traj(traj_inp, indexes, outfile, top=None, stride = 1, chunksize=None,
# Do we have what we need?
if not isinstance(traj_inp, (list, tuple)):
raise TypeError("traj_inp has to be of type list, not %s" % type(traj_inp))
if not isinstance(top, (str, Topology, Trajectory)):
if not isinstance(top, (_string_types, Topology, Trajectory)):
raise TypeError("traj_inp cannot be a list of files without an input "
"top of type str (eg filename.pdb), mdtraj.Trajectory or mdtraj.Topology. "
"Got type %s instead" % type(top))
@@ -1255,10 +1256,160 @@ def tica(data=None, lag=10, dim=-1, var_cutoff=0.95, kinetic_map=True, commute_m
return res


def covariance_lagged(data=None, c00=True, c0t=True, ctt=False, remove_constant_mean=None, remove_data_mean=False,
reversible=False, bessel=True, lag=0, weights="empirical", stride=1, skip=0, chunksize=None):
def vamp(data=None, lag=10, dim=None, scaling=None, right=True, ncov_max=float('inf'),
stride=1, skip=0, chunksize=None):
r""" Variational approach for Markov processes (VAMP) [1]_.
Parameters
----------
lag : int
lag time
dim : float or int
Number of dimensions to keep:
* if dim is not set all available ranks are kept:
`n_components == min(n_samples, n_features)`
* if dim is an integer >= 1, this number specifies the number
of dimensions to keep. By default this will use the kinetic
variance.
* if dim is a float with ``0 < dim < 1``, select the number
of dimensions such that the amount of kinetic variance
that needs to be explained is greater than the percentage
specified by dim.
scaling : None or string
Scaling to be applied to the VAMP order parameters upon transformation
* None: no scaling will be applied, variance of the order parameters is 1
* 'kinetic map' or 'km': order parameters are scaled by singular value
Only the left singular functions induce a kinetic map.
Therefore scaling='km' is only effective if `right` is False.
right : boolean
Whether to compute the right singular functions.
If `right==True`, `get_output()` will return the right singular
functions. Otherwise, `get_output()` will return the left singular
functions.
Beware that only `frames[tau:, :]` of each trajectory returned
by `get_output()` contain valid values of the right singular
functions. Conversely, only `frames[0:-tau, :]` of each
trajectory returned by `get_output()` contain valid values of
the left singular functions. The remaining frames might
possibly be interpreted as some extrapolation.
epsilon : float
singular value cutoff. Singular values of :math:`C0` with
norms <= epsilon will be cut off. The remaining number of
singular values define the size of the output.
stride: int, optional, default = 1
Use only every stride-th time step. By default, every time step is used.
skip : int, default=0
skip the first initial n frames per trajectory.
ncov_max : int, default=infinity
limit the memory usage of the algorithm from [3]_ to an amount that corresponds
to ncov_max additional copies of each correlation matrix
Notes
-----
VAMP is a method for dimensionality reduction of Markov processes.
The Koopman operator :math:`\mathcal{K}` is an integral operator
that describes conditional future expectation values. Let
:math:`p(\mathbf{x},\,\mathbf{y})` be the conditional probability
density of visiting an infinitesimal phase space volume around
point :math:`\mathbf{y}` at time :math:`t+\tau` given that the phase
space point :math:`\mathbf{x}` was visited at the earlier time
:math:`t`. Then the action of the Koopman operator on a function
:math:`f` can be written as follows:
.. math::
\mathcal{K}f=\int p(\mathbf{x},\,\mathbf{y})f(\mathbf{y})\,\mathrm{dy}=\mathbb{E}\left[f(\mathbf{x}_{t+\tau}\mid\mathbf{x}_{t}=\mathbf{x})\right]
The Koopman operator is defined without any reference to an
equilibrium distribution. Therefore it is well-defined in
situations where the dynamics is irreversible or/and non-stationary
such that no equilibrium distribution exists.
If we approximate :math:`f` by a linear superposition of ansatz
functions :math:`\boldsymbol{\chi}` of the conformational
degrees of freedom (features), the operator :math:`\mathcal{K}`
can be approximated by a (finite-dimensional) matrix :math:`\mathbf{K}`.
The approximation is computed as follows: From the time-dependent
input features :math:`\boldsymbol{\chi}(t)`, we compute the mean
:math:`\boldsymbol{\mu}_{0}` (:math:`\boldsymbol{\mu}_{1}`) from
all data excluding the last (first) :math:`\tau` steps of every
trajectory as follows:
.. math::
\boldsymbol{\mu}_{0} :=\frac{1}{T-\tau}\sum_{t=0}^{T-\tau}\boldsymbol{\chi}(t)
\boldsymbol{\mu}_{1} :=\frac{1}{T-\tau}\sum_{t=\tau}^{T}\boldsymbol{\chi}(t)
Next, we compute the instantaneous covariance matrices
:math:`\mathbf{C}_{00}` and :math:`\mathbf{C}_{11}` and the
time-lagged covariance matrix :math:`\mathbf{C}_{01}` as follows:
.. math::
\mathbf{C}_{00} :=\frac{1}{T-\tau}\sum_{t=0}^{T-\tau}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right]\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right]
\mathbf{C}_{11} :=\frac{1}{T-\tau}\sum_{t=\tau}^{T}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{1}\right]\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{1}\right]
\mathbf{C}_{01} :=\frac{1}{T-\tau}\sum_{t=0}^{T-\tau}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right]\left[\boldsymbol{\chi}(t+\tau)-\boldsymbol{\mu}_{1}\right]
The Koopman matrix is then computed as follows:
.. math::
\mathbf{K}=\mathbf{C}_{00}^{-1}\mathbf{C}_{01}
It can be shown [1]_ that the leading singular functions of the
half-weighted Koopman matrix
.. math::
\bar{\mathbf{K}}:=\mathbf{C}_{00}^{-\frac{1}{2}}\mathbf{C}_{01}\mathbf{C}_{11}^{-\frac{1}{2}}
encode the best reduced dynamical model for the time series.
The singular functions can be computed by first performing the
singular value decomposition
.. math::
\bar{\mathbf{K}}=\mathbf{U}^{\prime}\mathbf{S}\mathbf{V}^{\prime}
and then mapping the input conformation to the left singular
functions :math:`\boldsymbol{\psi}` and right singular
functions :math:`\boldsymbol{\phi}` as follows:
.. math::
\boldsymbol{\psi}(t):=\mathbf{U}^{\prime\top}\mathbf{C}_{00}^{-\frac{1}{2}}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right]
\boldsymbol{\phi}(t):=\mathbf{V}^{\prime\top}\mathbf{C}_{11}^{-\frac{1}{2}}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{1}\right]
References
----------
.. [1] Wu, H. and Noe, F. 2017. Variational approach for learning Markov processes from time series data.
arXiv:1707.04659v1
.. [2] Noe, F. and Clementi, C. 2015. Kinetic distance and kinetic maps from molecular dynamics simulation.
J. Chem. Theory. Comput. doi:10.1021/acs.jctc.5b00553
.. [3] Chan, T. F., Golub G. H., LeVeque R. J. 1979. Updating formulae and pairwiese algorithms for
computing sample variances. Technical Report STAN-CS-79-773, Department of Computer Science, Stanford University.
"""
Compute lagged covariances between time series. If data is available as an array of size (TxN), where T is the
from pyemma.coordinates.transform.vamp import VAMP
res = VAMP(lag, dim=dim, scaling=scaling, right=right, skip=skip, ncov_max=ncov_max)
if data is not None:
res.estimate(data, stride=stride, chunksize=chunksize)
return res
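To make the Notes section of the docstring concrete, here is a small self-contained NumPy sketch of the estimation steps it describes, using plain dense arithmetic on a single trajectory. The real estimator streams chunked data and applies the `epsilon` rank truncation, both of which are omitted here:

import numpy as np


def vamp_sketch(chi, tau):
    """Toy VAMP on a single (T, n) feature trajectory with lag time tau."""
    X, Y = chi[:-tau], chi[tau:]                 # instantaneous / time-lagged frames
    mu0, mu1 = X.mean(axis=0), Y.mean(axis=0)    # means over the two windows
    X0, Y0 = X - mu0, Y - mu1
    T = X0.shape[0]
    C00 = X0.T.dot(X0) / T                       # instantaneous covariances
    C11 = Y0.T.dot(Y0) / T
    C01 = X0.T.dot(Y0) / T                       # time-lagged covariance

    def inv_sqrt(C):
        # Inverse square root via symmetric eigendecomposition (no truncation).
        s, V = np.linalg.eigh(C)
        return V.dot(np.diag(1.0 / np.sqrt(s))).dot(V.T)

    C00_is, C11_is = inv_sqrt(C00), inv_sqrt(C11)
    Kbar = C00_is.dot(C01).dot(C11_is)           # half-weighted Koopman matrix
    U, S, Vt = np.linalg.svd(Kbar)

    psi = X0.dot(C00_is).dot(U)                  # left singular functions (valid for frames[0:-tau])
    phi = Y0.dot(C11_is).dot(Vt.T)               # right singular functions (valid for frames[tau:])
    return psi, phi, S


traj = np.cumsum(np.random.randn(500, 3), axis=0)   # toy trajectory: 3 correlated features
psi, phi, singular_values = vamp_sketch(traj, tau=10)
print(singular_values)
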


def covariance_lagged(data=None, c00=True, c0t=True, ctt=False, remove_constant_mean=None, remove_data_mean=False,
reversible=False, bessel=True, lag=0, weights="empirical", stride=1, skip=0, chunksize=None,
ncov_max=float('inf')):
r"""Compute lagged covariances between time series. If data is available as an array of size (TxN), where T is the
number of time steps and N the number of dimensions, this function can compute lagged covariances like
.. math::
Expand Down Expand Up @@ -1306,6 +1457,9 @@ def covariance_lagged(data=None, c00=True, c0t=True, ctt=False, remove_constant_
to optimize thread usage and gain processing speed. If None is passed,
use the default value of the underlying reader/data source. Choose zero to
disable chunking at all.
ncov_max : int, default=infinity
limit the memory usage of the algorithm from [2]_ to an amount that corresponds
to ncov_max additional copies of each correlation matrix
Returns
-------
@@ -1314,17 +1468,17 @@ def covariance_lagged(data=None, c00=True, c0t=True, ctt=False, remove_constant_
.. [1] Wu, H., Nueske, F., Paul, F., Klus, S., Koltai, P., and Noe, F. 2016. Bias reduced variational
approximation of molecular kinetics from short off-equilibrium simulations. J. Chem. Phys. (submitted)
.. [2] Chan, T. F., Golub G. H., LeVeque R. J. 1979. Updating formulae and pairwise algorithms for
computing sample variances. Technical Report STAN-CS-79-773, Department of Computer Science, Stanford University.
"""

from pyemma.coordinates.estimation.covariance import LaggedCovariance
from pyemma.coordinates.estimation.koopman import _KoopmanEstimator
import types
if isinstance(weights, str):
if isinstance(weights, _string_types):
if weights== "koopman":
if data is None:
raise ValueError("Data must be supplied for reweighting='koopman'")
koop = _KoopmanEstimator(lag=lag, stride=stride, skip=skip)
koop = _KoopmanEstimator(lag=lag, stride=stride, skip=skip, ncov_max=ncov_max)
koop.estimate(data, chunksize=chunksize)
weights = koop.weights
elif weights == "empirical":
@@ -1342,7 +1496,7 @@ def covariance_lagged(data=None, c00=True, c0t=True, ctt=False, remove_constant_
# chunksize is an estimation parameter for now.
lc = LaggedCovariance(c00=c00, c0t=c0t, ctt=ctt, remove_constant_mean=remove_constant_mean,
remove_data_mean=remove_data_mean, reversible=reversible, bessel=bessel, lag=lag,
weights=weights, stride=stride, skip=skip)
weights=weights, stride=stride, skip=skip, ncov_max=ncov_max)
if data is not None:
lc.estimate(data, chunksize=chunksize)
return lc
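A short usage sketch of the two API entry points touched in this file, based only on the signatures shown in the diff (the data below is synthetic; in practice one would pass featurized trajectories or a reader from `pyemma.coordinates.source`):

import numpy as np
import pyemma.coordinates as coor

data = np.random.randn(1000, 10)        # toy (T x N) feature trajectory

# New VAMP estimator: lag time 10, keep 3 singular functions.
v = coor.vamp(data, lag=10, dim=3)
phi = v.get_output()                    # with the default right=True these are the right
                                        # singular functions; only frames[tau:, :] are valid

# covariance_lagged gained the ncov_max keyword to bound the memory used by the
# moment-combination algorithm of Chan et al. (reference [2] above).
cov = coor.covariance_lagged(data, lag=10, c00=True, c0t=True, ncov_max=8)
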