diff --git a/devtools/ci/travis/install_miniconda.sh b/devtools/ci/travis/install_miniconda.sh
index ace5acd15..76b5ac968 100755
--- a/devtools/ci/travis/install_miniconda.sh
+++ b/devtools/ci/travis/install_miniconda.sh
@@ -30,5 +30,5 @@ else # if it does not exist, we need to install miniconda
fi
# we want to have an up to date conda-build.
-conda install conda-build=3.2
+conda install conda-build=3
conda info -a # for debugging
diff --git a/devtools/conda-recipe/meta.yaml b/devtools/conda-recipe/meta.yaml
index 6095c74dd..e8f26ef92 100644
--- a/devtools/conda-recipe/meta.yaml
+++ b/devtools/conda-recipe/meta.yaml
@@ -24,7 +24,7 @@ requirements:
- numpy 1.9.* # [not (win and (py35 or py36))]
- numpy 1.9.* # [win and py35]
- numpy 1.11.* # [win and py36]
- - python >=3
+ - python
- scipy
- setuptools
- gcc # [ not win ]
@@ -42,10 +42,11 @@ requirements:
- numpy >=1.11,<1.14 # [win and py36]
- pathos
- psutil >3.1
- - python >=3
+ - python
- pyyaml
- scipy
- setuptools
+ - six >=1.10
- thermotools >=0.2.6
- tqdm
diff --git a/devtools/conda-recipe/run_test.py b/devtools/conda-recipe/run_test.py
index 71bef7209..dcece6801 100644
--- a/devtools/conda-recipe/run_test.py
+++ b/devtools/conda-recipe/run_test.py
@@ -9,7 +9,9 @@
# where to write junit xml
junit_xml = os.path.join(os.getenv('CIRCLE_TEST_REPORTS', os.path.expanduser('~')),
'reports', 'junit.xml')
-os.makedirs(os.path.dirname(junit_xml), exist_ok=True)
+target_dir = os.path.dirname(junit_xml)
+if not os.path.exists(target_dir):
+ os.makedirs(target_dir)
print('junit destination:', junit_xml)
njobs_args = '-p no:xdist' if os.getenv('TRAVIS') else '-n2'
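
A note on the run_test.py hunk above: os.makedirs(..., exist_ok=True) is Python-3-only, which is presumably why it was replaced by an explicit existence check. That check is fine for a single CI process but racy under concurrency; a minimal EAFP-style sketch that is safe on both Python 2 and 3 (the helper name makedirs_compat is illustrative, not part of this PR):

    import errno
    import os

    def makedirs_compat(path):
        # behaves like os.makedirs(path, exist_ok=True) on Python 2 and 3
        try:
            os.makedirs(path)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise
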
diff --git a/doc/source/CHANGELOG.rst b/doc/source/CHANGELOG.rst
index 35a82c182..22d0e1d7c 100644
--- a/doc/source/CHANGELOG.rst
+++ b/doc/source/CHANGELOG.rst
@@ -5,7 +5,7 @@ Changelog
----------------
As of this version the usage of Python 2.7 is officially deprecated. Please upgrade
-your Python installation to at least version 3.5.
+your Python installation to at least version 3.5 to catch future updates.
**New features**:
@@ -13,11 +13,12 @@ your Python installation to at least version 3.5.
data into estimation of Markov models from molecular simulations. The method is described in [1]. #1111
- msm: Added mincount_connectivity argument to MSM estimators. This option enables to omit counts below
a given threshold. #1106
-- coodinates: selection based features allow alignment to a reference structure. #1184
+- coordinates: selection based features allow alignment to a reference structure. #1184
- coordinates: two new center of mass features: ResidueCOMFeature() and GroupCOMFeature()
- coordinates: new configuration variable 'default_chunksize' can be set to limit the size of a fragment
extracted per iteration from a data source. This is invariant to the dimension of data sets. #1190
- datasets: added Prinz potential (quadwell). #1226
+- coordinates: added VAMP estimator. #1237
- References:
diff --git a/pyemma/_base/estimator.py b/pyemma/_base/estimator.py
index 88564d92f..64a3d1b68 100644
--- a/pyemma/_base/estimator.py
+++ b/pyemma/_base/estimator.py
@@ -299,7 +299,8 @@ def estimate_param_scan(estimator, X, param_sets, evaluate=None, evaluate_args=N
if evaluate is not None and evaluate_args is not None and len(evaluate) != len(evaluate_args):
raise ValueError("length mismatch: evaluate ({}) and evaluate_args ({})".format(len(evaluate), len(evaluate_args)))
- if progress_reporter is not None:
+ show_progress = progress_reporter is not None and show_progress
+ if show_progress:
progress_reporter._progress_register(len(estimators), stage=0,
description="estimating %s" % str(estimator.__class__.__name__))
@@ -317,8 +318,7 @@ def estimate_param_scan(estimator, X, param_sets, evaluate=None, evaluate_args=N
from pathos.multiprocessing import Pool as Parallel
pool = Parallel(processes=n_jobs)
args = list(task_iter)
- if progress_reporter is not None:
- progress_reporter._progress_register(len(estimators), stage=0, description="estimating %s" % str(estimator.__class__.__name__))
+ if show_progress:
from pyemma._base.model import SampledModel
for a in args:
if isinstance(a[0], SampledModel):
@@ -352,7 +352,7 @@ def error_callback(*args, **kw):
estimators[0].logger.debug('estimating %s with n_jobs=1 because of the setting or '
'you not have a POSIX system', estimator)
res = []
- if progress_reporter is not None:
+ if show_progress:
from pyemma._base.model import SampledModel
if isinstance(estimator, SampledModel):
for e in estimators:
@@ -361,10 +361,10 @@ def error_callback(*args, **kw):
for estimator, param_set in zip(estimators, param_sets):
res.append(_estimate_param_scan_worker(estimator, param_set, X,
evaluate, evaluate_args, failfast, return_exceptions))
- if progress_reporter is not None and show_progress:
+ if show_progress:
progress_reporter._progress_update(1, stage=0)
- if progress_reporter is not None and show_progress:
+ if show_progress:
progress_reporter._progress_force_finish(0)
# done
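
What the show_progress refactor above guarantees, in one place: progress is reported only when a reporter object exists and the caller requested it. A minimal sketch of the combined flag (names mirror the function arguments):

    progress_reporter = None   # caller passed no reporter
    show_progress = True       # caller asked for progress
    show_progress = progress_reporter is not None and show_progress
    # show_progress is now False, so every later `if show_progress:` branch
    # is skipped and no attribute of the missing reporter is ever touched
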
diff --git a/pyemma/_ext/variational/solvers/direct.py b/pyemma/_ext/variational/solvers/direct.py
index db442aedf..d3c2be57b 100644
--- a/pyemma/_ext/variational/solvers/direct.py
+++ b/pyemma/_ext/variational/solvers/direct.py
@@ -125,7 +125,7 @@ def spd_inv(W, epsilon=1e-10, method='QR', canonical_signs=False):
return Winv
-def spd_inv_sqrt(W, epsilon=1e-10, method='QR', canonical_signs=False):
+def spd_inv_sqrt(W, epsilon=1e-10, method='QR', canonical_signs=False, return_rank=False):
"""
Computes :math:`W^{-1/2}` of symmetric positive-definite matrix :math:`W`.
@@ -153,14 +153,18 @@ def spd_inv_sqrt(W, epsilon=1e-10, method='QR', canonical_signs=False):
Matrix :math:`L` from the decomposition :math:`W^{-1} = L L^T`.
"""
- if (_np.shape(W)[0] == 1):
- Winv = 1./_np.sqrt(W[0,0])
+ if _np.shape(W)[0] == 1:
+ Winv = 1./_np.sqrt(W[0, 0])
+ sm = _np.ones(1)
else:
sm, Vm = spd_eig(W, epsilon=epsilon, method=method, canonical_signs=canonical_signs)
Winv = _np.dot(Vm, _np.diag(1.0 / _np.sqrt(sm))).dot(Vm.T)
# return split
- return Winv
+ if return_rank:
+ return Winv, sm.shape[0]
+ else:
+ return Winv
def spd_inv_split(W, epsilon=1e-10, method='QR', canonical_signs=False):
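
The new return_rank flag exposes the effective rank left after the epsilon cutoff, which the VAMP code below uses to bound its output dimension. A small usage sketch with a deliberately rank-deficient matrix:

    import numpy as np
    from pyemma._ext.variational.solvers.direct import spd_inv_sqrt

    W = np.diag([4.0, 1.0, 1e-16])   # symmetric PSD, numerically rank 2
    L, rank = spd_inv_sqrt(W, epsilon=1e-10, return_rank=True)
    print(rank)                      # 2: the near-zero eigenvalue was cut off
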
diff --git a/pyemma/coordinates/__init__.py b/pyemma/coordinates/__init__.py
index c7b5dbfba..0b3adf034 100644
--- a/pyemma/coordinates/__init__.py
+++ b/pyemma/coordinates/__init__.py
@@ -51,6 +51,7 @@
pca
tica
+ vamp
**Clustering Algorithms**
@@ -84,6 +85,7 @@
transform.PCA
transform.TICA
+ transform.VAMP
**Covariance estimation**
diff --git a/pyemma/coordinates/acf.py b/pyemma/coordinates/acf.py
index 86eb8d489..746bcde3d 100644
--- a/pyemma/coordinates/acf.py
+++ b/pyemma/coordinates/acf.py
@@ -18,7 +18,6 @@
-from __future__ import absolute_import, print_function
import numpy as np
import sys
diff --git a/pyemma/coordinates/api.py b/pyemma/coordinates/api.py
index e8127c59b..75ad04717 100644
--- a/pyemma/coordinates/api.py
+++ b/pyemma/coordinates/api.py
@@ -51,6 +51,7 @@
'save_trajs',
'pca', # transform
'tica',
+ 'vamp',
'covariance_lagged',
'cluster_regspace', # cluster
'cluster_kmeans',
@@ -375,9 +376,9 @@ def source(inp, features=None, top=None, chunksize=None, **kw):
# CASE 1: input is a string or list of strings
# check: if single string create a one-element list
- if isinstance(inp, str) or (
+ if isinstance(inp, _string_types) or (
isinstance(inp, (list, tuple))
- and (any(isinstance(item, (list, tuple, str)) for item in inp) or len(inp) is 0)):
+ and (any(isinstance(item, (list, tuple, _string_types)) for item in inp) or len(inp) == 0)):
reader = create_file_reader(inp, top, features, chunksize=cs, **kw)
elif isinstance(inp, _np.ndarray) or (isinstance(inp, (list, tuple))
@@ -716,7 +717,7 @@ def save_traj(traj_inp, indexes, outfile, top=None, stride = 1, chunksize=None,
# Do we have what we need?
if not isinstance(traj_inp, (list, tuple)):
raise TypeError("traj_inp has to be of type list, not %s" % type(traj_inp))
- if not isinstance(top, (str, Topology, Trajectory)):
+ if not isinstance(top, (_string_types, Topology, Trajectory)):
raise TypeError("traj_inp cannot be a list of files without an input "
"top of type str (eg filename.pdb), mdtraj.Trajectory or mdtraj.Topology. "
"Got type %s instead" % type(top))
@@ -1255,10 +1256,160 @@ def tica(data=None, lag=10, dim=-1, var_cutoff=0.95, kinetic_map=True, commute_m
return res
-def covariance_lagged(data=None, c00=True, c0t=True, ctt=False, remove_constant_mean=None, remove_data_mean=False,
- reversible=False, bessel=True, lag=0, weights="empirical", stride=1, skip=0, chunksize=None):
+def vamp(data=None, lag=10, dim=None, scaling=None, right=True, epsilon=1e-6,
+ ncov_max=float('inf'), stride=1, skip=0, chunksize=None):
+ r""" Variational approach for Markov processes (VAMP) [1]_.
+
+ Parameters
+ ----------
+ lag : int
+ lag time
+ dim : float or int
+ Number of dimensions to keep:
+
+ * if dim is not set all available ranks are kept:
+ `n_components == min(n_samples, n_features)`
+ * if dim is an integer >= 1, this number specifies the number
+ of dimensions to keep.
+ * if dim is a float with ``0 < dim < 1``, select the number
+ of dimensions such that the amount of kinetic variance
+ that needs to be explained is greater than the percentage
+ specified by dim.
+ scaling : None or string
+ Scaling to be applied to the VAMP order parameters upon transformation
+
+ * None: no scaling will be applied, variance of the order parameters is 1
+ * 'kinetic map' or 'km': order parameters are scaled by their
+ singular values. Note that only the left singular functions induce a
+ kinetic map; therefore scaling='km' is only effective if `right` is False.
+ right : boolean
+ Whether to compute the right singular functions.
+ If `right==True`, `get_output()` will return the right singular
+ functions. Otherwise, `get_output()` will return the left singular
+ functions.
+ Beware that only `frames[tau:, :]` of each trajectory returned
+ by `get_output()` contain valid values of the right singular
+ functions. Conversely, only `frames[0:-tau, :]` of each
+ trajectory returned by `get_output()` contain valid values of
+ the left singular functions. The remaining frames can at
+ best be interpreted as an extrapolation.
+ epsilon : float
+ singular value cutoff. Singular values of :math:`C_{00}` and
+ :math:`C_{11}` with norms <= epsilon will be cut off. The remaining
+ number of singular values defines the size of the output.
+ stride: int, optional, default = 1
+ Use only every stride-th time step. By default, every time step is used.
+ skip : int, default=0
+ skip the first initial n frames per trajectory.
+ ncov_max : int, default=infinity
+ limit the memory usage of the algorithm from [3]_ to an amount that corresponds
+ to ncov_max additional copies of each correlation matrix
+
+ Notes
+ -----
+ VAMP is a method for dimensionality reduction of Markov processes.
+
+ The Koopman operator :math:`\mathcal{K}` is an integral operator
+ that describes conditional future expectation values. Let
+ :math:`p(\mathbf{x},\,\mathbf{y})` be the conditional probability
+ density of visiting an infinitesimal phase space volume around
+ point :math:`\mathbf{y}` at time :math:`t+\tau` given that the phase
+ space point :math:`\mathbf{x}` was visited at the earlier time
+ :math:`t`. Then the action of the Koopman operator on a function
+ :math:`f` can be written as follows:
+
+ .. math::
+
+ \mathcal{K}f=\int p(\mathbf{x},\,\mathbf{y})f(\mathbf{y})\,\mathrm{dy}=\mathbb{E}\left[f(\mathbf{x}_{t+\tau}\mid\mathbf{x}_{t}=\mathbf{x})\right]
+
+ The Koopman operator is defined without any reference to an
+ equilibrium distribution. Therefore it is well-defined in
+ situations where the dynamics is irreversible and/or non-stationary
+ such that no equilibrium distribution exists.
+
+ If we approximate :math:`f` by a linear superposition of ansatz
+ functions :math:`\boldsymbol{\chi}` of the conformational
+ degrees of freedom (features), the operator :math:`\mathcal{K}`
+ can be approximated by a (finite-dimensional) matrix :math:`\mathbf{K}`.
+
+ The approximation is computed as follows: From the time-dependent
+ input features :math:`\boldsymbol{\chi}(t)`, we compute the mean
+ :math:`\boldsymbol{\mu}_{0}` (:math:`\boldsymbol{\mu}_{1}`) from
+ all data excluding the last (first) :math:`\tau` steps of every
+ trajectory as follows:
+
+ .. math::
+
+ \boldsymbol{\mu}_{0} :=\frac{1}{T-\tau}\sum_{t=0}^{T-\tau}\boldsymbol{\chi}(t)
+
+ \boldsymbol{\mu}_{1} :=\frac{1}{T-\tau}\sum_{t=\tau}^{T}\boldsymbol{\chi}(t)
+
+ Next, we compute the instantaneous covariance matrices
+ :math:`\mathbf{C}_{00}` and :math:`\mathbf{C}_{11}` and the
+ time-lagged covariance matrix :math:`\mathbf{C}_{01}` as follows:
+
+ .. math::
+
+ \mathbf{C}_{00} :=\frac{1}{T-\tau}\sum_{t=0}^{T-\tau}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right]\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right]^{\top}
+
+ \mathbf{C}_{11} :=\frac{1}{T-\tau}\sum_{t=\tau}^{T}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{1}\right]\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{1}\right]^{\top}
+
+ \mathbf{C}_{01} :=\frac{1}{T-\tau}\sum_{t=0}^{T-\tau}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right]\left[\boldsymbol{\chi}(t+\tau)-\boldsymbol{\mu}_{1}\right]^{\top}
+
+ The Koopman matrix is then computed as follows:
+
+ .. math::
+
+ \mathbf{K}=\mathbf{C}_{00}^{-1}\mathbf{C}_{01}
+
+ It can be shown [1]_ that the leading singular functions of the
+ half-weighted Koopman matrix
+
+ .. math::
+
+ \bar{\mathbf{K}}:=\mathbf{C}_{00}^{-\frac{1}{2}}\mathbf{C}_{01}\mathbf{C}_{11}^{-\frac{1}{2}}
+
+ encode the best reduced dynamical model for the time series.
+
+ The singular functions can be computed by first performing the
+ singular value decomposition
+
+ .. math::
+
+ \bar{\mathbf{K}}=\mathbf{U}^{\prime}\mathbf{S}\mathbf{V}^{\prime\top}
+
+ and then mapping the input conformation to the left singular
+ functions :math:`\boldsymbol{\psi}` and right singular
+ functions :math:`\boldsymbol{\phi}` as follows:
+
+ .. math::
+
+ \boldsymbol{\psi}(t):=\mathbf{U}^{\prime\top}\mathbf{C}_{00}^{-\frac{1}{2}}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right]
+
+ \boldsymbol{\phi}(t):=\mathbf{V}^{\prime\top}\mathbf{C}_{11}^{-\frac{1}{2}}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{1}\right]
+
+
+ References
+ ----------
+ .. [1] Wu, H. and Noe, F. 2017. Variational approach for learning Markov processes from time series data.
+ arXiv:1707.04659v1
+ .. [2] Noe, F. and Clementi, C. 2015. Kinetic distance and kinetic maps from molecular dynamics simulation.
+ J. Chem. Theory. Comput. doi:10.1021/acs.jctc.5b00553
+ .. [3] Chan, T. F., Golub G. H., LeVeque R. J. 1979. Updating formulae and a pairwise algorithm for
+ computing sample variances. Technical Report STAN-CS-79-773, Department of Computer Science, Stanford University.
"""
- Compute lagged covariances between time series. If data is available as an array of size (TxN), where T is the
+ from pyemma.coordinates.transform.vamp import VAMP
+ res = VAMP(lag, dim=dim, scaling=scaling, right=right, epsilon=epsilon, skip=skip, ncov_max=ncov_max)
+ if data is not None:
+ res.estimate(data, stride=stride, chunksize=chunksize)
+ return res
+
+
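
A quick usage sketch for the new API function (the random-walk input is purely illustrative):

    import numpy as np
    import pyemma.coordinates as coor

    trajs = [np.cumsum(np.random.randn(1000, 5), axis=0) for _ in range(3)]
    v = coor.vamp(trajs, lag=10, dim=0.95)   # keep 95% of the kinetic variance
    print(v.dimension(), v.singular_values)
    Y = v.get_output()                       # projections onto the singular functions
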
+def covariance_lagged(data=None, c00=True, c0t=True, ctt=False, remove_constant_mean=None, remove_data_mean=False,
+ reversible=False, bessel=True, lag=0, weights="empirical", stride=1, skip=0, chunksize=None,
+ ncov_max=float('inf')):
+ r"""Compute lagged covariances between time series. If data is available as an array of size (TxN), where T is the
number of time steps and N the number of dimensions, this function can compute lagged covariances like
.. math::
@@ -1306,6 +1457,9 @@ def covariance_lagged(data=None, c00=True, c0t=True, ctt=False, remove_constant_
to optimize thread usage and gain processing speed. If None is passed,
use the default value of the underlying reader/data source. Choose zero to
disable chunking at all.
+ ncov_max : int, default=infinity
+ limit the memory usage of the algorithm from [2]_ to an amount that corresponds
+ to ncov_max additional copies of each correlation matrix
Returns
-------
@@ -1314,17 +1468,17 @@ def covariance_lagged(data=None, c00=True, c0t=True, ctt=False, remove_constant_
.. [1] Wu, H., Nueske, F., Paul, F., Klus, S., Koltai, P., and Noe, F. 2016. Bias reduced variational
approximation of molecular kinetics from short off-equilibrium simulations. J. Chem. Phys. (submitted)
-
+ .. [2] Chan, T. F., Golub G. H., LeVeque R. J. 1979. Updating formulae and a pairwise algorithm for
+ computing sample variances. Technical Report STAN-CS-79-773, Department of Computer Science, Stanford University.
"""
-
from pyemma.coordinates.estimation.covariance import LaggedCovariance
from pyemma.coordinates.estimation.koopman import _KoopmanEstimator
import types
- if isinstance(weights, str):
+ if isinstance(weights, _string_types):
if weights== "koopman":
if data is None:
raise ValueError("Data must be supplied for reweighting='koopman'")
- koop = _KoopmanEstimator(lag=lag, stride=stride, skip=skip)
+ koop = _KoopmanEstimator(lag=lag, stride=stride, skip=skip, ncov_max=ncov_max)
koop.estimate(data, chunksize=chunksize)
weights = koop.weights
elif weights == "empirical":
@@ -1342,7 +1496,7 @@ def covariance_lagged(data=None, c00=True, c0t=True, ctt=False, remove_constant_
# chunksize is an estimation parameter for now.
lc = LaggedCovariance(c00=c00, c0t=c0t, ctt=ctt, remove_constant_mean=remove_constant_mean,
remove_data_mean=remove_data_mean, reversible=reversible, bessel=bessel, lag=lag,
- weights=weights, stride=stride, skip=skip)
+ weights=weights, stride=stride, skip=skip, ncov_max=ncov_max)
if data is not None:
lc.estimate(data, chunksize=chunksize)
return lc
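
A usage sketch for the extended covariance_lagged (toy data; the value 64 for ncov_max is arbitrary):

    import numpy as np
    import pyemma.coordinates as coor

    data = np.random.randn(10000, 10)
    # ncov_max bounds the number of intermediate moment matrices kept by the
    # streaming algorithm of Chan et al. (reference [2] above)
    lc = coor.covariance_lagged(data, c00=True, c0t=True, lag=5, ncov_max=64)
    print(lc.C00_.shape, lc.C0t_.shape)
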
diff --git a/pyemma/coordinates/data/sources_merger.py b/pyemma/coordinates/data/sources_merger.py
index 1e663352f..0b1e7351d 100644
--- a/pyemma/coordinates/data/sources_merger.py
+++ b/pyemma/coordinates/data/sources_merger.py
@@ -18,10 +18,10 @@ class SourcesMerger(DataSource, SerializableMixIn):
sources : list, tuple
list of DataSources (Readers, StreamingTransformers etc.) to combine for streaming access.
- chunk: int
+ chunk: int or None
chunk size to use for underlying iterators.
"""
- def __init__(self, sources, chunk=5000):
+ def __init__(self, sources, chunk=None):
super(SourcesMerger, self).__init__(chunksize=chunk)
self.sources = sources
self._is_reader = True
diff --git a/pyemma/coordinates/data/util/reader_utils.py b/pyemma/coordinates/data/util/reader_utils.py
index db1024639..907d1a6f9 100644
--- a/pyemma/coordinates/data/util/reader_utils.py
+++ b/pyemma/coordinates/data/util/reader_utils.py
@@ -23,6 +23,8 @@
import numpy as np
import os
+from six import string_types
+
def create_file_reader(input_files, topology, featurizer, chunksize=None, **kw):
r"""
@@ -43,8 +45,6 @@ def create_file_reader(input_files, topology, featurizer, chunksize=None, **kw):
from pyemma.coordinates.data.py_csv_reader import PyCSVReader
from pyemma.coordinates.data import FeatureReader
from pyemma.coordinates.data.fragmented_trajectory_reader import FragmentedTrajectoryReader
- import six
- str = six.string_types
# fragmented trajectories
if (isinstance(input_files, (list, tuple)) and len(input_files) > 0 and
@@ -52,15 +52,15 @@ def create_file_reader(input_files, topology, featurizer, chunksize=None, **kw):
return FragmentedTrajectoryReader(input_files, topology, chunksize, featurizer)
# normal trajectories
- if (isinstance(input_files, str)
+ if (isinstance(input_files, string_types)
or (isinstance(input_files, (list, tuple))
- and (any(isinstance(item, str) for item in input_files)
+ and (any(isinstance(item, string_types) for item in input_files)
or len(input_files) is 0))):
reader = None
# check: if single string create a one-element list
- if isinstance(input_files, str):
+ if isinstance(input_files, string_types):
input_list = [input_files]
- elif len(input_files) > 0 and all(isinstance(item, str) for item in input_files):
+ elif len(input_files) > 0 and all(isinstance(item, string_types) for item in input_files):
input_list = input_files
else:
if len(input_files) is 0:
@@ -177,7 +177,7 @@ def preallocate_empty_trajectory(top, n_frames=1):
def enforce_top(top):
- if isinstance(top, str):
+ if isinstance(top, string_types):
top = md.load(top).top
elif isinstance(top, md.Trajectory):
top = top.top
diff --git a/pyemma/coordinates/estimation/covariance.py b/pyemma/coordinates/estimation/covariance.py
index 2405f3200..5d84726a9 100644
--- a/pyemma/coordinates/estimation/covariance.py
+++ b/pyemma/coordinates/estimation/covariance.py
@@ -77,14 +77,19 @@ class LaggedCovariance(StreamingEstimator):
Use only every stride-th time step. By default, every time step is used.
skip : int, optional, default=0
skip the first initial n frames per trajectory.
- chunksize : deprecated, default=NoTImplemented
- The chunk size can be se during estimation.
+ chunksize : deprecated, default=NotImplemented
+ The chunk size should now be set during estimation.
"""
def __init__(self, c00=True, c0t=False, ctt=False, remove_constant_mean=None, remove_data_mean=False, reversible=False,
bessel=True, sparse_mode='auto', modify_data=False, lag=0, weights=None, stride=1, skip=0,
chunksize=NotImplemented, ncov_max=float('inf')):
super(LaggedCovariance, self).__init__()
+ if chunksize is not NotImplemented:
+ import warnings
+ from pyemma.util.exceptions import PyEMMA_DeprecationWarning
+ warnings.warn('passed deprecated argument chunksize to LaggedCovariance. Will be ignored!',
+ category=PyEMMA_DeprecationWarning)
if (c0t or ctt) and lag == 0:
raise ValueError("lag must be positive if c0t=True or ctt=True")
diff --git a/pyemma/coordinates/tests/test_vamp.py b/pyemma/coordinates/tests/test_vamp.py
new file mode 100644
index 000000000..2d04a8eb1
--- /dev/null
+++ b/pyemma/coordinates/tests/test_vamp.py
@@ -0,0 +1,283 @@
+# This file is part of PyEMMA.
+#
+# Copyright (c) 2017 Computational Molecular Biology Group, Freie Universitaet Berlin (GER)
+#
+# PyEMMA is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+
+"""
+@author: paul
+"""
+
+from __future__ import absolute_import
+import unittest
+import numpy as np
+from pyemma.coordinates import vamp as pyemma_api_vamp
+from pyemma.msm import estimate_markov_model
+from logging import getLogger
+
+from pyemma.msm.estimators._dtraj_stats import cvsplit_dtrajs
+
+logger = getLogger('pyemma.'+'TestVAMP')
+
+
+def random_matrix(n, rank=None, eps=0.01):
+ m = np.random.randn(n, n)
+ u, s, v = np.linalg.svd(m)
+ if rank is None:
+ rank = n
+ if rank > n:
+ rank = n
+ s = np.concatenate((np.maximum(s, eps)[0:rank], np.zeros(n-rank)))
+ return u.dot(np.diag(s)).dot(v)
+
+
+class TestVAMPEstimatorSelfConsistency(unittest.TestCase):
+ def test_full_rank(self):
+ self.do_test(20, 20, test_partial_fit=True)
+
+ def test_low_rank(self):
+ dim = 30
+ rank = 15
+ self.do_test(dim, rank, test_partial_fit=True)
+
+ def do_test(self, dim, rank, test_partial_fit=False):
+ # setup
+ N_frames = [123, 456, 789]
+ N_trajs = len(N_frames)
+ A = random_matrix(dim, rank)
+ trajs = []
+ mean = np.random.randn(dim)
+ for i in range(N_trajs):
+ # set up data
+ white = np.random.randn(N_frames[i], dim)
+ brown = np.cumsum(white, axis=0)
+ correlated = np.dot(brown, A)
+ trajs.append(correlated + mean)
+
+ # test
+ tau = 50
+ vamp = pyemma_api_vamp(trajs, lag=tau, scaling=None)
+ vamp.right = True
+
+ assert vamp.dimension() <= rank
+
+ atol = np.finfo(vamp.output_type()).eps*10.0
+ phi_trajs = [ sf[tau:, :] for sf in vamp.get_output() ]
+ phi = np.concatenate(phi_trajs)
+ mean_right = phi.sum(axis=0) / phi.shape[0]
+ cov_right = phi.T.dot(phi) / phi.shape[0]
+ np.testing.assert_allclose(mean_right, 0.0, atol=atol)
+ np.testing.assert_allclose(cov_right, np.eye(vamp.dimension()), atol=atol)
+
+ vamp.right = False
+ psi_trajs = [ sf[0:-tau, :] for sf in vamp.get_output() ]
+ psi = np.concatenate(psi_trajs)
+ mean_left = psi.sum(axis=0) / psi.shape[0]
+ cov_left = psi.T.dot(psi) / psi.shape[0]
+ np.testing.assert_allclose(mean_left, 0.0, atol=atol)
+ np.testing.assert_allclose(cov_left, np.eye(vamp.dimension()), atol=atol)
+
+ # compute correlation between left and right
+ assert phi.shape[0]==psi.shape[0]
+ C01_psi_phi = psi.T.dot(phi) / phi.shape[0]
+ n = max(C01_psi_phi.shape)
+ C01_psi_phi = C01_psi_phi[0:n,:][:, 0:n]
+ np.testing.assert_allclose(C01_psi_phi, np.diag(vamp.singular_values[0:vamp.dimension()]), atol=atol)
+
+ if test_partial_fit:
+ vamp2 = pyemma_api_vamp(lag=tau, scaling=None)
+ for t in trajs:
+ vamp2.partial_fit(t)
+
+ model_params = vamp._model.get_model_params()
+ model_params2 = vamp2._model.get_model_params()
+
+ atol = 1e-15
+ rtol = 1e-6
+
+ for n in model_params.keys():
+ if model_params[n] is not None and model_params2[n] is not None:
+ if n not in ('U', 'V'):
+ np.testing.assert_allclose(model_params[n], model_params2[n], rtol=rtol, atol=atol,
+ err_msg='failed for model param %s' % n)
+ else:
+ assert_allclose_ignore_phase(model_params[n], model_params2[n], atol=atol)
+
+ vamp2.singular_values # trigger diagonalization
+
+ vamp2.right = True
+ for t, ref in zip(trajs, phi_trajs):
+ assert_allclose_ignore_phase(vamp2.transform(t[tau:]), ref, rtol=rtol, atol=atol)
+
+ vamp2.right = False
+ for t, ref in zip(trajs, psi_trajs):
+ assert_allclose_ignore_phase(vamp2.transform(t[0:-tau]), ref, rtol=rtol, atol=atol)
+
+
+def generate(T, N_steps, s0=0):
+ dtraj = np.zeros(N_steps, dtype=int)
+ s = s0
+ T_cdf = T.cumsum(axis=1)
+ for t in range(N_steps):
+ dtraj[t] = s
+ s = np.searchsorted(T_cdf[s, :], np.random.rand())
+ return dtraj
+
+
+def assert_allclose_ignore_phase(A, B, atol, rtol=1e-5):
+ A = np.atleast_2d(A)
+ B = np.atleast_2d(B)
+ assert A.shape == B.shape
+ for i in range(B.shape[1]):
+ assert (np.allclose(A[:, i], B[:, i], atol=atol, rtol=rtol)
+ or np.allclose(A[:, i], -B[:, i], atol=atol, rtol=rtol))
+
+
+class TestVAMPModel(unittest.TestCase):
+ @classmethod
+ def setUpClass(cls):
+ N_steps = 10000
+ N_traj = 20
+ lag = 1
+ T = np.linalg.matrix_power(np.array([[0.7, 0.2, 0.1], [0.1, 0.8, 0.1], [0.1, 0.1, 0.8]]), lag)
+ dtrajs = [generate(T, N_steps) for _ in range(N_traj)]
+ p0 = np.zeros(3)
+ p1 = np.zeros(3)
+ trajs = []
+ for dtraj in dtrajs:
+ traj = np.zeros((N_steps, T.shape[0]))
+ traj[np.arange(len(dtraj)), dtraj] = 1.0
+ trajs.append(traj)
+ p0 += traj[:-lag, :].sum(axis=0)
+ p1 += traj[lag:, :].sum(axis=0)
+ vamp = pyemma_api_vamp(trajs, lag=lag, scaling=None, dim=1.0)
+ msm = estimate_markov_model(dtrajs, lag=lag, reversible=False)
+ cls.trajs = trajs
+ cls.dtrajs = dtrajs
+ cls.lag = lag
+ cls.msm = msm
+ cls.vamp = vamp
+ cls.p0 = p0 / p0.sum()
+ cls.p1 = p1 / p1.sum()
+ cls.atol = np.finfo(vamp.output_type()).eps*1000.0
+
+ def test_K_is_T(self):
+ m0 = self.vamp.model.mean_0
+ mt = self.vamp.model.mean_t
+ C0 = self.vamp.model.C00 + m0[:, np.newaxis]*m0[np.newaxis, :]
+ C1 = self.vamp.model.C0t + m0[:, np.newaxis]*mt[np.newaxis, :]
+ K = np.linalg.inv(C0).dot(C1)
+ np.testing.assert_allclose(K, self.msm.P, atol=1E-5)
+
+ Tsym = np.diag(self.p0 ** 0.5).dot(self.msm.P).dot(np.diag(self.p1 ** -0.5))
+ np.testing.assert_allclose(np.linalg.svd(Tsym)[1][1:], self.vamp.singular_values[0:2], atol=1E-7)
+
+ def test_singular_functions_against_MSM(self):
+ Tsym = np.diag(self.p0 ** 0.5).dot(self.msm.P).dot(np.diag(self.p1 ** -0.5))
+ Up, S, Vhp = np.linalg.svd(Tsym)
+ Vp = Vhp.T
+ U = Up * (self.p0 ** -0.5)[:, np.newaxis]
+ V = Vp * (self.p1 ** -0.5)[:, np.newaxis]
+ assert_allclose_ignore_phase(U[:, 0], np.ones(3), atol=1E-5)
+ assert_allclose_ignore_phase(V[:, 0], np.ones(3), atol=1E-5)
+ U = U[:, 1:]
+ V = V[:, 1:]
+ self.vamp.right = True
+ phi = self.vamp.transform(np.eye(3))
+ self.vamp.right = False
+ psi = self.vamp.transform(np.eye(3))
+ assert_allclose_ignore_phase(U, psi, atol=1E-5)
+ assert_allclose_ignore_phase(V, phi, atol=1E-5)
+ references_sf = [U.T.dot(np.diag(self.p0)).dot(np.linalg.matrix_power(self.msm.P, k*self.lag)).dot(V).T for k in
+ range(10-1)]
+ cktest = self.vamp.cktest(n_observables=2, mlags=10)
+ pred_sf = cktest.predictions
+ esti_sf = cktest.estimates
+ for e, p, r in zip(esti_sf[1:], pred_sf[1:], references_sf[1:]):
+ np.testing.assert_allclose(np.diag(p), np.diag(r), atol=1E-6)
+ np.testing.assert_allclose(np.abs(p), np.abs(r), atol=1E-6)
+
+ def test_CK_expectation_against_MSM(self):
+ obs = np.eye(3) # observe every state
+ cktest = self.vamp.cktest(observables=obs, statistics=None, mlags=4)
+ pred = cktest.predictions[1:]
+ est = cktest.estimates[1:]
+
+ for i, (est_, pred_) in enumerate(zip(est, pred)):
+ msm = estimate_markov_model(dtrajs=self.dtrajs, lag=self.lag*(i+1), reversible=False)
+ msm_esti = self.p0.T.dot(msm.P).dot(obs)
+ msm_pred = self.p0.T.dot(np.linalg.matrix_power(self.msm.P, (i+1))).dot(obs)
+ np.testing.assert_allclose(pred_, msm_pred, atol=self.atol)
+ np.testing.assert_allclose(est_, msm_esti, atol=self.atol)
+ np.testing.assert_allclose(est_, pred_, atol=0.006)
+
+ def test_CK_covariances_of_singular_functions(self):
+ cktest = self.vamp.cktest(n_observables=2, mlags=4) # auto
+ pred = cktest.predictions[1:]
+ est = cktest.estimates[1:]
+ error = np.max(np.abs(np.array(pred) - np.array(est))) / max(np.max(pred), np.max(est))
+ assert error < 0.05
+
+ def test_CK_covariances_against_MSM(self):
+ obs = np.eye(3) # observe every state
+ sta = np.eye(3) # restrict p0 to every state
+ cktest = self.vamp.cktest(observables=obs, statistics=sta, mlags=4, show_progress=True)
+ pred = cktest.predictions[1:]
+ est = cktest.estimates[1:]
+
+ for i, (est_, pred_) in enumerate(zip(est, pred)):
+ msm = estimate_markov_model(dtrajs=self.dtrajs, lag=self.lag*(i+1), reversible=False)
+ msm_esti = (self.p0 * sta).T.dot(msm.P).dot(obs).T
+ msm_pred = (self.p0 * sta).T.dot(np.linalg.matrix_power(self.msm.P, (i+1))).dot(obs).T
+ np.testing.assert_allclose(np.diag(pred_), np.diag(msm_pred), atol=self.atol)
+ np.testing.assert_allclose(np.diag(est_), np.diag(msm_esti), atol=self.atol)
+ np.testing.assert_allclose(np.diag(est_), np.diag(pred_), atol=0.006)
+
+ def test_self_score_with_MSM(self):
+ T = self.msm.P
+ Tadj = np.diag(1./self.p1).dot(T.T).dot(np.diag(self.p0))
+ NFro = np.trace(T.dot(Tadj))
+ s2 = self.vamp.score(score_method='VAMP2')
+ np.testing.assert_allclose(s2, NFro)
+
+ Tsym = np.diag(self.p0**0.5).dot(T).dot(np.diag(self.p1**-0.5))
+ Nnuc = np.linalg.norm(Tsym, ord='nuc')
+ s1 = self.vamp.score(score_method='VAMP1')
+ np.testing.assert_allclose(s1, Nnuc)
+
+ # TODO: check why this is not equal
+ sE = self.vamp.score(score_method='VAMPE')
+ np.testing.assert_allclose(sE, NFro) # see paper appendix H.2
+
+ def test_score_vs_MSM(self):
+ from pyemma.util.contexts import numpy_random_seed
+ with numpy_random_seed(32):
+ trajs_test, trajs_train = cvsplit_dtrajs(self.trajs)
+ with numpy_random_seed(32):
+ dtrajs_test, dtrajs_train = cvsplit_dtrajs(self.dtrajs)
+
+ methods = ('VAMP1', 'VAMP2', 'VAMPE')
+
+ for m in methods:
+ msm_train = estimate_markov_model(dtrajs=dtrajs_train, lag=self.lag, reversible=False)
+ score_msm = msm_train.score(dtrajs_test, score_method=m, score_k=None)
+
+ vamp_train = pyemma_api_vamp(data=trajs_train, lag=self.lag, dim=1.0)
+ score_vamp = vamp_train.score(test_data=trajs_test, score_method=m)
+
+ self.assertAlmostEqual(score_msm, score_vamp, places=2 if m == 'VAMPE' else 3, msg=m)
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/pyemma/coordinates/transform/__init__.py b/pyemma/coordinates/transform/__init__.py
index de8366d13..b7f976ceb 100644
--- a/pyemma/coordinates/transform/__init__.py
+++ b/pyemma/coordinates/transform/__init__.py
@@ -28,7 +28,11 @@
PCA - principal components
TICA - time independent components
+ VAMP - Variational approach for Markov processes
+ VAMPModel - Kinetic model from the Variational approach for Markov processes
+ VAMPChapmanKolmogorovValidator - Chapman Kolmogorov test for the Variational approach for Markov processes
"""
from .pca import *
from .tica import *
+from .vamp import *
diff --git a/pyemma/coordinates/transform/vamp.py b/pyemma/coordinates/transform/vamp.py
new file mode 100644
index 000000000..981e1080b
--- /dev/null
+++ b/pyemma/coordinates/transform/vamp.py
@@ -0,0 +1,1018 @@
+# This file is part of PyEMMA.
+#
+# Copyright (c) 2017 Computational Molecular Biology Group, Freie Universitaet Berlin (GER)
+#
+# PyEMMA is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+'''
+@author: paul, marscher, wu, noe
+'''
+
+from __future__ import absolute_import
+
+import numpy as np
+
+from pyemma._base.model import Model
+from pyemma._base.serialization.serialization import SerializableMixIn
+from pyemma.util.annotators import fix_docs
+from pyemma.util.types import ensure_ndarray_or_None, ensure_ndarray
+from pyemma._ext.variational.solvers.direct import spd_inv_sqrt
+from pyemma.coordinates.estimation.covariance import LaggedCovariance
+from pyemma.coordinates.data._base.transformer import StreamingEstimationTransformer
+from pyemma.msm.estimators.lagged_model_validators import LaggedModelValidator
+from pyemma.util.linalg import mdot
+
+import warnings
+
+__all__ = ['VAMP', 'VAMPModel', 'VAMPChapmanKolmogorovValidator']
+
+
+class VAMPModel(Model, SerializableMixIn):
+ __serialize_version = 0
+ __serialize_fields = ('_U', '_V', '_svd_performed')
+
+ def set_model_params(self, mean_0, mean_t, C00, Ctt, C0t, U, V, singular_values, cumvar, dim, epsilon):
+ self.mean_0 = mean_0
+ self.mean_t = mean_t
+ self.C00 = C00
+ self.Ctt = Ctt
+ self.C0t = C0t
+ self._svd_performed = False
+ self._U = U
+ self._V = V
+ self._singular_values = singular_values
+ self.cumvar = cumvar
+ self.dim = dim
+ self.epsilon = epsilon
+
+ @property
+ def U(self):
+ "Tranformation matrix that represents the linear map from mean-free feature space to the space of left singular functions."
+ if not self._svd_performed:
+ self._diagonalize()
+ return self._U
+
+ @property
+ def V(self):
+ "Tranformation matrix that represents the linear map from mean-free feature space to the space of right singular functions."
+ if not self._svd_performed:
+ self._diagonalize()
+ return self._V
+
+ @property
+ def singular_values(self):
+ "The singular values of the half-weighted Koopman matrix"
+ if not self._svd_performed:
+ self._diagonalize()
+ return self._singular_values
+
+ @property
+ def C00(self):
+ return self._C00
+
+ @C00.setter
+ def C00(self, val):
+ self._svd_performed = False
+ self._C00 = val
+
+ @property
+ def C0t(self):
+ return self._C0t
+
+ @C0t.setter
+ def C0t(self, val):
+ self._svd_performed = False
+ self._C0t = val
+
+ @property
+ def Ctt(self):
+ return self._Ctt
+
+ @Ctt.setter
+ def Ctt(self, val):
+ self._svd_performed = False
+ self._Ctt = val
+
+ def dimension(self):
+ """ output dimension """
+ if self.dim is None or (isinstance(self.dim, float) and self.dim == 1.0):
+ if hasattr(self, '_rank0'):
+ return min(self._rank0, self._rankt)
+ else:
+ raise RuntimeError('Requested dimension, but the dimension depends on the singular values of C00 and C11'
+ ' and the transformer has not yet been estimated. Call estimate() before.')
+ if isinstance(self.dim, float):
+ if hasattr(self, 'cumvar') and self.cumvar is not None:
+ return np.count_nonzero(self.cumvar >= self.dim)
+ else:
+ raise RuntimeError('Requested dimension, but the dimension depends on the cumulative variance and the '
+ 'transformer has not yet been estimated. Call estimate() before.')
+ else:
+ if hasattr(self, '_rank0'):
+ return np.min([self._rank0, self._rankt, self.dim])
+ else:
+ warnings.warn(
+ RuntimeWarning('Requested dimension, but the dimension depends on the singular values of C00 and C11'
+ ' and the transformer has not yet been estimated. Result is only an approximation.'))
+ return self.dim
+
+ def expectation(self, observables, statistics, lag_multiple=1, observables_mean_free=False, statistics_mean_free=False):
+ r"""Compute future expectation of observable or covariance using the approximated Koopman operator.
+
+ Parameters
+ ----------
+ observables : np.ndarray((input_dimension, n_observables))
+ Coefficients that express one or multiple observables in
+ the basis of the input features.
+
+ statistics : np.ndarray((input_dimension, n_statistics)), optional
+ Coefficients that express one or multiple statistics in
+ the basis of the input features.
+ This parameter can be None. In that case, this method
+ returns the future expectation value of the observable(s).
+
+ lag_multiple : int
+ If > 1, extrapolate to a multiple of the estimator's lag
+ time by assuming Markovianity of the approximated Koopman
+ operator.
+
+ observables_mean_free : bool, default=False
+ If true, coefficients in `observables` refer to the input
+ features with feature means removed.
+ If false, coefficients in `observables` refer to the
+ unmodified input features.
+
+ statistics_mean_free : bool, default=False
+ If true, coefficients in `statistics` refer to the input
+ features with feature means removed.
+ If false, coefficients in `statistics` refer to the
+ unmodified input features.
+
+ Notes
+ -----
+ A "future expectation" of a observable g is the average of g computed
+ over a time window that has the same total length as the input data
+ from which the Koopman operator was estimated but is shifted
+ by lag_multiple*tau time steps into the future (where tau is the lag
+ time).
+
+ It is computed with the equation:
+
+ .. math::
+
+ \mathbb{E}[g]_{\rho_{n}}=\mathbf{q}^{T}\mathbf{P}^{n-1}\mathbf{e}_{1}
+
+ where
+
+ .. math::
+
+ P_{ij}=\sigma_{i}\langle\psi_{i},\phi_{j}\rangle_{\rho_{1}}
+
+ and
+
+ .. math::
+
+ q_{i}=\langle g,\phi_{i}\rangle_{\rho_{1}}
+
+ and :math:`\mathbf{e}_{1}` is the first canonical unit vector.
+
+
+ A model prediction of time-lagged covariances between the
+ observable f and the statistic g at a lag-time of lag_multiple*tau
+ is computed with the equation:
+
+ .. math::
+
+ \mathrm{cov}[g,\,f;n\tau]=\mathbf{q}^{T}\mathbf{P}^{n-1}\boldsymbol{\Sigma}\mathbf{r}
+
+ where :math:`r_{i}=\langle\psi_{i},f\rangle_{\rho_{0}}` and
+ :math:`\boldsymbol{\Sigma}=\mathrm{diag(\boldsymbol{\sigma})}` .
+ """
+ # TODO: implement the case lag_multiple=0
+
+ dim = self.dimension()
+
+ S = np.diag(np.concatenate(([1.0], self.singular_values[0:dim])))
+ V = self.V[:, 0:dim]
+ U = self.U[:, 0:dim]
+ m_0 = self.mean_0
+ m_t = self.mean_t
+
+ assert lag_multiple >= 1, 'lag_multiple = 0 not implemented'
+
+ if lag_multiple == 1:
+ P = S
+ else:
+ p = np.zeros((dim + 1, dim + 1))
+ p[0, 0] = 1.0
+ p[1:, 0] = U.T.dot(m_t - m_0)
+ p[1:, 1:] = U.T.dot(self.Ctt).dot(V)
+ P = np.linalg.matrix_power(S.dot(p), lag_multiple - 1).dot(S)
+
+ Q = np.zeros((observables.shape[1], dim + 1))
+ if not observables_mean_free:
+ Q[:, 0] = observables.T.dot(m_t)
+ Q[:, 1:] = observables.T.dot(self.Ctt).dot(V)
+
+ if statistics is not None:
+ # compute covariance
+ R = np.zeros((statistics.shape[1], dim + 1))
+ if not statistics_mean_free:
+ R[:, 0] = statistics.T.dot(m_0)
+ R[:, 1:] = statistics.T.dot(self.C00).dot(U)
+
+ if statistics is not None:
+ # compute lagged covariance
+ return Q.dot(P).dot(R.T)
+ # TODO: discuss whether we want to return this or the transpose
+ # TODO: from MSMs one might expect to first index to refer to the statistics, here it is the other way round
+ else:
+ # compute future expectation
+ return Q.dot(P)[:, 0]
+
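
A usage sketch for expectation() (v is a fitted VAMP estimator, e.g. from the API example above; the indicator observables are illustrative):

    import numpy as np

    d = v.model.C00.shape[0]
    obs = np.eye(d)                  # one indicator observable per input feature
    # expected value of every observable after 3*tau, assuming Markovianity
    e = v.model.expectation(obs, statistics=None, lag_multiple=3)
    # predicted time-lagged covariances between the same functions at 3*tau
    c = v.model.expectation(obs, statistics=obs, lag_multiple=3)
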
+ def _diagonalize(self, scaling=None):
+ """Performs SVD on covariance matrices and save left, right singular vectors and values in the model.
+
+ Parameters
+ ----------
+ scaling : None or string, default=None
+ Scaling to be applied to the VAMP modes upon transformation
+ * None: no scaling will be applied, variance of the singular
+ functions is 1
+ * 'kinetic map' or 'km': singular functions are scaled by
+ singular value. Note that only the left singular functions
+ induce a kinetic map.
+ """
+
+ L0, self._rank0 = spd_inv_sqrt(self.C00, epsilon=self.epsilon, return_rank=True)
+ Lt, self._rankt = spd_inv_sqrt(self.Ctt, epsilon=self.epsilon, return_rank=True)
+ A = L0.T.dot(self.C0t).dot(Lt)
+
+ Uprime, s, Vprimeh = np.linalg.svd(A, compute_uv=True)
+ self._singular_values = s
+
+ # compute cumulative variance
+ cumvar = np.cumsum(s ** 2)
+ cumvar /= cumvar[-1]
+ self.cumvar = cumvar
+
+ self._L0 = L0
+ self._Lt = Lt
+
+ m = self.dimension()
+
+ U = L0.dot(Uprime[:, :m]) # U in the paper singular_vectors_left
+ V = Lt.dot(Vprimeh[:m, :].T) # V in the paper singular_vectors_right
+
+ # scale vectors
+ if scaling is None:
+ pass
+ elif scaling in ['km', 'kinetic map']:
+ U *= s[np.newaxis, 0:m]
+ else:
+ raise ValueError('unexpected value (%s) of "scaling"' % scaling)
+
+ self._U = U
+ self._V = V
+ self._svd_performed = True
+
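
The whitening-plus-SVD performed by _diagonalize above, written out in plain numpy on toy data (a sketch of the math, not the exact streaming estimation path):

    import numpy as np
    from pyemma._ext.variational.solvers.direct import spd_inv_sqrt

    rng = np.random.RandomState(0)
    X = np.cumsum(rng.randn(2000, 4), axis=0)     # toy trajectory
    tau = 10
    X0 = X[:-tau] - X[:-tau].mean(axis=0)         # mean-free instantaneous data
    Xt = X[tau:] - X[tau:].mean(axis=0)           # mean-free time-lagged data
    C00 = X0.T.dot(X0) / len(X0)
    C0t = X0.T.dot(Xt) / len(X0)
    Ctt = Xt.T.dot(Xt) / len(Xt)

    L0 = spd_inv_sqrt(C00, epsilon=1e-6)
    Lt = spd_inv_sqrt(Ctt, epsilon=1e-6)
    Kbar = L0.T.dot(C0t).dot(Lt)                  # half-weighted Koopman matrix
    Uprime, s, Vprimeh = np.linalg.svd(Kbar)      # s: singular values
    U, V = L0.dot(Uprime), Lt.dot(Vprimeh.T)      # maps from mean-free features
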
+ def score(self, test_model=None, score_method='VAMP2'):
+ """Compute the VAMP score for this model or the cross-validation score between self and a second model.
+
+ Parameters
+ ----------
+ test_model : VAMPModel, optional, default=None
+
+ If `test_model` is not None, this method computes the cross-validation score
+ between self and `test_model`. It is assumed that self was estimated from
+ the "training" data and `test_model` was estimated from the "test" data. The
+ score is computed for one realization of self and `test_model`. Estimation
+ of the average cross-validation score and partitioning of data into test and
+ training part is not performed by this method.
+
+ If `test_model` is None, this method computes the VAMP score for the model
+ contained in self.
+
+ score_method : str, optional, default='VAMP2'
+ Available scores are based on the variational approach for Markov processes [1]_:
+
+ * 'VAMP1' Sum of singular values of the half-weighted Koopman matrix [1]_ .
+ If the model is reversible, this is equal to the sum of
+ Koopman matrix eigenvalues, also called Rayleigh quotient [1]_.
+ * 'VAMP2' Sum of squared singular values of the half-weighted Koopman matrix [1]_ .
+ If the model is reversible, this is equal to the kinetic variance [2]_ .
+ * 'VAMPE' Approximation error of the estimated Koopman operator with respect to
+ the true Koopman operator up to an additive constant [1]_ .
+
+ Returns
+ -------
+ score : float
+ If `test_model` is not None, returns the cross-validation VAMP score between
+ self and `test_model`. Otherwise return the selected VAMP-score of self.
+
+ References
+ ----------
+ .. [1] Wu, H. and Noe, F. 2017. Variational approach for learning Markov processes from time series data.
+ arXiv:1707.04659v1
+ .. [2] Noe, F. and Clementi, C. 2015. Kinetic distance and kinetic maps from molecular dynamics simulation.
+ J. Chem. Theory. Comput. doi:10.1021/acs.jctc.5b00553
+ """
+ # TODO: implement for TICA too
+ if test_model is None:
+ test_model = self
+ Uk = self.U[:, 0:self.dimension()]
+ Vk = self.V[:, 0:self.dimension()]
+ res = None
+ if score_method == 'VAMP1' or score_method == 'VAMP2':
+ A = spd_inv_sqrt(Uk.T.dot(test_model.C00).dot(Uk))
+ B = Uk.T.dot(test_model.C0t).dot(Vk)
+ C = spd_inv_sqrt(Vk.T.dot(test_model.Ctt).dot(Vk))
+ ABC = mdot(A, B, C)
+ if score_method == 'VAMP1':
+ res = np.linalg.norm(ABC, ord='nuc')
+ elif score_method == 'VAMP2':
+ res = np.linalg.norm(ABC, ord='fro')**2
+ elif score_method == 'VAMPE':
+ Sk = np.diag(self.singular_values[0:self.dimension()])
+ res = np.trace(2.0 * mdot(Vk, Sk, Uk.T, test_model.C0t) - mdot(Vk, Sk, Uk.T, test_model.C00, Uk, Sk, Vk.T, test_model.Ctt))
+ else:
+ raise ValueError('"score" should be one of VAMP1, VAMP2 or VAMPE')
+ # add the contribution (+1) of the constant singular functions to the result
+ assert res is not None
+ return res + 1
+
+
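
A cross-validation sketch built on score(); the shuffle-split over trajectories is our choice for illustration, not prescribed by this PR:

    import numpy as np
    import pyemma.coordinates as coor

    def cv_vamp2(trajs, lag, n_splits=5):
        scores = []
        for _ in range(n_splits):
            idx = np.random.permutation(len(trajs))
            half = len(trajs) // 2
            train = [trajs[i] for i in idx[:half]]
            test = [trajs[i] for i in idx[half:]]
            v_train = coor.vamp(train, lag=lag)
            v_test = coor.vamp(test, lag=lag)
            scores.append(v_train.model.score(test_model=v_test.model,
                                              score_method='VAMP2'))
        return np.mean(scores)
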
+@fix_docs
+class VAMP(StreamingEstimationTransformer, SerializableMixIn):
+ r"""Variational approach for Markov processes (VAMP)"""
+
+ __serialize_version = 0
+
+ def describe(self):
+ return "[VAMP, lag = %i; max. output dim. = %s]" % (self._lag, str(self.dim))
+
+ def __init__(self, lag, dim=None, scaling=None, right=True, epsilon=1e-6,
+ stride=1, skip=0, ncov_max=float('inf')):
+ r""" Variational approach for Markov processes (VAMP) [1]_.
+
+ Parameters
+ ----------
+ lag : int
+ lag time
+ dim : float or int
+ Number of dimensions to keep:
+
+ * if dim is not set all available ranks are kept:
+ `n_components == min(n_samples, n_features)`
+ * if dim is an integer >= 1, this number specifies the number
+ of dimensions to keep.
+ * if dim is a float with ``0 < dim < 1``, select the number
+ of dimensions such that the amount of kinetic variance
+ that needs to be explained is greater than the percentage
+ specified by dim.
+ scaling : None or string
+ Scaling to be applied to the VAMP order parameters upon transformation
+
+ * None: no scaling will be applied, variance of the order parameters is 1
+ * 'kinetic map' or 'km': order parameters are scaled by their
+ singular values. Note that only the left singular functions induce a
+ kinetic map; therefore scaling='km' is only effective if `right` is False.
+ right : boolean
+ Whether to compute the right singular functions.
+ If `right==True`, `get_output()` will return the right singular
+ functions. Otherwise, `get_output()` will return the left singular
+ functions.
+ Beware that only `frames[tau:, :]` of each trajectory returned
+ by `get_output()` contain valid values of the right singular
+ functions. Conversely, only `frames[0:-tau, :]` of each
+ trajectory returned by `get_output()` contain valid values of
+ the left singular functions. The remaining frames can at
+ best be interpreted as an extrapolation.
+ epsilon : float
+ singular value cutoff. Singular values of :math:`C_{00}` and
+ :math:`C_{11}` with norms <= epsilon will be cut off. The remaining
+ number of singular values defines the size of the output.
+ stride: int, optional, default = 1
+ Use only every stride-th time step. By default, every time step is used.
+ skip : int, default=0
+ skip the first initial n frames per trajectory.
+ ncov_max : int, default=infinity
+ limit the memory usage of the algorithm from [3]_ to an amount that corresponds
+ to ncov_max additional copies of each correlation matrix
+
+ Notes
+ -----
+ VAMP is a method for dimensionality reduction of Markov processes.
+
+ The Koopman operator :math:`\mathcal{K}` is an integral operator
+ that describes conditional future expectation values. Let
+ :math:`p(\mathbf{x},\,\mathbf{y})` be the conditional probability
+ density of visiting an infinitesimal phase space volume around
+ point :math:`\mathbf{y}` at time :math:`t+\tau` given that the phase
+ space point :math:`\mathbf{x}` was visited at the earlier time
+ :math:`t`. Then the action of the Koopman operator on a function
+ :math:`f` can be written as follows:
+
+ .. math::
+
+ \mathcal{K}f=\int p(\mathbf{x},\,\mathbf{y})f(\mathbf{y})\,\mathrm{dy}=\mathbb{E}\left[f(\mathbf{x}_{t+\tau}\mid\mathbf{x}_{t}=\mathbf{x})\right]
+
+ The Koopman operator is defined without any reference to an
+ equilibrium distribution. Therefore it is well-defined in
+ situations where the dynamics is irreversible and/or non-stationary
+ such that no equilibrium distribution exists.
+
+ If we approximate :math:`f` by a linear superposition of ansatz
+ functions :math:`\boldsymbol{\chi}` of the conformational
+ degrees of freedom (features), the operator :math:`\mathcal{K}`
+ can be approximated by a (finite-dimensional) matrix :math:`\mathbf{K}`.
+
+ The approximation is computed as follows: From the time-dependent
+ input features :math:`\boldsymbol{\chi}(t)`, we compute the mean
+ :math:`\boldsymbol{\mu}_{0}` (:math:`\boldsymbol{\mu}_{1}`) from
+ all data excluding the last (first) :math:`\tau` steps of every
+ trajectory as follows:
+
+ .. math::
+
+ \boldsymbol{\mu}_{0} :=\frac{1}{T-\tau}\sum_{t=0}^{T-\tau}\boldsymbol{\chi}(t)
+
+ \boldsymbol{\mu}_{1} :=\frac{1}{T-\tau}\sum_{t=\tau}^{T}\boldsymbol{\chi}(t)
+
+ Next, we compute the instantaneous covariance matrices
+ :math:`\mathbf{C}_{00}` and :math:`\mathbf{C}_{11}` and the
+ time-lagged covariance matrix :math:`\mathbf{C}_{01}` as follows:
+
+ .. math::
+
+ \mathbf{C}_{00} :=\frac{1}{T-\tau}\sum_{t=0}^{T-\tau}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right]\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right]^{\top}
+
+ \mathbf{C}_{11} :=\frac{1}{T-\tau}\sum_{t=\tau}^{T}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{1}\right]\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{1}\right]^{\top}
+
+ \mathbf{C}_{01} :=\frac{1}{T-\tau}\sum_{t=0}^{T-\tau}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right]\left[\boldsymbol{\chi}(t+\tau)-\boldsymbol{\mu}_{1}\right]^{\top}
+
+ The Koopman matrix is then computed as follows:
+
+ .. math::
+
+ \mathbf{K}=\mathbf{C}_{00}^{-1}\mathbf{C}_{01}
+
+ It can be shown [1]_ that the leading singular functions of the
+ half-weighted Koopman matrix
+
+ .. math::
+
+ \bar{\mathbf{K}}:=\mathbf{C}_{00}^{-\frac{1}{2}}\mathbf{C}_{01}\mathbf{C}_{11}^{-\frac{1}{2}}
+
+ encode the best reduced dynamical model for the time series.
+
+ The singular functions can be computed by first performing the
+ singular value decomposition
+
+ .. math::
+
+ \bar{\mathbf{K}}=\mathbf{U}^{\prime}\mathbf{S}\mathbf{V}^{\prime\top}
+
+ and then mapping the input conformation to the left singular
+ functions :math:`\boldsymbol{\psi}` and right singular
+ functions :math:`\boldsymbol{\phi}` as follows:
+
+ .. math::
+
+ \boldsymbol{\psi}(t):=\mathbf{U}^{\prime\top}\mathbf{C}_{00}^{-\frac{1}{2}}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right]
+
+ \boldsymbol{\phi}(t):=\mathbf{V}^{\prime\top}\mathbf{C}_{11}^{-\frac{1}{2}}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{1}\right]
+
+
+ References
+ ----------
+ .. [1] Wu, H. and Noe, F. 2017. Variational approach for learning Markov processes from time series data.
+ arXiv:1707.04659v1
+ .. [2] Noe, F. and Clementi, C. 2015. Kinetic distance and kinetic maps from molecular dynamics simulation.
+ J. Chem. Theory. Comput. doi:10.1021/acs.jctc.5b00553
+ .. [3] Chan, T. F., Golub G. H., LeVeque R. J. 1979. Updating formulae and a pairwise algorithm for
+ computing sample variances. Technical Report STAN-CS-79-773, Department of Computer Science, Stanford University.
+ """
+ StreamingEstimationTransformer.__init__(self)
+
+ # empty dummy model instance
+ self._model = VAMPModel()
+ self.set_params(lag=lag, dim=dim, scaling=scaling, right=right,
+ epsilon=epsilon, stride=stride, skip=skip, ncov_max=ncov_max)
+ self._covar = None
+ self._model.update_model_params(dim=dim, epsilon=epsilon)
+
+ def _estimate(self, iterable, **kw):
+ self._covar = LaggedCovariance(c00=True, c0t=True, ctt=True, remove_data_mean=True, reversible=False,
+ lag=self.lag, bessel=False, stride=self.stride, skip=self.skip, weights=None,
+ ncov_max=self.ncov_max)
+ indim = iterable.dimension()
+
+ if isinstance(self.dim, int) and not self.dim <= indim:
+ raise RuntimeError("requested more output dimensions (%i) than dimension"
+ " of input data (%i)" % (self.dim, indim))
+
+ if self._logger_is_active(self._loglevel_DEBUG):
+ self._logger.debug("Running VAMP with tau=%i; Estimating two covariance matrices"
+ " with dimension (%i, %i)" % (self._lag, indim, indim))
+
+ self._covar.estimate(iterable, **kw)
+ self._model.update_model_params(mean_0=self._covar.mean,
+ mean_t=self._covar.mean_tau,
+ C00=self._covar.C00_,
+ C0t=self._covar.C0t_,
+ Ctt=self._covar.Ctt_)
+ self._diagonalize()
+
+ return self._model
+
+ def partial_fit(self, X):
+ """ incrementally update the covariances and mean.
+
+ Parameters
+ ----------
+ X: array, list of arrays, PyEMMA reader
+ input data.
+
+ Notes
+ -----
+ The projection matrix is only calculated upon its first access.
+ """
+ from pyemma.coordinates import source
+ iterable = source(X)
+
+ if isinstance(self.dim, int):
+ indim = iterable.dimension()
+ if not self.dim <= indim:
+ raise RuntimeError("requested more output dimensions (%i) than dimension"
+ " of input data (%i)" % (self.dim, indim))
+
+ if self._covar is None:
+ self._covar = LaggedCovariance(c00=True, c0t=True, ctt=True, remove_data_mean=True, reversible=False,
+ lag=self.lag, bessel=False, stride=self.stride, skip=self.skip, weights=None,
+ ncov_max=self.ncov_max)
+ self._covar.partial_fit(iterable)
+ self._model.update_model_params(mean_0=self._covar.mean, # TODO: inefficient, fixme
+ mean_t=self._covar.mean_tau,
+ C00=self._covar.C00_,
+ C0t=self._covar.C0t_,
+ Ctt=self._covar.Ctt_)
+
+ # self._used_data = self._covar._used_data
+ self._estimated = False
+
+ return self
+
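
partial_fit allows the covariances to be accumulated one trajectory at a time, e.g. while data is streamed from disk; a sketch with toy data:

    import numpy as np
    from pyemma.coordinates.transform.vamp import VAMP

    v = VAMP(lag=10)
    for _ in range(5):
        traj = np.cumsum(np.random.randn(500, 4), axis=0)
        v.partial_fit(traj)
    print(v.singular_values[:3])   # accessing the property triggers diagonalization
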
+ def _diagonalize(self):
+ # diagonalize with low rank approximation
+ self._logger.debug("diagonalize covariance matrices")
+ self.model._diagonalize(self.scaling)
+ self._logger.debug("finished diagonalization.")
+ self._estimated = True
+
+ def dimension(self):
+ return self._model.dimension()
+
+ def _transform_array(self, X):
+ r"""Projects the data onto the dominant singular functions.
+
+ Parameters
+ ----------
+ X : ndarray(n, m)
+ the input data
+
+ Returns
+ -------
+ Y : ndarray(n,)
+ the projected data
+ If `self.right` is True, projection will be on the right singular
+ functions. Otherwise, projection will be on the left singular
+ functions.
+ """
+ # TODO: in principle get_output should not return data for *all* frames!
+ # TODO: implement our own iterators? This would also include random access to be complete...
+ if self.right:
+ X_meanfree = X - self._model.mean_t
+ Y = np.dot(X_meanfree, self._model.V[:, 0:self.dimension()])
+ else:
+ X_meanfree = X - self._model.mean_0
+ Y = np.dot(X_meanfree, self._model.U[:, 0:self.dimension()])
+
+ return Y.astype(self.output_type())
+
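
As the class docstring warns, only part of each output trajectory is valid, depending on the `right` switch. A slicing sketch for a fitted estimator v with lag time tau:

    tau = v.lag
    v.right = True
    phi = [out[tau:] for out in v.get_output()]    # valid right singular functions
    v.right = False
    psi = [out[:-tau] for out in v.get_output()]   # valid left singular functions
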
+ @property
+ def singular_values(self):
+ r"""Singular values of the half-weighted Koopman matrix (usually denoted :math:`\sigma`)
+
+ Returns
+ -------
+ singular values: 1-D np.array
+ """
+ return self._model.singular_values
+
+ @property
+ def singular_vectors_right(self):
+ r"""Tranformation matrix that represents the linear map from feature space to the space of right singular functions.
+
+ Notes
+ -----
+ Right "singular vectors" V of the VAMP problem (equation 13 in [1]_), columnwise
+
+ Returns
+ -------
+ vectors: 2-D ndarray
+ Coefficients that express the right singular functions in the
+ basis of mean-free input features.
+
+ References
+ ----------
+ .. [1] Wu, H. and Noe, F. 2017. Variational approach for learning Markov processes from time series data.
+ arXiv:1707.04659v1
+ """
+ return self._model.V
+
+ @property
+ def singular_vectors_left(self):
+ r"""Tranformation matrix that represents the linear map from feature space to the space of left singular functions.
+
+ Notes
+ -----
+ Left "singular vectors" U of the VAMP problem (equation 13 in [1]_), columnwise
+
+ Returns
+ -------
+ vectors: 2-D ndarray
+ Coefficients that express the left singular functions in the
+ basis of mean-free input features.
+
+ References
+ ----------
+ .. [1] Wu, H. and Noe, F. 2017. Variational approach for learning Markov processes from time series data.
+ arXiv:1707.04659v1
+ """
+ return self._model.U
+
+ @property
+ def cumvar(self):
+ r"""Cumulative sum of the squared and normalized singular values
+
+ Returns
+ -------
+ cumvar: 1D np.array
+ """
+ return self._model.cumvar
+
+ @property
+ def show_progress(self):
+ if self._covar is None:
+ return False
+ else:
+ return self._covar.show_progress
+
+ @show_progress.setter
+ def show_progress(self, value):
+ if self._covar is not None:
+ self._covar.show_progress = value
+
+ def expectation(self, observables, statistics, lag_multiple=1, observables_mean_free=False,
+ statistics_mean_free=False):
+ r"""Compute future expectation of observable or covariance using the approximated Koopman operator.
+
+ Parameters
+ ----------
+ observables : np.ndarray((input_dimension, n_observables))
+ Coefficients that express one or multiple observables in
+ the basis of the input features.
+
+ statistics : np.ndarray((input_dimension, n_statistics)), optional
+ Coefficients that express one or multiple statistics in
+ the basis of the input features.
+ This parameter can be None. In that case, this method
+ returns the future expectation value of the observable(s).
+
+ lag_multiple : int
+ If > 1, extrapolate to a multiple of the estimator's lag
+ time by assuming Markovianity of the approximated Koopman
+ operator.
+
+ observables_mean_free : bool, default=False
+ If true, coefficients in `observables` refer to the input
+ features with feature means removed.
+ If false, coefficients in `observables` refer to the
+ unmodified input features.
+
+ statistics_mean_free : bool, default=False
+ If true, coefficients in `statistics` refer to the input
+ features with feature means removed.
+ If false, coefficients in `statistics` refer to the
+ unmodified input features.
+
+ Notes
+ -----
+ A "future expectation" of a observable g is the average of g computed
+ over a time window that has the same total length as the input data
+ from which the Koopman operator was estimated but is shifted
+ by lag_multiple*tau time steps into the future (where tau is the lag
+ time).
+
+ It is computed with the equation:
+
+ .. math::
+
+ \mathbb{E}[g]_{\rho_{n}}=\mathbf{q}^{T}\mathbf{P}^{n-1}\mathbf{e}_{1}
+
+ where
+
+ .. math::
+
+ P_{ij}=\sigma_{i}\langle\psi_{i},\phi_{j}\rangle_{\rho_{1}}
+
+ and
+
+ .. math::
+
+ q_{i}=\langle g,\phi_{i}\rangle_{\rho_{1}}
+
+ and :math:`\mathbf{e}_{1}` is the first canonical unit vector.
+
+
+ A model prediction of time-lagged covariances between the
+ observable g and the statistic f at a lag-time of lag_multiple*tau
+ is computed with the equation:
+
+ .. math::
+
+ \mathrm{cov}[g,\,f;n\tau]=\mathbf{q}^{T}\mathbf{P}^{n-1}\boldsymbol{\Sigma}\mathbf{r}
+
+ where :math:`r_{i}=\langle\psi_{i},f\rangle_{\rho_{0}}` and
+ :math:`\boldsymbol{\Sigma}=\mathrm{diag}(\boldsymbol{\sigma})`.
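+
+ Examples
+ --------
+ A minimal sketch, assuming `est` is an estimated VAMP object,
+ `n_features` is the dimension of the input features and `obs`
+ holds the coefficients of one observable in that basis:
+
+ >>> import numpy as np # doctest: +SKIP
+ >>> obs = np.random.randn(n_features, 1) # doctest: +SKIP
+ >>> est.expectation(obs, None, lag_multiple=2) # doctest: +SKIP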
+ """
+ return self._model.expectation(observables, statistics, lag_multiple=lag_multiple,
+ statistics_mean_free=statistics_mean_free,
+ observables_mean_free=observables_mean_free)
+
+ def cktest(self, n_observables=None, observables='phi', statistics='psi', mlags=10, n_jobs=1, show_progress=True,
+ iterable=None):
+ r"""Do the Chapman-Kolmogorov test by computing predictions for higher lag times and by performing estimations at higher lag times.
+
+ Notes
+ -----
+
+ This method computes two sets of time-lagged covariance matrices
+
+ * estimates at higher lag times :
+
+ .. math::
+
+ \left\langle \mathbf{K}(n\tau)g_{i},f_{j}\right\rangle_{\rho_{0}}
+
+ where :math:`\rho_{0}` is the empirical distribution implicitly defined
+ by all data points from time steps 0 to T-tau in all trajectories,
+ :math:`\mathbf{K}(n\tau)` is a rank-reduced Koopman matrix estimated
+ at the lag-time n*tau and g and f are some functions of the data.
+ Rank-reduction of the Koopman matrix is controlled by the `dim`
+ parameter of :func:`pyemma.coordinates.vamp`.
+
+ * predictions at higher lag times :
+
+ .. math::
+
+ \left\langle \mathbf{K}^{n}(\tau)g_{i},f_{j}\right\rangle_{\rho_{0}}
+
+ where :math:`\mathbf{K}^{n}` is the n'th power of the rank-reduced
+ Koopman matrix contained in self.
+
+
+ The Chapman-Kolmogorov test consists of comparing the predictions
+ to the estimates.
+
+ Parameters
+ ----------
+ n_observables : int, optional, default=None
+ Limit the number of default observables (and of default statistics)
+ to this number.
+ Only used if `observables` is 'phi' or `statistics` is 'psi'.
+
+ observables : np.ndarray((input_dimension, n_observables)) or 'phi'
+ Coefficients that express one or multiple observables :math:`g`
+ in the basis of the input features.
+ This parameter can be 'phi'. In that case, the dominant
+ right singular functions of the Koopman operator estimated
+ at the smallest lag time are used as default observables.
+
+ statistics : np.ndarray((input_dimension, n_statistics)) or 'psi'
+ Coefficients that express one or multiple statistics :math:`f`
+ in the basis of the input features.
+ This parameter can be 'psi'. In that case, the dominant
+ left singular functions of the Koopman operator estimated
+ at the smallest lag time are used as default statistics.
+
+ mlags : int or int-array, default=10
+ multiples of lag times for testing the Model, e.g. range(10).
+ A single int will trigger a range, i.e. mlags=10 maps to
+ mlags=range(10).
+ Note that you need to be able to do a model prediction for each
+ of these lag time multiples, e.g. the value 0 only makes sense
+ if model.expectation(lag_multiple=0) will work.
+
+ n_jobs : int, default=1
+ how many jobs to use during calculation
+
+ show_progress : bool, default=True
+ Show progressbars for calculation?
+
+ iterable : any data format that `pyemma.coordinates.vamp()` accepts as input, optional
+ If `iterable` is None, the same data source with which VAMP
+ was initialized will be used for all estimations.
+ Otherwise, all estimates (not predictions) from data will be computed
+ from the data contained in `iterable`.
+
+ Returns
+ -------
+ vckv : :class:`VAMPChapmanKolmogorovValidator`
+ Contains the estimated and the predicted covariance matrices.
+ The object can be plotted with :func:`pyemma.plots.plot_cktest` with the option `y01=False`.
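+
+ Examples
+ --------
+ A minimal usage sketch, assuming `data` is any input accepted by
+ :func:`pyemma.coordinates.vamp`:
+
+ >>> from pyemma.coordinates import vamp # doctest: +SKIP
+ >>> est = vamp(data, lag=10, dim=2) # doctest: +SKIP
+ >>> cktest = est.cktest(mlags=5) # doctest: +SKIP
+ >>> from pyemma.plots import plot_cktest # doctest: +SKIP
+ >>> plot_cktest(cktest, y01=False) # doctest: +SKIP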
+ """
+ if n_observables is not None:
+ if n_observables > self.dimension():
+ warnings.warn('Selected singular functions as observables but dimension '
+ 'is lower than requested number of observables.')
+ n_observables = self.dimension()
+ else:
+ n_observables = self.dimension()
+
+ if isinstance(observables, str) and observables == 'phi':
+ observables = self.singular_vectors_right[:, 0:n_observables]
+ observables_mean_free = True
+ else:
+ ensure_ndarray(observables, ndim=2)
+ observables_mean_free = False
+
+ if isinstance(statistics, str) and statistics == 'psi':
+ statistics = self.singular_vectors_left[:, 0:n_observables]
+ statistics_mean_free = True
+ else:
+ ensure_ndarray_or_None(statistics, ndim=2)
+ statistics_mean_free = False
+
+ ck = VAMPChapmanKolmogorovValidator(self, self, observables, statistics, observables_mean_free,
+ statistics_mean_free, mlags=mlags, n_jobs=n_jobs,
+ show_progress=show_progress)
+
+ if iterable is None:
+ iterable = self.data_producer
+
+ ck.estimate(iterable)
+ return ck
+
+ def score(self, test_data=None, score_method='VAMP2'):
+ """Compute the VAMP score for this model or the cross-validation score between self and a second model estimated form different data.
+
+ Parameters
+ ----------
+ test_data : any data format that `pyemma.coordinates.vamp()` accepts as input
+
+ If `test_data` is not None, this method computes the cross-validation score
+ between self and a VAMP model estimated from `test_data`. It is assumed that
+ self was estimated from the "training" data and `test_data` is the test data.
+ The score is computed for one realization of self and `test_data`. Estimation
+ of the average cross-validation score and partitioning of data into test and
+ training part is not performed by this method.
+
+ If `test_data` is None, this method computes the VAMP score for the model
+ contained in self.
+
+ The model that is estimated from `test_data` will inherit all hyperparameters
+ from self.
+
+ score_method : str, optional, default='VAMP2'
+ Available scores are based on the variational approach for Markov processes [1]_:
+
+ * 'VAMP1' Sum of singular values of the half-weighted Koopman matrix [1]_ .
+ If the model is reversible, this is equal to the sum of
+ Koopman matrix eigenvalues, also called Rayleigh quotient [1]_.
+ * 'VAMP2' Sum of squared singular values of the half-weighted Koopman matrix [1]_ .
+ If the model is reversible, this is equal to the kinetic variance [2]_ .
+ * 'VAMPE' Approximation error of the estimated Koopman operator with respect to
+ the true Koopman operator up to an additive constant [1]_ .
+
+ Returns
+ -------
+ score : float
+ If `test_data` is not None, returns the cross-validation VAMP score between
+ self and the model estimated from `test_data`. Otherwise, returns the selected
+ VAMP-score of self.
+
+ References
+ ----------
+ .. [1] Wu, H. and Noe, F. 2017. Variational approach for learning Markov processes from time series data.
+ arXiv:1707.04659v1
+ .. [2] Noe, F. and Clementi, C. 2015. Kinetic distance and kinetic maps from molecular dynamics simulation.
+ J. Chem. Theory. Comput. doi:10.1021/acs.jctc.5b00553
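+
+ Examples
+ --------
+ A minimal sketch of a simple train/test split, assuming
+ `train_data` and `test_data` are inputs accepted by
+ :func:`pyemma.coordinates.vamp`:
+
+ >>> from pyemma.coordinates import vamp # doctest: +SKIP
+ >>> est = vamp(train_data, lag=10) # doctest: +SKIP
+ >>> est.score(test_data=test_data, score_method='VAMP2') # doctest: +SKIP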
+ """
+ from pyemma._ext.sklearn.base import clone as clone_estimator
+ est = clone_estimator(self)
+
+ if test_data is None:
+ return self.model.score(None, score_method=score_method)
+ else:
+ est.estimate(test_data)
+ return self.model.score(est.model, score_method=score_method)
+
+
+class VAMPChapmanKolmogorovValidator(LaggedModelValidator):
+ __serialize_version = 0
+ __serialize_fields = ('nsets', 'statistics', 'observables', 'observables_mean_free', 'statistics_mean_free')
+
+ def __init__(self, model, estimator, observables, statistics, observables_mean_free, statistics_mean_free,
+ mlags=10, n_jobs=1, show_progress=True):
+ r"""
+ Note
+ ----
+ It is recommended that you create this object by calling the
+ `cktest` method of a VAMP object created with
+ :func:`pyemma.coordinates.vamp`.
+
+ Parameters
+ ----------
+ model : Model
+ Model with the smallest lag time. Is used to make predictions
+ for larger lag times.
+
+ estimator : Estimator
+ Parametrized Estimator that has produced the model.
+ Is used as a prototype for estimating models at higher lag times.
+
+ observables : np.ndarray((input_dimension, n_observables))
+ Coefficients that express one or multiple observables in
+ the basis of the input features.
+
+ statistics : np.ndarray((input_dimension, n_statistics))
+ Coefficients that express one or multiple statistics in
+ the basis of the input features.
+
+ observables_mean_free : bool, default=False
+ If true, coefficients in `observables` refer to the input
+ features with feature means removed.
+ If false, coefficients in `observables` refer to the
+ unmodified input features.
+
+ statistics_mean_free : bool, default=False
+ If true, coefficients in `statistics` refer to the input
+ features with feature means removed.
+ If false, coefficients in `statistics` refer to the
+ unmodified input features.
+
+ mlags : int or int-array, default=10
+ multiples of lag times for testing the Model, e.g. range(10).
+ A single int will trigger a range, i.e. mlags=10 maps to
+ mlags=range(10).
+ Note that you need to be able to do a model prediction for each
+ of these lag time multiples, e.g. the value 0 only makes sense
+ if model.expectation(lag_multiple=0) will work.
+
+ n_jobs : int, default=1
+ how many jobs to use during calculation
+
+ show_progress : bool, default=True
+ Show progressbars for calculation?
+
+ Notes
+ -----
+ The object can be plotted with :func:`pyemma.plots.plot_cktest`
+ with the option `y01=False`.
+ """
+ LaggedModelValidator.__init__(self, model, estimator, mlags=mlags,
+ n_jobs=n_jobs, show_progress=show_progress)
+ self.statistics = statistics
+ self.observables = observables
+ self.observables_mean_free = observables_mean_free
+ self.statistics_mean_free = statistics_mean_free
+ if self.statistics is not None:
+ self.nsets = min(self.observables.shape[1], self.statistics.shape[1])
+
+ def _compute_observables(self, model, estimator, mlag=1):
+ # for lag time 0 we return a matrix of nan, until the correct solution is implemented
+ if mlag == 0 or model is None:
+ if self.statistics is None:
+ return np.full(self.observables.shape[1], np.nan)
+ else:
+ return np.full((self.observables.shape[1], self.statistics.shape[1]), np.nan)
+ else:
+ return model.expectation(statistics=self.statistics, observables=self.observables, lag_multiple=mlag,
+ statistics_mean_free=self.statistics_mean_free,
+ observables_mean_free=self.observables_mean_free)
+
+ def _compute_observables_conf(self, model, estimator, mlag=1):
+ raise NotImplementedError('estimation of confidence intervals not yet implemented for VAMP')
diff --git a/pyemma/msm/estimators/__init__.py b/pyemma/msm/estimators/__init__.py
index b562a3a9e..56b52eb53 100644
--- a/pyemma/msm/estimators/__init__.py
+++ b/pyemma/msm/estimators/__init__.py
@@ -15,13 +15,13 @@
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
-
from __future__ import absolute_import
+
__author__ = 'noe'
from .maximum_likelihood_msm import MaximumLikelihoodMSM
from .maximum_likelihood_msm import OOMReweightedMSM
-from .maximum_likelihood_msm import AugmentedMarkovModel
+from .maximum_likelihood_msm import AugmentedMarkovModel
from .bayesian_msm import BayesianMSM
from .maximum_likelihood_hmsm import MaximumLikelihoodHMSM
from .bayesian_hmsm import BayesianHMSM
diff --git a/pyemma/msm/estimators/lagged_model_validators.py b/pyemma/msm/estimators/lagged_model_validators.py
index a39b3c141..da45352cf 100644
--- a/pyemma/msm/estimators/lagged_model_validators.py
+++ b/pyemma/msm/estimators/lagged_model_validators.py
@@ -17,7 +17,7 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import absolute_import
-
+from six.moves import range
import math
import numpy as np
@@ -81,7 +81,10 @@ def __init__(self, model, estimator, mlags=None, conf=0.95, err_est=False,
self.test_estimator = estimator
# set mlags
- maxlength = np.max([len(dtraj) for dtraj in estimator.discrete_trajectories_full])
+ try:
+ maxlength = np.max([len(dtraj) for dtraj in estimator.discrete_trajectories_full])
+ except AttributeError:
+ maxlength = np.max(estimator.trajectory_lengths())
maxmlag = int(math.floor(maxlength / estimator.lag))
if mlags is None:
mlags = maxmlag
diff --git a/pyemma/msm/estimators/maximum_likelihood_hmsm.py b/pyemma/msm/estimators/maximum_likelihood_hmsm.py
index d659e1d3c..5e77ab21c 100644
--- a/pyemma/msm/estimators/maximum_likelihood_hmsm.py
+++ b/pyemma/msm/estimators/maximum_likelihood_hmsm.py
@@ -17,7 +17,7 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import absolute_import
-#
+from six.moves import range
from pyemma.util.annotators import alias, aliased, fix_docs
import numpy as _np
diff --git a/pyemma/msm/estimators/maximum_likelihood_msm.py b/pyemma/msm/estimators/maximum_likelihood_msm.py
index e68b56dd3..3120b1141 100644
--- a/pyemma/msm/estimators/maximum_likelihood_msm.py
+++ b/pyemma/msm/estimators/maximum_likelihood_msm.py
@@ -228,7 +228,7 @@ def score(self, dtrajs, score_method=None, score_k=None):
score_method : str
Overwrite scoring method if desired. If `None`, the estimators scoring
method will be used. See __init__ for documentation.
- score_k : str
+ score_k : int or None
Overwrite scoring rank if desired. If `None`, the estimators scoring
rank will be used. See __init__ for documentation.
score_method : str, optional, default='VAMP2'
diff --git a/pyemma/util/_config.py b/pyemma/util/_config.py
index 8bfe25849..86dac0225 100644
--- a/pyemma/util/_config.py
+++ b/pyemma/util/_config.py
@@ -17,6 +17,7 @@
from __future__ import absolute_import, print_function
+import six
from six.moves.configparser import ConfigParser
import os
import shutil
@@ -32,6 +33,10 @@
class ReadConfigException(Exception):
pass
+if six.PY2:
+ class NotADirectoryError(Exception):
+ pass
+
__all__ = ('Config', )
@@ -172,10 +177,10 @@ def cfg_dir(self, pyemma_cfg_dir):
if not os.path.exists(pyemma_cfg_dir):
try:
mkdir_p(pyemma_cfg_dir)
- except EnvironmentError:
- raise ConfigDirectoryException("could not create configuration directory '%s'" % pyemma_cfg_dir)
except NotADirectoryError: # on Python 3
raise ConfigDirectoryException("pyemma cfg dir (%s) is not a directory" % pyemma_cfg_dir)
+ except EnvironmentError:
+ raise ConfigDirectoryException("could not create configuration directory '%s'" % pyemma_cfg_dir)
if not os.path.isdir(pyemma_cfg_dir):
raise ConfigDirectoryException("%s is no valid directory" % pyemma_cfg_dir)
diff --git a/pyemma/util/annotators.py b/pyemma/util/annotators.py
index 5843d4fc9..ecbe05f37 100644
--- a/pyemma/util/annotators.py
+++ b/pyemma/util/annotators.py
@@ -28,7 +28,6 @@
'deprecated',
'shortcut',
'fix_docs',
- 'estimation_required',
]
diff --git a/pyemma/util/types.py b/pyemma/util/types.py
index ca3957134..65a0834f3 100644
--- a/pyemma/util/types.py
+++ b/pyemma/util/types.py
@@ -27,6 +27,8 @@
import numbers
import collections
+from six import string_types
+
# ======================================================================================================================
# BASIC TYPE CHECKS
# ======================================================================================================================
@@ -137,7 +139,7 @@ def is_float_array(l):
return False
def is_string(s):
- return isinstance(s, str)
+ return isinstance(s, string_types)
def is_iterable(I):
return isinstance(I, collections.Iterable)
@@ -147,7 +149,7 @@ def is_list(S):
return isinstance(S, (list, tuple))
def is_list_of_string(S):
- return isinstance(S, (list, tuple)) and (all(isinstance(s, str) for s in S))
+ return isinstance(S, (list, tuple)) and (all(isinstance(s, string_types) for s in S))
def ensure_dtraj(dtraj):
r"""Makes sure that dtraj is a discrete trajectory (array of int)
@@ -171,8 +173,8 @@ def ensure_dtraj_list(dtrajs):
if is_list_of_int(dtrajs):
return [np.array(dtrajs, dtype=int)]
else:
- for i in range(len(dtrajs)):
- dtrajs[i] = ensure_dtraj(dtrajs[i])
+ for i, dtraj in enumerate(dtrajs):
+ dtrajs[i] = ensure_dtraj(dtraj)
return dtrajs
else:
return [ensure_dtraj(dtrajs)]
@@ -476,8 +478,8 @@ def ensure_traj_list(trajs):
return [np.array(trajs)[:,None]]
else:
res = []
- for i in range(len(trajs)):
- res.append(ensure_traj(trajs[i]))
+ for traj in trajs:
+ res.append(ensure_traj(traj))
return res
else:
# looks like this is one trajectory
diff --git a/setup.py b/setup.py
index b2f41c2ce..b201bad95 100755
--- a/setup.py
+++ b/setup.py
@@ -59,6 +59,7 @@
Operating System :: MacOS :: MacOS X
Operating System :: POSIX
Operating System :: Microsoft :: Windows
+Programming Language :: Python :: 2.7
Programming Language :: Python :: 3
Topic :: Scientific/Engineering :: Bio-Informatics
Topic :: Scientific/Engineering :: Chemistry
@@ -66,7 +67,12 @@
Topic :: Scientific/Engineering :: Physics
"""
-
+from setup_util import lazy_cythonize
+try:
+ from setuptools import setup, Extension, find_packages
+except ImportError as ie:
+ print("PyEMMA requires setuptools. Please install it with conda or pip.")
+ sys.exit(1)
###############################################################################
# Extensions