diff --git a/devtools/ci/travis/install_miniconda.sh b/devtools/ci/travis/install_miniconda.sh index ace5acd15..76b5ac968 100755 --- a/devtools/ci/travis/install_miniconda.sh +++ b/devtools/ci/travis/install_miniconda.sh @@ -30,5 +30,5 @@ else # if it does not exist, we need to install miniconda fi # we want to have an up to date conda-build. -conda install conda-build=3.2 +conda install conda-build=3 conda info -a # for debugging diff --git a/devtools/conda-recipe/meta.yaml b/devtools/conda-recipe/meta.yaml index 6095c74dd..e8f26ef92 100644 --- a/devtools/conda-recipe/meta.yaml +++ b/devtools/conda-recipe/meta.yaml @@ -24,7 +24,7 @@ requirements: - numpy 1.9.* # [not (win and (py35 or py36))] - numpy 1.9.* # [win and py35] - numpy 1.11.* # [win and py36] - - python >=3 + - python - scipy - setuptools - gcc # [ not win ] @@ -42,10 +42,11 @@ requirements: - numpy >=1.11,<1.14 # [win and py36] - pathos - psutil >3.1 - - python >=3 + - python - pyyaml - scipy - setuptools + - six >=1.10 - thermotools >=0.2.6 - tqdm diff --git a/devtools/conda-recipe/run_test.py b/devtools/conda-recipe/run_test.py index 71bef7209..dcece6801 100644 --- a/devtools/conda-recipe/run_test.py +++ b/devtools/conda-recipe/run_test.py @@ -9,7 +9,9 @@ # where to write junit xml junit_xml = os.path.join(os.getenv('CIRCLE_TEST_REPORTS', os.path.expanduser('~')), 'reports', 'junit.xml') -os.makedirs(os.path.dirname(junit_xml), exist_ok=True) +target_dir = os.path.dirname(junit_xml) +if not os.path.exists(target_dir): + os.makedirs(target_dir) print('junit destination:', junit_xml) njobs_args = '-p no:xdist' if os.getenv('TRAVIS') else '-n2' diff --git a/doc/source/CHANGELOG.rst b/doc/source/CHANGELOG.rst index 35a82c182..22d0e1d7c 100644 --- a/doc/source/CHANGELOG.rst +++ b/doc/source/CHANGELOG.rst @@ -5,7 +5,7 @@ Changelog ---------------- As of this version the usage of Python 2.7 is officially deprecated. Please upgrade -your Python installation to at least version 3.5. +your Python installation to at least version 3.5 to receive future updates. **New features**: @@ -13,11 +13,12 @@ your Python installation to at least version 3.5. data into estimation of Markov models from molecular simulations. The method is described in [1]. #1111 - msm: Added mincount_connectivity argument to MSM estimators. This option makes it possible to omit counts below a given threshold. #1106 -- coodinates: selection based features allow alignment to a reference structure. #1184 +- coordinates: selection based features allow alignment to a reference structure. #1184 - coordinates: two new center of mass features: ResidueCOMFeature() and GroupCOMFeature() - coordinates: new configuration variable 'default_chunksize' can be set to limit the size of a fragment extracted per iteration from a data source. This is invariant to the dimension of data sets. #1190 - datasets: added Prinz potential (quadwell). #1226 +- coordinates: added VAMP estimator.
#1237 - References: diff --git a/pyemma/_base/estimator.py b/pyemma/_base/estimator.py index 88564d92f..64a3d1b68 100644 --- a/pyemma/_base/estimator.py +++ b/pyemma/_base/estimator.py @@ -299,7 +299,8 @@ def estimate_param_scan(estimator, X, param_sets, evaluate=None, evaluate_args=N if evaluate is not None and evaluate_args is not None and len(evaluate) != len(evaluate_args): raise ValueError("length mismatch: evaluate ({}) and evaluate_args ({})".format(len(evaluate), len(evaluate_args))) - if progress_reporter is not None: + show_progress = progress_reporter is not None and show_progress + if show_progress: progress_reporter._progress_register(len(estimators), stage=0, description="estimating %s" % str(estimator.__class__.__name__)) @@ -317,8 +318,7 @@ def estimate_param_scan(estimator, X, param_sets, evaluate=None, evaluate_args=N from pathos.multiprocessing import Pool as Parallel pool = Parallel(processes=n_jobs) args = list(task_iter) - if progress_reporter is not None: - progress_reporter._progress_register(len(estimators), stage=0, description="estimating %s" % str(estimator.__class__.__name__)) + if show_progress: from pyemma._base.model import SampledModel for a in args: if isinstance(a[0], SampledModel): @@ -352,7 +352,7 @@ def error_callback(*args, **kw): estimators[0].logger.debug('estimating %s with n_jobs=1 because of the setting or ' 'you do not have a POSIX system', estimator) res = [] - if progress_reporter is not None: + if show_progress: from pyemma._base.model import SampledModel if isinstance(estimator, SampledModel): for e in estimators: @@ -361,10 +361,10 @@ for estimator, param_set in zip(estimators, param_sets): res.append(_estimate_param_scan_worker(estimator, param_set, X, evaluate, evaluate_args, failfast, return_exceptions)) - if progress_reporter is not None and show_progress: + if show_progress: progress_reporter._progress_update(1, stage=0) - if progress_reporter is not None and show_progress: + if show_progress: progress_reporter._progress_force_finish(0) # done diff --git a/pyemma/_ext/variational/solvers/direct.py b/pyemma/_ext/variational/solvers/direct.py index db442aedf..d3c2be57b 100644 --- a/pyemma/_ext/variational/solvers/direct.py +++ b/pyemma/_ext/variational/solvers/direct.py @@ -125,7 +125,7 @@ def spd_inv(W, epsilon=1e-10, method='QR', canonical_signs=False): return Winv -def spd_inv_sqrt(W, epsilon=1e-10, method='QR', canonical_signs=False): +def spd_inv_sqrt(W, epsilon=1e-10, method='QR', canonical_signs=False, return_rank=False): """ Computes :math:`W^{-1/2}` of symmetric positive-definite matrix :math:`W`. @@ -153,14 +153,18 @@ def spd_inv_sqrt(W, epsilon=1e-10, method='QR', canonical_signs=False): Matrix :math:`L` from the decomposition :math:`W^{-1} = L L^T`.
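+    rank : int
+        Rank of W; returned only if `return_rank` is True.
+
+    Examples
+    --------
+    A minimal sketch of the new `return_rank` flag (illustrative values; assumes a
+    well-conditioned input, so no eigenvalue is truncated by `epsilon`):
+
+    >>> import numpy as np
+    >>> W = np.array([[2.0, 0.5], [0.5, 1.0]])  # small SPD matrix
+    >>> L, rank = spd_inv_sqrt(W, return_rank=True)
+    >>> np.allclose(L.dot(L), np.linalg.inv(W))  # W^{-1/2} W^{-1/2} == W^{-1}
+    True
+    >>> rank
+    2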
""" - if (_np.shape(W)[0] == 1): - Winv = 1./_np.sqrt(W[0,0]) + if _np.shape(W)[0] == 1: + Winv = 1./_np.sqrt(W[0, 0]) + sm = _np.ones(1) else: sm, Vm = spd_eig(W, epsilon=epsilon, method=method, canonical_signs=canonical_signs) Winv = _np.dot(Vm, _np.diag(1.0 / _np.sqrt(sm))).dot(Vm.T) # return split - return Winv + if return_rank: + return Winv, sm.shape[0] + else: + return Winv def spd_inv_split(W, epsilon=1e-10, method='QR', canonical_signs=False): diff --git a/pyemma/coordinates/__init__.py b/pyemma/coordinates/__init__.py index c7b5dbfba..0b3adf034 100644 --- a/pyemma/coordinates/__init__.py +++ b/pyemma/coordinates/__init__.py @@ -51,6 +51,7 @@ pca tica + vamp **Clustering Algorithms** @@ -84,6 +85,7 @@ transform.PCA transform.TICA + transform.VAMP **Covariance estimation** diff --git a/pyemma/coordinates/acf.py b/pyemma/coordinates/acf.py index 86eb8d489..746bcde3d 100644 --- a/pyemma/coordinates/acf.py +++ b/pyemma/coordinates/acf.py @@ -18,7 +18,6 @@ -from __future__ import absolute_import, print_function import numpy as np import sys diff --git a/pyemma/coordinates/api.py b/pyemma/coordinates/api.py index e8127c59b..75ad04717 100644 --- a/pyemma/coordinates/api.py +++ b/pyemma/coordinates/api.py @@ -51,6 +51,7 @@ 'save_trajs', 'pca', # transform 'tica', + 'vamp', 'covariance_lagged', 'cluster_regspace', # cluster 'cluster_kmeans', @@ -375,9 +376,9 @@ def source(inp, features=None, top=None, chunksize=None, **kw): # CASE 1: input is a string or list of strings # check: if single string create a one-element list - if isinstance(inp, str) or ( + if isinstance(inp, _string_types) or ( isinstance(inp, (list, tuple)) - and (any(isinstance(item, (list, tuple, str)) for item in inp) or len(inp) is 0)): + and (any(isinstance(item, (list, tuple, _string_types)) for item in inp) or len(inp) is 0)): reader = create_file_reader(inp, top, features, chunksize=cs, **kw) elif isinstance(inp, _np.ndarray) or (isinstance(inp, (list, tuple)) @@ -716,7 +717,7 @@ def save_traj(traj_inp, indexes, outfile, top=None, stride = 1, chunksize=None, # Do we have what we need? if not isinstance(traj_inp, (list, tuple)): raise TypeError("traj_inp has to be of type list, not %s" % type(traj_inp)) - if not isinstance(top, (str, Topology, Trajectory)): + if not isinstance(top, (_string_types, Topology, Trajectory)): raise TypeError("traj_inp cannot be a list of files without an input " "top of type str (eg filename.pdb), mdtraj.Trajectory or mdtraj.Topology. " "Got type %s instead" % type(top)) @@ -1255,10 +1256,160 @@ def tica(data=None, lag=10, dim=-1, var_cutoff=0.95, kinetic_map=True, commute_m return res -def covariance_lagged(data=None, c00=True, c0t=True, ctt=False, remove_constant_mean=None, remove_data_mean=False, - reversible=False, bessel=True, lag=0, weights="empirical", stride=1, skip=0, chunksize=None): +def vamp(data=None, lag=10, dim=None, scaling=None, right=True, ncov_max=float('inf'), + stride=1, skip=0, chunksize=None): + r""" Variational approach for Markov processes (VAMP) [1]_. + + Parameters + ---------- + lag : int + lag time + dim : float or int + Number of dimensions to keep: + + * if dim is not set all available ranks are kept: + `n_components == min(n_samples, n_features)` + * if dim is an integer >= 1, this number specifies the number + of dimensions to keep. By default this will use the kinetic + variance. 
+        * if dim is a float with ``0 < dim < 1``, select the number + of dimensions such that the amount of kinetic variance + that needs to be explained is greater than the percentage + specified by dim. + scaling : None or string + Scaling to be applied to the VAMP order parameters upon transformation + + * None: no scaling will be applied, variance of the order parameters is 1 + * 'kinetic map' or 'km': order parameters are scaled by the corresponding singular value. + Only the left singular functions induce a kinetic map. + Therefore scaling='km' is only effective if `right` is False. + right : boolean + Whether to compute the right singular functions. + If `right==True`, `get_output()` will return the right singular + functions. Otherwise, `get_output()` will return the left singular + functions. + Beware that only `frames[tau:, :]` of each trajectory returned + by `get_output()` contain valid values of the right singular + functions. Conversely, only `frames[0:-tau, :]` of each + trajectory returned by `get_output()` contain valid values of + the left singular functions. The remaining frames might + possibly be interpreted as some extrapolation. + epsilon : float + singular value cutoff. Singular values of :math:`C_{00}` and :math:`C_{11}` with + norms <= epsilon will be cut off. The remaining number of + singular values defines the size of the output. + stride: int, optional, default = 1 + Use only every stride-th time step. By default, every time step is used. + skip : int, default=0 + skip the first initial n frames per trajectory. + ncov_max : int, default=infinity + limit the memory usage of the algorithm from [3]_ to an amount that corresponds + to ncov_max additional copies of each correlation matrix + + Notes + ----- + VAMP is a method for dimensionality reduction of Markov processes. + + The Koopman operator :math:`\mathcal{K}` is an integral operator + that describes conditional future expectation values. Let + :math:`p(\mathbf{x},\,\mathbf{y})` be the conditional probability + density of visiting an infinitesimal phase space volume around + point :math:`\mathbf{y}` at time :math:`t+\tau` given that the phase + space point :math:`\mathbf{x}` was visited at the earlier time + :math:`t`. Then the action of the Koopman operator on a function + :math:`f` can be written as follows: + + .. math:: + + \mathcal{K}f=\int p(\mathbf{x},\,\mathbf{y})f(\mathbf{y})\,\mathrm{dy}=\mathbb{E}\left[f(\mathbf{x}_{t+\tau})\mid\mathbf{x}_{t}=\mathbf{x}\right] + + The Koopman operator is defined without any reference to an + equilibrium distribution. Therefore it is well-defined in + situations where the dynamics is irreversible and/or non-stationary + such that no equilibrium distribution exists. + + If we approximate :math:`f` by a linear superposition of ansatz + functions :math:`\boldsymbol{\chi}` of the conformational + degrees of freedom (features), the operator :math:`\mathcal{K}` + can be approximated by a (finite-dimensional) matrix :math:`\mathbf{K}`. + + The approximation is computed as follows: From the time-dependent + input features :math:`\boldsymbol{\chi}(t)`, we compute the mean + :math:`\boldsymbol{\mu}_{0}` (:math:`\boldsymbol{\mu}_{1}`) from + all data excluding the last (first) :math:`\tau` steps of every + trajectory as follows: + + .. math:: + + \boldsymbol{\mu}_{0} :=\frac{1}{T-\tau}\sum_{t=0}^{T-\tau}\boldsymbol{\chi}(t) + + \boldsymbol{\mu}_{1} :=\frac{1}{T-\tau}\sum_{t=\tau}^{T}\boldsymbol{\chi}(t) + + Next, we compute the instantaneous covariance matrices + :math:`\mathbf{C}_{00}` and :math:`\mathbf{C}_{11}` and the + time-lagged covariance matrix :math:`\mathbf{C}_{01}` as follows: + + .. math:: + + \mathbf{C}_{00} :=\frac{1}{T-\tau}\sum_{t=0}^{T-\tau}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right]\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right]^{\top} + + \mathbf{C}_{11} :=\frac{1}{T-\tau}\sum_{t=\tau}^{T}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{1}\right]\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{1}\right]^{\top} + + \mathbf{C}_{01} :=\frac{1}{T-\tau}\sum_{t=0}^{T-\tau}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right]\left[\boldsymbol{\chi}(t+\tau)-\boldsymbol{\mu}_{1}\right]^{\top} + + The Koopman matrix is then computed as follows: + + .. math:: + + \mathbf{K}=\mathbf{C}_{00}^{-1}\mathbf{C}_{01} + + It can be shown [1]_ that the leading singular functions of the + half-weighted Koopman matrix + + .. math:: + + \bar{\mathbf{K}}:=\mathbf{C}_{00}^{-\frac{1}{2}}\mathbf{C}_{01}\mathbf{C}_{11}^{-\frac{1}{2}} + + encode the best reduced dynamical model for the time series. + + The singular functions can be computed by first performing the + singular value decomposition + + .. math:: + + \bar{\mathbf{K}}=\mathbf{U}^{\prime}\mathbf{S}\mathbf{V}^{\prime\top} + + and then mapping the input conformation to the left singular + functions :math:`\boldsymbol{\psi}` and right singular + functions :math:`\boldsymbol{\phi}` as follows: + + .. math:: + + \boldsymbol{\psi}(t):=\mathbf{U}^{\prime\top}\mathbf{C}_{00}^{-\frac{1}{2}}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right] + + \boldsymbol{\phi}(t):=\mathbf{V}^{\prime\top}\mathbf{C}_{11}^{-\frac{1}{2}}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{1}\right] + + + References + ---------- + .. [1] Wu, H. and Noe, F. 2017. Variational approach for learning Markov processes from time series data. + arXiv:1707.04659v1 + .. [2] Noe, F. and Clementi, C. 2015. Kinetic distance and kinetic maps from molecular dynamics simulation. + J. Chem. Theory. Comput. doi:10.1021/acs.jctc.5b00553 + .. [3] Chan, T. F., Golub G. H., LeVeque R. J. 1979. Updating formulae and pairwise algorithms for + computing sample variances. Technical Report STAN-CS-79-773, Department of Computer Science, Stanford University. """ - Compute lagged covariances between time series. If data is available as an array of size (TxN), where T is the + from pyemma.coordinates.transform.vamp import VAMP + res = VAMP(lag, dim=dim, scaling=scaling, right=right, epsilon=epsilon, skip=skip, ncov_max=ncov_max) + if data is not None: + res.estimate(data, stride=stride, chunksize=chunksize) + return res + + +def covariance_lagged(data=None, c00=True, c0t=True, ctt=False, remove_constant_mean=None, remove_data_mean=False, + reversible=False, bessel=True, lag=0, weights="empirical", stride=1, skip=0, chunksize=None, + ncov_max=float('inf')): + r"""Compute lagged covariances between time series. If data is available as an array of size (TxN), where T is the number of time steps and N the number of dimensions, this function can compute lagged covariances like .. math:: @@ -1306,6 +1457,9 @@ def covariance_lagged(data=None, c00=True, c0t=True, ctt=False, remove_constant_ to optimize thread usage and gain processing speed. If None is passed, use the default value of the underlying reader/data source. Choose zero to disable chunking at all. + ncov_max : int, default=infinity + limit the memory usage of the algorithm from [2]_ to an amount that corresponds + to ncov_max additional copies of each correlation matrix Returns ------- @@ -1314,17 +1468,17 @@ def covariance_lagged(data=None, c00=True, c0t=True, ctt=False, remove_constant_ .. [1] Wu, H., Nueske, F., Paul, F., Klus, S., Koltai, P., and Noe, F. 2016. Bias reduced variational approximation of molecular kinetics from short off-equilibrium simulations. J. Chem. Phys. (submitted) - + .. [2] Chan, T. F., Golub G. H., LeVeque R. J. 1979. Updating formulae and pairwise algorithms for + computing sample variances. Technical Report STAN-CS-79-773, Department of Computer Science, Stanford University. """ - from pyemma.coordinates.estimation.covariance import LaggedCovariance from pyemma.coordinates.estimation.koopman import _KoopmanEstimator import types - if isinstance(weights, str): + if isinstance(weights, _string_types): if weights == "koopman": if data is None: raise ValueError("Data must be supplied for reweighting='koopman'") - koop = _KoopmanEstimator(lag=lag, stride=stride, skip=skip) + koop = _KoopmanEstimator(lag=lag, stride=stride, skip=skip, ncov_max=ncov_max) koop.estimate(data, chunksize=chunksize) weights = koop.weights elif weights == "empirical": @@ -1342,7 +1496,7 @@ def covariance_lagged(data=None, c00=True, c0t=True, ctt=False, remove_constant_ # chunksize is an estimation parameter for now. lc = LaggedCovariance(c00=c00, c0t=c0t, ctt=ctt, remove_constant_mean=remove_constant_mean, remove_data_mean=remove_data_mean, reversible=reversible, bessel=bessel, lag=lag, - weights=weights, stride=stride, skip=skip) + weights=weights, stride=stride, skip=skip, ncov_max=ncov_max) if data is not None: lc.estimate(data, chunksize=chunksize) return lc diff --git a/pyemma/coordinates/data/sources_merger.py b/pyemma/coordinates/data/sources_merger.py index 1e663352f..0b1e7351d 100644 --- a/pyemma/coordinates/data/sources_merger.py +++ b/pyemma/coordinates/data/sources_merger.py @@ -18,10 +18,10 @@ class SourcesMerger(DataSource, SerializableMixIn): sources : list, tuple list of DataSources (Readers, StreamingTransformers etc.) to combine for streaming access. - chunk: int + chunk: int or None chunk size to use for underlying iterators.
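+        If None (the new default), the chunk size presumably falls back to PyEMMA's
+        global 'default_chunksize' configuration value mentioned in the changelog
+        above, instead of the previously hard-coded 5000; pass an integer to
+        override it for this merger only.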
""" - def __init__(self, sources, chunk=5000): + def __init__(self, sources, chunk=None): super(SourcesMerger, self).__init__(chunksize=chunk) self.sources = sources self._is_reader = True diff --git a/pyemma/coordinates/data/util/reader_utils.py b/pyemma/coordinates/data/util/reader_utils.py index db1024639..907d1a6f9 100644 --- a/pyemma/coordinates/data/util/reader_utils.py +++ b/pyemma/coordinates/data/util/reader_utils.py @@ -23,6 +23,8 @@ import numpy as np import os +from six import string_types + def create_file_reader(input_files, topology, featurizer, chunksize=None, **kw): r""" @@ -43,8 +45,6 @@ def create_file_reader(input_files, topology, featurizer, chunksize=None, **kw): from pyemma.coordinates.data.py_csv_reader import PyCSVReader from pyemma.coordinates.data import FeatureReader from pyemma.coordinates.data.fragmented_trajectory_reader import FragmentedTrajectoryReader - import six - str = six.string_types # fragmented trajectories if (isinstance(input_files, (list, tuple)) and len(input_files) > 0 and @@ -52,15 +52,15 @@ def create_file_reader(input_files, topology, featurizer, chunksize=None, **kw): return FragmentedTrajectoryReader(input_files, topology, chunksize, featurizer) # normal trajectories - if (isinstance(input_files, str) + if (isinstance(input_files, string_types) or (isinstance(input_files, (list, tuple)) - and (any(isinstance(item, str) for item in input_files) + and (any(isinstance(item, string_types) for item in input_files) or len(input_files) is 0))): reader = None # check: if single string create a one-element list - if isinstance(input_files, str): + if isinstance(input_files, string_types): input_list = [input_files] - elif len(input_files) > 0 and all(isinstance(item, str) for item in input_files): + elif len(input_files) > 0 and all(isinstance(item, string_types) for item in input_files): input_list = input_files else: if len(input_files) is 0: @@ -177,7 +177,7 @@ def preallocate_empty_trajectory(top, n_frames=1): def enforce_top(top): - if isinstance(top, str): + if isinstance(top, string_types): top = md.load(top).top elif isinstance(top, md.Trajectory): top = top.top diff --git a/pyemma/coordinates/estimation/covariance.py b/pyemma/coordinates/estimation/covariance.py index 2405f3200..5d84726a9 100644 --- a/pyemma/coordinates/estimation/covariance.py +++ b/pyemma/coordinates/estimation/covariance.py @@ -77,14 +77,19 @@ class LaggedCovariance(StreamingEstimator): Use only every stride-th time step. By default, every time step is used. skip : int, optional, default=0 skip the first initial n frames per trajectory. - chunksize : deprecated, default=NoTImplemented - The chunk size can be se during estimation. + chunksize : deprecated, default=NotImplemented + The chunk size should now be set during estimation. """ def __init__(self, c00=True, c0t=False, ctt=False, remove_constant_mean=None, remove_data_mean=False, reversible=False, bessel=True, sparse_mode='auto', modify_data=False, lag=0, weights=None, stride=1, skip=0, chunksize=NotImplemented, ncov_max=float('inf')): super(LaggedCovariance, self).__init__() + if chunksize is not NotImplemented: + import warnings + from pyemma.util.exceptions import PyEMMA_DeprecationWarning + warnings.warn('passed deprecated argument chunksize to LaggedCovariance. 
Will be ignored!', + category=PyEMMA_DeprecationWarning) if (c0t or ctt) and lag == 0: raise ValueError("lag must be positive if c0t=True or ctt=True") diff --git a/pyemma/coordinates/tests/test_vamp.py b/pyemma/coordinates/tests/test_vamp.py new file mode 100644 index 000000000..2d04a8eb1 --- /dev/null +++ b/pyemma/coordinates/tests/test_vamp.py @@ -0,0 +1,283 @@ +# This file is part of PyEMMA. +# +# Copyright (c) 2017 Computational Molecular Biology Group, Freie Universitaet Berlin (GER) +# +# PyEMMA is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see . + + +""" +@author: paul +""" + +from __future__ import absolute_import +import unittest +import numpy as np +from pyemma.coordinates import vamp as pyemma_api_vamp +from pyemma.msm import estimate_markov_model +from logging import getLogger + +from pyemma.msm.estimators._dtraj_stats import cvsplit_dtrajs + +logger = getLogger('pyemma.'+'TestVAMP') + + +def random_matrix(n, rank=None, eps=0.01): + m = np.random.randn(n, n) + u, s, v = np.linalg.svd(m) + if rank is None: + rank = n + if rank > n: + rank = n + s = np.concatenate((np.maximum(s, eps)[0:rank], np.zeros(n-rank))) + return u.dot(np.diag(s)).dot(v) + + +class TestVAMPEstimatorSelfConsistency(unittest.TestCase): + def test_full_rank(self): + self.do_test(20, 20, test_partial_fit=True) + + def test_low_rank(self): + dim = 30 + rank = 15 + self.do_test(dim, rank, test_partial_fit=True) + + def do_test(self, dim, rank, test_partial_fit=False): + # setup + N_frames = [123, 456, 789] + N_trajs = len(N_frames) + A = random_matrix(dim, rank) + trajs = [] + mean = np.random.randn(dim) + for i in range(N_trajs): + # set up data + white = np.random.randn(N_frames[i], dim) + brown = np.cumsum(white, axis=0) + correlated = np.dot(brown, A) + trajs.append(correlated + mean) + + # test + tau = 50 + vamp = pyemma_api_vamp(trajs, lag=tau, scaling=None) + vamp.right = True + + assert vamp.dimension() <= rank + + atol = np.finfo(vamp.output_type()).eps*10.0 + phi_trajs = [ sf[tau:, :] for sf in vamp.get_output() ] + phi = np.concatenate(phi_trajs) + mean_right = phi.sum(axis=0) / phi.shape[0] + cov_right = phi.T.dot(phi) / phi.shape[0] + np.testing.assert_allclose(mean_right, 0.0, atol=atol) + np.testing.assert_allclose(cov_right, np.eye(vamp.dimension()), atol=atol) + + vamp.right = False + psi_trajs = [ sf[0:-tau, :] for sf in vamp.get_output() ] + psi = np.concatenate(psi_trajs) + mean_left = psi.sum(axis=0) / psi.shape[0] + cov_left = psi.T.dot(psi) / psi.shape[0] + np.testing.assert_allclose(mean_left, 0.0, atol=atol) + np.testing.assert_allclose(cov_left, np.eye(vamp.dimension()), atol=atol) + + # compute correlation between left and right + assert phi.shape[0]==psi.shape[0] + C01_psi_phi = psi.T.dot(phi) / phi.shape[0] + n = max(C01_psi_phi.shape) + C01_psi_phi = C01_psi_phi[0:n,:][:, 0:n] + np.testing.assert_allclose(C01_psi_phi, np.diag(vamp.singular_values[0:vamp.dimension()]), atol=atol) + + if test_partial_fit: + vamp2 = 
pyemma_api_vamp(lag=tau, scaling=None) + for t in trajs: + vamp2.partial_fit(t) + + model_params = vamp._model.get_model_params() + model_params2 = vamp2._model.get_model_params() + + atol = 1e-15 + rtol = 1e-6 + + for n in model_params.keys(): + if model_params[n] is not None and model_params2[n] is not None: + if n not in ('U', 'V'): + np.testing.assert_allclose(model_params[n], model_params2[n], rtol=rtol, atol=atol, + err_msg='failed for model param %s' % n) + else: + assert_allclose_ignore_phase(model_params[n], model_params2[n], atol=atol) + + vamp2.singular_values # trigger diagonalization + + vamp2.right = True + for t, ref in zip(trajs, phi_trajs): + assert_allclose_ignore_phase(vamp2.transform(t[tau:]), ref, rtol=rtol, atol=atol) + + vamp2.right = False + for t, ref in zip(trajs, psi_trajs): + assert_allclose_ignore_phase(vamp2.transform(t[0:-tau]), ref, rtol=rtol, atol=atol) + + +def generate(T, N_steps, s0=0): + dtraj = np.zeros(N_steps, dtype=int) + s = s0 + T_cdf = T.cumsum(axis=1) + for t in range(N_steps): + dtraj[t] = s + s = np.searchsorted(T_cdf[s, :], np.random.rand()) + return dtraj + + +def assert_allclose_ignore_phase(A, B, atol, rtol=1e-5): + A = np.atleast_2d(A) + B = np.atleast_2d(B) + assert A.shape == B.shape + for i in range(B.shape[1]): + assert (np.allclose(A[:, i], B[:, i], atol=atol, rtol=rtol) + or np.allclose(A[:, i], -B[:, i], atol=atol, rtol=rtol)) + + +class TestVAMPModel(unittest.TestCase): + @classmethod + def setUpClass(cls): + N_steps = 10000 + N_traj = 20 + lag = 1 + T = np.linalg.matrix_power(np.array([[0.7, 0.2, 0.1], [0.1, 0.8, 0.1], [0.1, 0.1, 0.8]]), lag) + dtrajs = [generate(T, N_steps) for _ in range(N_traj)] + p0 = np.zeros(3) + p1 = np.zeros(3) + trajs = [] + for dtraj in dtrajs: + traj = np.zeros((N_steps, T.shape[0])) + traj[np.arange(len(dtraj)), dtraj] = 1.0 + trajs.append(traj) + p0 += traj[:-lag, :].sum(axis=0) + p1 += traj[lag:, :].sum(axis=0) + vamp = pyemma_api_vamp(trajs, lag=lag, scaling=None, dim=1.0) + msm = estimate_markov_model(dtrajs, lag=lag, reversible=False) + cls.trajs = trajs + cls.dtrajs = dtrajs + cls.lag = lag + cls.msm = msm + cls.vamp = vamp + cls.p0 = p0 / p0.sum() + cls.p1 = p1 / p1.sum() + cls.atol = np.finfo(vamp.output_type()).eps*1000.0 + + def test_K_is_T(self): + m0 = self.vamp.model.mean_0 + mt = self.vamp.model.mean_t + C0 = self.vamp.model.C00 + m0[:, np.newaxis]*m0[np.newaxis, :] + C1 = self.vamp.model.C0t + m0[:, np.newaxis]*mt[np.newaxis, :] + K = np.linalg.inv(C0).dot(C1) + np.testing.assert_allclose(K, self.msm.P, atol=1E-5) + + Tsym = np.diag(self.p0 ** 0.5).dot(self.msm.P).dot(np.diag(self.p1 ** -0.5)) + np.testing.assert_allclose(np.linalg.svd(Tsym)[1][1:], self.vamp.singular_values[0:2], atol=1E-7) + + def test_singular_functions_against_MSM(self): + Tsym = np.diag(self.p0 ** 0.5).dot(self.msm.P).dot(np.diag(self.p1 ** -0.5)) + Up, S, Vhp = np.linalg.svd(Tsym) + Vp = Vhp.T + U = Up * (self.p0 ** -0.5)[:, np.newaxis] + V = Vp * (self.p1 ** -0.5)[:, np.newaxis] + assert_allclose_ignore_phase(U[:, 0], np.ones(3), atol=1E-5) + assert_allclose_ignore_phase(V[:, 0], np.ones(3), atol=1E-5) + U = U[:, 1:] + V = V[:, 1:] + self.vamp.right = True + phi = self.vamp.transform(np.eye(3)) + self.vamp.right = False + psi = self.vamp.transform(np.eye(3)) + assert_allclose_ignore_phase(U, psi, atol=1E-5) + assert_allclose_ignore_phase(V, phi, atol=1E-5) + references_sf = [U.T.dot(np.diag(self.p0)).dot(np.linalg.matrix_power(self.msm.P, k*self.lag)).dot(V).T for k in + range(10-1)] + cktest = 
self.vamp.cktest(n_observables=2, mlags=10) + pred_sf = cktest.predictions + esti_sf = cktest.estimates + for e, p, r in zip(esti_sf[1:], pred_sf[1:], references_sf[1:]): + np.testing.assert_allclose(np.diag(p), np.diag(r), atol=1E-6) + np.testing.assert_allclose(np.abs(p), np.abs(r), atol=1E-6) + + def test_CK_expectation_against_MSM(self): + obs = np.eye(3) # observe every state + cktest = self.vamp.cktest(observables=obs, statistics=None, mlags=4) + pred = cktest.predictions[1:] + est = cktest.estimates[1:] + + for i, (est_, pred_) in enumerate(zip(est, pred)): + msm = estimate_markov_model(dtrajs=self.dtrajs, lag=self.lag*(i+1), reversible=False) + msm_esti = self.p0.T.dot(msm.P).dot(obs) + msm_pred = self.p0.T.dot(np.linalg.matrix_power(self.msm.P, (i+1))).dot(obs) + np.testing.assert_allclose(pred_, msm_pred, atol=self.atol) + np.testing.assert_allclose(est_, msm_esti, atol=self.atol) + np.testing.assert_allclose(est_, pred_, atol=0.006) + + def test_CK_covariances_of_singular_functions(self): + cktest = self.vamp.cktest(n_observables=2, mlags=4) # auto + pred = cktest.predictions[1:] + est = cktest.estimates[1:] + error = np.max(np.abs(np.array(pred) - np.array(est))) / max(np.max(pred), np.max(est)) + assert error < 0.05 + + def test_CK_covariances_against_MSM(self): + obs = np.eye(3) # observe every state + sta = np.eye(3) # restrict p0 to every state + cktest = self.vamp.cktest(observables=obs, statistics=sta, mlags=4, show_progress=True) + pred = cktest.predictions[1:] + est = cktest.estimates[1:] + + for i, (est_, pred_) in enumerate(zip(est, pred)): + msm = estimate_markov_model(dtrajs=self.dtrajs, lag=self.lag*(i+1), reversible=False) + msm_esti = (self.p0 * sta).T.dot(msm.P).dot(obs).T + msm_pred = (self.p0 * sta).T.dot(np.linalg.matrix_power(self.msm.P, (i+1))).dot(obs).T + np.testing.assert_allclose(np.diag(pred_), np.diag(msm_pred), atol=self.atol) + np.testing.assert_allclose(np.diag(est_), np.diag(msm_esti), atol=self.atol) + np.testing.assert_allclose(np.diag(est_), np.diag(pred_), atol=0.006) + + def test_self_score_with_MSM(self): + T = self.msm.P + Tadj = np.diag(1./self.p1).dot(T.T).dot(np.diag(self.p0)) + NFro = np.trace(T.dot(Tadj)) + s2 = self.vamp.score(score_method='VAMP2') + np.testing.assert_allclose(s2, NFro) + + Tsym = np.diag(self.p0**0.5).dot(T).dot(np.diag(self.p1**-0.5)) + Nnuc = np.linalg.norm(Tsym, ord='nuc') + s1 = self.vamp.score(score_method='VAMP1') + np.testing.assert_allclose(s1, Nnuc) + + # TODO: check why this is not equal + sE = self.vamp.score(score_method='VAMPE') + np.testing.assert_allclose(sE, NFro) # see paper appendix H.2 + + def test_score_vs_MSM(self): + from pyemma.util.contexts import numpy_random_seed + with numpy_random_seed(32): + trajs_test, trajs_train = cvsplit_dtrajs(self.trajs) + with numpy_random_seed(32): + dtrajs_test, dtrajs_train = cvsplit_dtrajs(self.dtrajs) + + methods = ('VAMP1', 'VAMP2', 'VAMPE') + + for m in methods: + msm_train = estimate_markov_model(dtrajs=dtrajs_train, lag=self.lag, reversible=False) + score_msm = msm_train.score(dtrajs_test, score_method=m, score_k=None) + + vamp_train = pyemma_api_vamp(data=trajs_train, lag=self.lag, dim=1.0) + score_vamp = vamp_train.score(test_data=trajs_test, score_method=m) + + self.assertAlmostEqual(score_msm, score_vamp, places=2 if m == 'VAMPE' else 3, msg=m) + +if __name__ == "__main__": + unittest.main() diff --git a/pyemma/coordinates/transform/__init__.py b/pyemma/coordinates/transform/__init__.py index de8366d13..b7f976ceb 100644 --- 
a/pyemma/coordinates/transform/__init__.py +++ b/pyemma/coordinates/transform/__init__.py @@ -28,7 +28,11 @@ PCA - principal components TICA - time independent components + VAMP - Variational approach for Markov processes + VAMPModel - Kinetic model from the Variational approach for Markov processes + VAMPChapmanKolmogorovValidator - Chapman-Kolmogorov test for the Variational approach for Markov processes """ from .pca import * from .tica import * +from .vamp import * diff --git a/pyemma/coordinates/transform/vamp.py b/pyemma/coordinates/transform/vamp.py new file mode 100644 index 000000000..981e1080b --- /dev/null +++ b/pyemma/coordinates/transform/vamp.py @@ -0,0 +1,1018 @@ +# This file is part of PyEMMA. +# +# Copyright (c) 2017 Computational Molecular Biology Group, Freie Universitaet Berlin (GER) +# +# PyEMMA is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see . +''' +@author: paul, marscher, wu, noe +''' + +from __future__ import absolute_import + +import numpy as np + +from pyemma._base.model import Model +from pyemma._base.serialization.serialization import SerializableMixIn +from pyemma.util.annotators import fix_docs +from pyemma.util.types import ensure_ndarray_or_None, ensure_ndarray +from pyemma._ext.variational.solvers.direct import spd_inv_sqrt +from pyemma.coordinates.estimation.covariance import LaggedCovariance +from pyemma.coordinates.data._base.transformer import StreamingEstimationTransformer +from pyemma.msm.estimators.lagged_model_validators import LaggedModelValidator +from pyemma.util.linalg import mdot + +import warnings + +__all__ = ['VAMP', 'VAMPModel', 'VAMPChapmanKolmogorovValidator'] + + +class VAMPModel(Model, SerializableMixIn): + __serialize_version = 0 + __serialize_fields = ('_U', '_V', '_svd_performed') + + def set_model_params(self, mean_0, mean_t, C00, Ctt, C0t, U, V, singular_values, cumvar, dim, epsilon): + self.mean_0 = mean_0 + self.mean_t = mean_t + self.C00 = C00 + self.Ctt = Ctt + self.C0t = C0t + self._svd_performed = False + self._U = U + self._V = V + self._singular_values = singular_values + self.cumvar = cumvar + self.dim = dim + self.epsilon = epsilon + + @property + def U(self): + "Transformation matrix that represents the linear map from mean-free feature space to the space of left singular functions." + if not self._svd_performed: + self._diagonalize() + return self._U + + @property + def V(self): + "Transformation matrix that represents the linear map from mean-free feature space to the space of right singular functions."
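+        # note: U, V and the singular values are computed lazily; the SVD of the
+        # half-weighted Koopman matrix is deferred until first access (cf. _diagonalize()).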
+        if not self._svd_performed: + self._diagonalize() + return self._V + + @property + def singular_values(self): + "The singular values of the half-weighted Koopman matrix" + if not self._svd_performed: + self._diagonalize() + return self._singular_values + + @property + def C00(self): + return self._C00 + + @C00.setter + def C00(self, val): + self._svd_performed = False + self._C00 = val + + @property + def C0t(self): + return self._C0t + + @C0t.setter + def C0t(self, val): + self._svd_performed = False + self._C0t = val + + @property + def Ctt(self): + return self._Ctt + + @Ctt.setter + def Ctt(self, val): + self._svd_performed = False + self._Ctt = val + + def dimension(self): + """ output dimension """ + if self.dim is None or (isinstance(self.dim, float) and self.dim == 1.0): + if hasattr(self, '_rank0'): + return min(self._rank0, self._rankt) + else: + raise RuntimeError('Requested dimension, but the dimension depends on the singular values of C00 and C11' + ' and the transformer has not yet been estimated. Call estimate() before.') + if isinstance(self.dim, float): + if hasattr(self, 'cumvar') and self.cumvar is not None: + return np.count_nonzero(self.cumvar >= self.dim) + else: + raise RuntimeError('Requested dimension, but the dimension depends on the cumulative variance and the ' + 'transformer has not yet been estimated. Call estimate() before.') + else: + if hasattr(self, '_rank0'): + return np.min([self._rank0, self._rankt, self.dim]) + else: + warnings.warn( + RuntimeWarning('Requested dimension, but the dimension depends on the singular values of C00 and C11' + ' and the transformer has not yet been estimated. Result is only an approximation.')) + return self.dim + + def expectation(self, observables, statistics, lag_multiple=1, observables_mean_free=False, statistics_mean_free=False): + r"""Compute future expectation of observable or covariance using the approximated Koopman operator. + + Parameters + ---------- + observables : np.ndarray((input_dimension, n_observables)) + Coefficients that express one or multiple observables in + the basis of the input features. + + statistics : np.ndarray((input_dimension, n_statistics)), optional + Coefficients that express one or multiple statistics in + the basis of the input features. + This parameter can be None. In that case, this method + returns the future expectation value of the observable(s). + + lag_multiple : int + If > 1, extrapolate to a multiple of the estimator's lag + time by assuming Markovianity of the approximated Koopman + operator. + + observables_mean_free : bool, default=False + If true, coefficients in `observables` refer to the input + features with feature means removed. + If false, coefficients in `observables` refer to the + unmodified input features. + + statistics_mean_free : bool, default=False + If true, coefficients in `statistics` refer to the input + features with feature means removed. + If false, coefficients in `statistics` refer to the + unmodified input features. + + Notes + ----- + A "future expectation" of an observable g is the average of g computed + over a time window that has the same total length as the input data + from which the Koopman operator was estimated but is shifted + by lag_multiple*tau time steps into the future (where tau is the lag + time). + + It is computed with the equation: + + .. math:: + + \mathbb{E}[g]_{\rho_{n}}=\mathbf{q}^{T}\mathbf{P}^{n-1}\mathbf{e}_{1} + + where + + .. math:: + + P_{ij}=\sigma_{i}\langle\psi_{i},\phi_{j}\rangle_{\rho_{1}} + + and + + .. math:: + + q_{i}=\langle g,\phi_{i}\rangle_{\rho_{1}} + + and :math:`\mathbf{e}_{1}` is the first canonical unit vector. + + + A model prediction of time-lagged covariances between the + observable f and the statistic g at a lag-time of lag_multiple*tau + is computed with the equation: + + .. math:: + + \mathrm{cov}[g,\,f;n\tau]=\mathbf{q}^{T}\mathbf{P}^{n-1}\boldsymbol{\Sigma}\mathbf{r} + + where :math:`r_{i}=\langle\psi_{i},f\rangle_{\rho_{0}}` and + :math:`\boldsymbol{\Sigma}=\mathrm{diag(\boldsymbol{\sigma})}` . + """ + # TODO: implement the case lag_multiple=0 + + dim = self.dimension() + + S = np.diag(np.concatenate(([1.0], self.singular_values[0:dim]))) + V = self.V[:, 0:dim] + U = self.U[:, 0:dim] + m_0 = self.mean_0 + m_t = self.mean_t + + assert lag_multiple >= 1, 'lag_multiple = 0 not implemented' + + if lag_multiple == 1: + P = S + else: + p = np.zeros((dim + 1, dim + 1)) + p[0, 0] = 1.0 + p[1:, 0] = U.T.dot(m_t - m_0) + p[1:, 1:] = U.T.dot(self.Ctt).dot(V) + P = np.linalg.matrix_power(S.dot(p), lag_multiple - 1).dot(S) + + Q = np.zeros((observables.shape[1], dim + 1)) + if not observables_mean_free: + Q[:, 0] = observables.T.dot(m_t) + Q[:, 1:] = observables.T.dot(self.Ctt).dot(V) + + if statistics is not None: + # compute covariance + R = np.zeros((statistics.shape[1], dim + 1)) + if not statistics_mean_free: + R[:, 0] = statistics.T.dot(m_0) + R[:, 1:] = statistics.T.dot(self.C00).dot(U) + + if statistics is not None: + # compute lagged covariance + return Q.dot(P).dot(R.T) + # TODO: discuss whether we want to return this or the transpose + # TODO: from MSMs one might expect to first index to refer to the statistics, here it is the other way round + else: + # compute future expectation + return Q.dot(P)[:, 0] + + def _diagonalize(self, scaling=None): + """Performs SVD on covariance matrices and saves the left and right singular vectors and values in the model. + + Parameters + ---------- + scaling : None or string, default=None + Scaling to be applied to the VAMP modes upon transformation + * None: no scaling will be applied, variance of the singular + functions is 1 + * 'kinetic map' or 'km': singular functions are scaled by + the corresponding singular value. Note that only the left singular functions + induce a kinetic map. + """ + + L0, self._rank0 = spd_inv_sqrt(self.C00, epsilon=self.epsilon, return_rank=True) + Lt, self._rankt = spd_inv_sqrt(self.Ctt, epsilon=self.epsilon, return_rank=True) + A = L0.T.dot(self.C0t).dot(Lt) + + Uprime, s, Vprimeh = np.linalg.svd(A, compute_uv=True) + self._singular_values = s + + # compute cumulative variance + cumvar = np.cumsum(s ** 2) + cumvar /= cumvar[-1] + self.cumvar = cumvar + + self._L0 = L0 + self._Lt = Lt + + m = self.dimension() + + U = L0.dot(Uprime[:, :m]) # U in the paper singular_vectors_left + V = Lt.dot(Vprimeh[:m, :].T) # V in the paper singular_vectors_right + + # scale vectors + if scaling is None: + pass + elif scaling in ['km', 'kinetic map']: + U *= s[np.newaxis, 0:m] + else: + raise ValueError('unexpected value (%s) of "scaling"' % scaling) + + self._U = U + self._V = V + self._svd_performed = True + + def score(self, test_model=None, score_method='VAMP2'): + """Compute the VAMP score for this model or the cross-validation score between self and a second model. + + Parameters + ---------- + test_model : VAMPModel, optional, default=None + + If `test_model` is not None, this method computes the cross-validation score + between self and `test_model`. It is assumed that self was estimated from + the "training" data and `test_model` was estimated from the "test" data. The + score is computed for one realization of self and `test_model`. Estimation + of the average cross-validation score and partitioning of data into test and + training part is not performed by this method. + + If `test_model` is None, this method computes the VAMP score for the model + contained in self. + + score_method : str, optional, default='VAMP2' + Available scores are based on the variational approach for Markov processes [1]_: + + * 'VAMP1' Sum of singular values of the half-weighted Koopman matrix [1]_ . + If the model is reversible, this is equal to the sum of + Koopman matrix eigenvalues, also called Rayleigh quotient [1]_. + * 'VAMP2' Sum of squared singular values of the half-weighted Koopman matrix [1]_ . + If the model is reversible, this is equal to the kinetic variance [2]_ . + * 'VAMPE' Approximation error of the estimated Koopman operator with respect to + the true Koopman operator up to an additive constant [1]_ . + + Returns + ------- + score : float + If `test_model` is not None, returns the cross-validation VAMP score between + self and `test_model`. Otherwise return the selected VAMP-score of self. + + References + ---------- + .. [1] Wu, H. and Noe, F. 2017. Variational approach for learning Markov processes from time series data. + arXiv:1707.04659v1 + .. [2] Noe, F. and Clementi, C. 2015. Kinetic distance and kinetic maps from molecular dynamics simulation. + J. Chem. Theory. Comput. doi:10.1021/acs.jctc.5b00553 + """ + # TODO: implement for TICA too + if test_model is None: + test_model = self + Uk = self.U[:, 0:self.dimension()] + Vk = self.V[:, 0:self.dimension()] + res = None + if score_method == 'VAMP1' or score_method == 'VAMP2': + A = spd_inv_sqrt(Uk.T.dot(test_model.C00).dot(Uk)) + B = Uk.T.dot(test_model.C0t).dot(Vk) + C = spd_inv_sqrt(Vk.T.dot(test_model.Ctt).dot(Vk)) + ABC = mdot(A, B, C) + if score_method == 'VAMP1': + res = np.linalg.norm(ABC, ord='nuc') + elif score_method == 'VAMP2': + res = np.linalg.norm(ABC, ord='fro')**2 + elif score_method == 'VAMPE': + Sk = np.diag(self.singular_values[0:self.dimension()]) + res = np.trace(2.0 * mdot(Vk, Sk, Uk.T, test_model.C0t) - mdot(Vk, Sk, Uk.T, test_model.C00, Uk, Sk, Vk.T, test_model.Ctt)) + else: + raise ValueError('"score" should be one of VAMP1, VAMP2 or VAMPE') + # add the contribution (+1) of the constant singular functions to the result + assert res is not None + return res + 1 + + +@fix_docs +class VAMP(StreamingEstimationTransformer, SerializableMixIn): + r"""Variational approach for Markov processes (VAMP)""" + + __serialize_version = 0 + + def describe(self): + return "[VAMP, lag = %i; max. output dim. = %s]" % (self._lag, str(self.dim)) + + def __init__(self, lag, dim=None, scaling=None, right=True, epsilon=1e-6, + stride=1, skip=0, ncov_max=float('inf')): + r""" Variational approach for Markov processes (VAMP) [1]_. + + Parameters + ---------- + lag : int + lag time + dim : float or int + Number of dimensions to keep: + + * if dim is not set all available ranks are kept: + `n_components == min(n_samples, n_features)` + * if dim is an integer >= 1, this number specifies the number + of dimensions to keep. + * if dim is a float with ``0 < dim < 1``, select the number + of dimensions such that the amount of kinetic variance + that needs to be explained is greater than the percentage + specified by dim.
+        scaling : None or string + Scaling to be applied to the VAMP order parameters upon transformation + + * None: no scaling will be applied, variance of the order parameters is 1 + * 'kinetic map' or 'km': order parameters are scaled by the corresponding singular value. + Only the left singular functions induce a kinetic map. + Therefore scaling='km' is only effective if `right` is False. + right : boolean + Whether to compute the right singular functions. + If `right==True`, `get_output()` will return the right singular + functions. Otherwise, `get_output()` will return the left singular + functions. + Beware that only `frames[tau:, :]` of each trajectory returned + by `get_output()` contain valid values of the right singular + functions. Conversely, only `frames[0:-tau, :]` of each + trajectory returned by `get_output()` contain valid values of + the left singular functions. The remaining frames might + possibly be interpreted as some extrapolation. + epsilon : float + singular value cutoff. Singular values of :math:`C_{00}` and :math:`C_{11}` with + norms <= epsilon will be cut off. The remaining number of + singular values defines the size of the output. + stride: int, optional, default = 1 + Use only every stride-th time step. By default, every time step is used. + skip : int, default=0 + skip the first initial n frames per trajectory. + ncov_max : int, default=infinity + limit the memory usage of the algorithm from [3]_ to an amount that corresponds + to ncov_max additional copies of each correlation matrix + + Notes + ----- + VAMP is a method for dimensionality reduction of Markov processes. + + The Koopman operator :math:`\mathcal{K}` is an integral operator + that describes conditional future expectation values. Let + :math:`p(\mathbf{x},\,\mathbf{y})` be the conditional probability + density of visiting an infinitesimal phase space volume around + point :math:`\mathbf{y}` at time :math:`t+\tau` given that the phase + space point :math:`\mathbf{x}` was visited at the earlier time + :math:`t`. Then the action of the Koopman operator on a function + :math:`f` can be written as follows: + + .. math:: + + \mathcal{K}f=\int p(\mathbf{x},\,\mathbf{y})f(\mathbf{y})\,\mathrm{dy}=\mathbb{E}\left[f(\mathbf{x}_{t+\tau})\mid\mathbf{x}_{t}=\mathbf{x}\right] + + The Koopman operator is defined without any reference to an + equilibrium distribution. Therefore it is well-defined in + situations where the dynamics is irreversible and/or non-stationary + such that no equilibrium distribution exists. + + If we approximate :math:`f` by a linear superposition of ansatz + functions :math:`\boldsymbol{\chi}` of the conformational + degrees of freedom (features), the operator :math:`\mathcal{K}` + can be approximated by a (finite-dimensional) matrix :math:`\mathbf{K}`. + + The approximation is computed as follows: From the time-dependent + input features :math:`\boldsymbol{\chi}(t)`, we compute the mean + :math:`\boldsymbol{\mu}_{0}` (:math:`\boldsymbol{\mu}_{1}`) from + all data excluding the last (first) :math:`\tau` steps of every + trajectory as follows: + + .. math:: + + \boldsymbol{\mu}_{0} :=\frac{1}{T-\tau}\sum_{t=0}^{T-\tau}\boldsymbol{\chi}(t) + + \boldsymbol{\mu}_{1} :=\frac{1}{T-\tau}\sum_{t=\tau}^{T}\boldsymbol{\chi}(t) + + Next, we compute the instantaneous covariance matrices + :math:`\mathbf{C}_{00}` and :math:`\mathbf{C}_{11}` and the + time-lagged covariance matrix :math:`\mathbf{C}_{01}` as follows: + + .. math:: + + \mathbf{C}_{00} :=\frac{1}{T-\tau}\sum_{t=0}^{T-\tau}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right]\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right]^{\top} + + \mathbf{C}_{11} :=\frac{1}{T-\tau}\sum_{t=\tau}^{T}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{1}\right]\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{1}\right]^{\top} + + \mathbf{C}_{01} :=\frac{1}{T-\tau}\sum_{t=0}^{T-\tau}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right]\left[\boldsymbol{\chi}(t+\tau)-\boldsymbol{\mu}_{1}\right]^{\top} + + The Koopman matrix is then computed as follows: + + .. math:: + + \mathbf{K}=\mathbf{C}_{00}^{-1}\mathbf{C}_{01} + + It can be shown [1]_ that the leading singular functions of the + half-weighted Koopman matrix + + .. math:: + + \bar{\mathbf{K}}:=\mathbf{C}_{00}^{-\frac{1}{2}}\mathbf{C}_{01}\mathbf{C}_{11}^{-\frac{1}{2}} + + encode the best reduced dynamical model for the time series. + + The singular functions can be computed by first performing the + singular value decomposition + + .. math:: + + \bar{\mathbf{K}}=\mathbf{U}^{\prime}\mathbf{S}\mathbf{V}^{\prime\top} + + and then mapping the input conformation to the left singular + functions :math:`\boldsymbol{\psi}` and right singular + functions :math:`\boldsymbol{\phi}` as follows: + + .. math:: + + \boldsymbol{\psi}(t):=\mathbf{U}^{\prime\top}\mathbf{C}_{00}^{-\frac{1}{2}}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right] + + \boldsymbol{\phi}(t):=\mathbf{V}^{\prime\top}\mathbf{C}_{11}^{-\frac{1}{2}}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{1}\right] + + + References + ---------- + .. [1] Wu, H. and Noe, F. 2017. Variational approach for learning Markov processes from time series data. + arXiv:1707.04659v1 + .. [2] Noe, F. and Clementi, C. 2015. Kinetic distance and kinetic maps from molecular dynamics simulation. + J. Chem. Theory. Comput. doi:10.1021/acs.jctc.5b00553 + .. [3] Chan, T. F., Golub G. H., LeVeque R. J. 1979. Updating formulae and pairwise algorithms for + computing sample variances. Technical Report STAN-CS-79-773, Department of Computer Science, Stanford University. + """ + StreamingEstimationTransformer.__init__(self) + + # empty dummy model instance + self._model = VAMPModel() + self.set_params(lag=lag, dim=dim, scaling=scaling, right=right, + epsilon=epsilon, stride=stride, skip=skip, ncov_max=ncov_max) + self._covar = None + self._model.update_model_params(dim=dim, epsilon=epsilon) + + def _estimate(self, iterable, **kw): + self._covar = LaggedCovariance(c00=True, c0t=True, ctt=True, remove_data_mean=True, reversible=False, + lag=self.lag, bessel=False, stride=self.stride, skip=self.skip, weights=None, + ncov_max=self.ncov_max) + indim = iterable.dimension() + + if isinstance(self.dim, int) and not self.dim <= indim: + raise RuntimeError("requested more output dimensions (%i) than dimension" + " of input data (%i)" % (self.dim, indim)) + + if self._logger_is_active(self._loglevel_DEBUG): + self._logger.debug("Running VAMP with tau=%i; Estimating two covariance matrices" + " with dimension (%i, %i)" % (self._lag, indim, indim)) + + self._covar.estimate(iterable, **kw) + self._model.update_model_params(mean_0=self._covar.mean, + mean_t=self._covar.mean_tau, + C00=self._covar.C00_, + C0t=self._covar.C0t_, + Ctt=self._covar.Ctt_) + self._diagonalize() + + return self._model + + def partial_fit(self, X): + """ incrementally update the covariances and mean. + + Parameters + ---------- + X: array, list of arrays, PyEMMA reader + input data.
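+
+        Examples
+        --------
+        An illustrative sketch with random data (three hypothetical trajectories;
+        any input accepted by :func:`pyemma.coordinates.source` works the same way):
+
+        >>> import numpy as np
+        >>> v = VAMP(lag=10)
+        >>> for traj in [np.random.randn(1000, 5) for _ in range(3)]:
+        ...     v = v.partial_fit(traj)
+        >>> sv = v.singular_values  # first access triggers diagonalization of the accumulated covariances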
+
+        Notes
+        -----
+        The projection matrix is calculated upon its first access. + """ + from pyemma.coordinates import source + iterable = source(X) + + if isinstance(self.dim, int): + indim = iterable.dimension() + if not self.dim <= indim: + raise RuntimeError("requested more output dimensions (%i) than dimension" + " of input data (%i)" % (self.dim, indim)) + + if self._covar is None: + self._covar = LaggedCovariance(c00=True, c0t=True, ctt=True, remove_data_mean=True, reversible=False, + lag=self.lag, bessel=False, stride=self.stride, skip=self.skip, weights=None, + ncov_max=self.ncov_max) + self._covar.partial_fit(iterable) + self._model.update_model_params(mean_0=self._covar.mean, # TODO: inefficient, fixme + mean_t=self._covar.mean_tau, + C00=self._covar.C00_, + C0t=self._covar.C0t_, + Ctt=self._covar.Ctt_) + + # self._used_data = self._covar._used_data + self._estimated = False + + return self + + def _diagonalize(self): + # diagonalize with low rank approximation + self._logger.debug("diagonalize covariance matrices") + self.model._diagonalize(self.scaling) + self._logger.debug("finished diagonalization.") + self._estimated = True + + def dimension(self): + return self._model.dimension() + + def _transform_array(self, X): + r"""Projects the data onto the dominant singular functions. + + Parameters + ---------- + X : ndarray(n, m) + the input data + + Returns + ------- + Y : ndarray((n, d)) + the projected data, with d = self.dimension(). + If `self.right` is True, projection will be on the right singular + functions. Otherwise, projection will be on the left singular + functions. + """ + # TODO: in principle get_output should not return data for *all* frames! + # TODO: implement our own iterators? This would also include random access to be complete... + if self.right: + X_meanfree = X - self._model.mean_t + Y = np.dot(X_meanfree, self._model.V[:, 0:self.dimension()]) + else: + X_meanfree = X - self._model.mean_0 + Y = np.dot(X_meanfree, self._model.U[:, 0:self.dimension()]) + + return Y.astype(self.output_type()) + + @property + def singular_values(self): + r"""Singular values of the half-weighted Koopman matrix (usually denoted :math:`\sigma`) + + Returns + ------- + singular values: 1-D np.array + """ + return self._model.singular_values + + @property + def singular_vectors_right(self): + r"""Transformation matrix that represents the linear map from feature space to the space of right singular functions. + + Notes + ----- + Right "singular vectors" V of the VAMP problem (equation 13 in [1]_), columnwise + + Returns + ------- + vectors: 2-D ndarray + Coefficients that express the right singular functions in the + basis of mean-free input features. + + References + ---------- + .. [1] Wu, H. and Noe, F. 2017. Variational approach for learning Markov processes from time series data. + arXiv:1707.04659v1 + """ + return self._model.V + + @property + def singular_vectors_left(self): + r"""Transformation matrix that represents the linear map from feature space to the space of left singular functions. + + Notes + ----- + Left "singular vectors" U of the VAMP problem (equation 13 in [1]_), columnwise + + Returns + ------- + vectors: 2-D ndarray + Coefficients that express the left singular functions in the + basis of mean-free input features. + + References + ---------- + .. [1] Wu, H. and Noe, F. 2017. Variational approach for learning Markov processes from time series data. + arXiv:1707.04659v1 + """ + return self._model.U + + @property + def cumvar(self): + r"""Cumulative sum of the squared and normalized singular values + + Returns + ------- + cumvar: 1D np.array + """ + return self._model.cumvar + + @property + def show_progress(self): + if self._covar is None: + return False + else: + return self._covar.show_progress + + @show_progress.setter + def show_progress(self, value): + if self._covar is not None: + self._covar.show_progress = value + + def expectation(self, observables, statistics, lag_multiple=1, observables_mean_free=False, + statistics_mean_free=False): + r"""Compute future expectation of observable or covariance using the approximated Koopman operator. + + Parameters + ---------- + observables : np.ndarray((input_dimension, n_observables)) + Coefficients that express one or multiple observables in + the basis of the input features. + + statistics : np.ndarray((input_dimension, n_statistics)), optional + Coefficients that express one or multiple statistics in + the basis of the input features. + This parameter can be None. In that case, this method + returns the future expectation value of the observable(s). + + lag_multiple : int + If > 1, extrapolate to a multiple of the estimator's lag + time by assuming Markovianity of the approximated Koopman + operator. + + observables_mean_free : bool, default=False + If true, coefficients in `observables` refer to the input + features with feature means removed. + If false, coefficients in `observables` refer to the + unmodified input features. + + statistics_mean_free : bool, default=False + If true, coefficients in `statistics` refer to the input + features with feature means removed. + If false, coefficients in `statistics` refer to the + unmodified input features. + + Notes + ----- + A "future expectation" of an observable g is the average of g computed + over a time window that has the same total length as the input data + from which the Koopman operator was estimated but is shifted + by lag_multiple*tau time steps into the future (where tau is the lag + time). + + It is computed with the equation: + + .. math:: + + \mathbb{E}[g]_{\rho_{n}}=\mathbf{q}^{T}\mathbf{P}^{n-1}\mathbf{e}_{1} + + where + + .. math:: + + P_{ij}=\sigma_{i}\langle\psi_{i},\phi_{j}\rangle_{\rho_{1}} + + and + + .. math:: + + q_{i}=\langle g,\phi_{i}\rangle_{\rho_{1}} + + and :math:`\mathbf{e}_{1}` is the first canonical unit vector. + + + A model prediction of time-lagged covariances between the + observable f and the statistic g at a lag-time of lag_multiple*tau + is computed with the equation: + + .. math:: + + \mathrm{cov}[g,\,f;n\tau]=\mathbf{q}^{T}\mathbf{P}^{n-1}\boldsymbol{\Sigma}\mathbf{r} + + where :math:`r_{i}=\langle\psi_{i},f\rangle_{\rho_{0}}` and + :math:`\boldsymbol{\Sigma}=\mathrm{diag(\boldsymbol{\sigma})}` . + """ + return self._model.expectation(observables, statistics, lag_multiple=lag_multiple, + statistics_mean_free=statistics_mean_free, + observables_mean_free=observables_mean_free) + + def cktest(self, n_observables=None, observables='phi', statistics='psi', mlags=10, n_jobs=1, show_progress=True, + iterable=None): + r"""Do the Chapman-Kolmogorov test by computing predictions for higher lag times and by performing estimations at higher lag times. + + Notes + ----- + + This method computes two sets of time-lagged covariance matrices + + * estimates at higher lag times : + + ..
+        """
+        return self._model.expectation(observables, statistics, lag_multiple=lag_multiple,
+                                       statistics_mean_free=statistics_mean_free,
+                                       observables_mean_free=observables_mean_free)
+
+    def cktest(self, n_observables=None, observables='phi', statistics='psi', mlags=10, n_jobs=1, show_progress=True,
+               iterable=None):
+        r"""Do the Chapman-Kolmogorov test by comparing predictions for higher lag times with estimations at higher lag times.
+
+        Notes
+        -----
+        This method computes two sets of time-lagged covariance matrices:
+
+        * estimates at higher lag times:
+
+          .. math::
+
+              \left\langle \mathbf{K}(n\tau)g_{i},f_{j}\right\rangle_{\rho_{0}}
+
+          where :math:`\rho_{0}` is the empirical distribution implicitly defined
+          by all data points from time steps 0 to T-tau in all trajectories,
+          :math:`\mathbf{K}(n\tau)` is a rank-reduced Koopman matrix estimated
+          at the lag-time n*tau and g and f are some functions of the data.
+          Rank-reduction of the Koopman matrix is controlled by the `dim`
+          parameter of :func:`vamp <pyemma.coordinates.vamp>`.
+
+        * predictions at higher lag times:
+
+          .. math::
+
+              \left\langle \mathbf{K}^{n}(\tau)g_{i},f_{j}\right\rangle_{\rho_{0}}
+
+          where :math:`\mathbf{K}^{n}` is the n-th power of the rank-reduced
+          Koopman matrix contained in self.
+
+        The Chapman-Kolmogorov test is to compare the predictions to the
+        estimates.
+
+        Parameters
+        ----------
+        n_observables : int, optional, default=None
+            Limit the number of default observables (and of default statistics)
+            to this number.
+            Only used if `observables` is 'phi' or `statistics` is 'psi'.
+
+        observables : np.ndarray((input_dimension, n_observables)) or 'phi'
+            Coefficients that express one or multiple observables :math:`g`
+            in the basis of the input features.
+            This parameter can be 'phi'. In that case, the dominant
+            right singular functions of the Koopman operator estimated
+            at the smallest lag time are used as default observables.
+
+        statistics : np.ndarray((input_dimension, n_statistics)) or 'psi'
+            Coefficients that express one or multiple statistics :math:`f`
+            in the basis of the input features.
+            This parameter can be 'psi'. In that case, the dominant
+            left singular functions of the Koopman operator estimated
+            at the smallest lag time are used as default statistics.
+
+        mlags : int or int-array, default=10
+            multiples of lag times for testing the model, e.g. range(10).
+            A single int will trigger a range, i.e. mlags=10 maps to
+            mlags=range(10).
+            Note that you need to be able to do a model prediction for each
+            of these lag time multiples, e.g. the value 0 only makes sense
+            if model.expectation(lag_multiple=0) will work.
+
+        n_jobs : int, default=1
+            how many jobs to use during calculation
+
+        show_progress : bool, default=True
+            Show progressbars for calculation?
+
+        iterable : any data format that `pyemma.coordinates.vamp()` accepts as input, optional
+            If `iterable` is None, the same data source with which VAMP
+            was initialized will be used for all estimations.
+            Otherwise, all estimates (not predictions) will be computed
+            from the data contained in `iterable`.
+
+        Returns
+        -------
+        vckv : :class:`VAMPChapmanKolmogorovValidator`
+            Contains the estimated and the predicted covariance matrices.
+            The object can be plotted with :func:`plot_cktest <pyemma.plots.plot_cktest>`
+            with the option `y01=False`.
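+
+        Examples
+        --------
+        A minimal sketch with made-up data (array shape and lag times are
+        illustrative only):
+
+        >>> import numpy as np
+        >>> from pyemma.coordinates import vamp
+        >>> v = vamp(np.random.randn(1000, 5), lag=5)    # doctest: +SKIP
+        >>> cktest = v.cktest(mlags=4)                   # doctest: +SKIP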
+        """
+        if n_observables is not None:
+            if n_observables > self.dimension():
+                warnings.warn('Selected singular functions as observables but dimension '
+                              'is lower than requested number of observables.')
+                n_observables = self.dimension()
+        else:
+            n_observables = self.dimension()
+
+        if isinstance(observables, str) and observables == 'phi':
+            observables = self.singular_vectors_right[:, 0:n_observables]
+            observables_mean_free = True
+        else:
+            ensure_ndarray(observables, ndim=2)
+            observables_mean_free = False
+
+        if isinstance(statistics, str) and statistics == 'psi':
+            statistics = self.singular_vectors_left[:, 0:n_observables]
+            statistics_mean_free = True
+        else:
+            ensure_ndarray_or_None(statistics, ndim=2)
+            statistics_mean_free = False
+
+        ck = VAMPChapmanKolmogorovValidator(self, self, observables, statistics, observables_mean_free,
+                                            statistics_mean_free, mlags=mlags, n_jobs=n_jobs,
+                                            show_progress=show_progress)
+
+        if iterable is None:
+            iterable = self.data_producer
+
+        ck.estimate(iterable)
+        return ck
+
+    def score(self, test_data=None, score_method='VAMP2'):
+        """Compute the VAMP score for this model, or the cross-validation score between self and a second model estimated from different data.
+
+        Parameters
+        ----------
+        test_data : any data format that `pyemma.coordinates.vamp()` accepts as input
+
+            If `test_data` is not None, this method computes the cross-validation score
+            between self and a VAMP model estimated from `test_data`. It is assumed that
+            self was estimated from the "training" data and `test_data` is the test data.
+            The score is computed for one realization of self and `test_data`. Estimation
+            of the average cross-validation score and partitioning of data into test and
+            training parts is not performed by this method.
+
+            If `test_data` is None, this method computes the VAMP score for the model
+            contained in self.
+
+            The model that is estimated from `test_data` will inherit all hyperparameters
+            from self.
+
+        score_method : str, optional, default='VAMP2'
+            Available scores are based on the variational approach for Markov processes [1]_:
+
+            * 'VAMP1'  Sum of singular values of the half-weighted Koopman matrix [1]_ .
+                       If the model is reversible, this is equal to the sum of
+                       Koopman matrix eigenvalues, also called Rayleigh quotient [1]_.
+            * 'VAMP2'  Sum of squared singular values of the half-weighted Koopman matrix [1]_ .
+                       If the model is reversible, this is equal to the kinetic variance [2]_ .
+            * 'VAMPE'  Approximation error of the estimated Koopman operator with respect to
+                       the true Koopman operator up to an additive constant [1]_ .
+
+        Returns
+        -------
+        score : float
+            If `test_data` is not None, returns the cross-validation VAMP score between
+            self and the model estimated from `test_data`. Otherwise, returns the selected
+            VAMP score of self.
+
+        References
+        ----------
+        .. [1] Wu, H. and Noe, F. 2017. Variational approach for learning Markov processes from time series data.
+            arXiv:1707.04659v1
+        .. [2] Noe, F. and Clementi, C. 2015. Kinetic distance and kinetic maps from molecular dynamics simulation.
+            J. Chem. Theory Comput. doi:10.1021/acs.jctc.5b00553
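+
+        Examples
+        --------
+        Cross-validation sketch (the even/odd split of a hypothetical list
+        of trajectories ``data`` is illustrative only):
+
+        >>> from pyemma.coordinates import vamp
+        >>> v = vamp(data[0::2], lag=5)              # doctest: +SKIP
+        >>> score = v.score(test_data=data[1::2])    # doctest: +SKIP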
+        """
+        if test_data is None:
+            return self.model.score(None, score_method=score_method)
+        else:
+            from pyemma._ext.sklearn.base import clone as clone_estimator
+            # clone only when a test model actually needs to be estimated
+            est = clone_estimator(self)
+            est.estimate(test_data)
+            return self.model.score(est.model, score_method=score_method)
+
+
+class VAMPChapmanKolmogorovValidator(LaggedModelValidator):
+    __serialize_version = 0
+    __serialize_fields = ('nsets', 'statistics', 'observables', 'observables_mean_free', 'statistics_mean_free')
+
+    def __init__(self, model, estimator, observables, statistics, observables_mean_free, statistics_mean_free,
+                 mlags=10, n_jobs=1, show_progress=True):
+        r"""
+        Note
+        ----
+        It is recommended that you create this object by calling the
+        `cktest` method of a VAMP object created with
+        :func:`vamp <pyemma.coordinates.vamp>`.
+
+        Parameters
+        ----------
+        model : Model
+            Model with the smallest lag time. Is used to make predictions
+            for larger lag times.
+
+        estimator : Estimator
+            Parametrized Estimator that has produced the model.
+            Is used as a prototype for estimating models at higher lag times.
+
+        observables : np.ndarray((input_dimension, n_observables))
+            Coefficients that express one or multiple observables in
+            the basis of the input features.
+
+        statistics : np.ndarray((input_dimension, n_statistics))
+            Coefficients that express one or multiple statistics in
+            the basis of the input features.
+
+        observables_mean_free : bool, default=False
+            If true, coefficients in `observables` refer to the input
+            features with feature means removed.
+            If false, coefficients in `observables` refer to the
+            unmodified input features.
+
+        statistics_mean_free : bool, default=False
+            If true, coefficients in `statistics` refer to the input
+            features with feature means removed.
+            If false, coefficients in `statistics` refer to the
+            unmodified input features.
+
+        mlags : int or int-array, default=10
+            multiples of lag times for testing the model, e.g. range(10).
+            A single int will trigger a range, i.e. mlags=10 maps to
+            mlags=range(10).
+            Note that you need to be able to do a model prediction for each
+            of these lag time multiples, e.g. the value 0 only makes sense
+            if model.expectation(lag_multiple=0) will work.
+
+        n_jobs : int, default=1
+            how many jobs to use during calculation
+
+        show_progress : bool, default=True
+            Show progressbars for calculation?
+
+        Notes
+        -----
+        The object can be plotted with :func:`plot_cktest <pyemma.plots.plot_cktest>`
+        with the option `y01=False`.
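+
+        Examples
+        --------
+        Recommended construction goes through the estimator and the result
+        can then be plotted (sketch; ``v`` is a hypothetical estimated VAMP
+        object):
+
+        >>> from pyemma.plots import plot_cktest     # doctest: +SKIP
+        >>> validator = v.cktest(mlags=4)            # doctest: +SKIP
+        >>> plot_cktest(validator, y01=False)        # doctest: +SKIP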
+        """
+        LaggedModelValidator.__init__(self, model, estimator, mlags=mlags,
+                                      n_jobs=n_jobs, show_progress=show_progress)
+        self.statistics = statistics
+        self.observables = observables
+        self.observables_mean_free = observables_mean_free
+        self.statistics_mean_free = statistics_mean_free
+        if self.statistics is not None:
+            self.nsets = min(self.observables.shape[1], self.statistics.shape[1])
+
+    def _compute_observables(self, model, estimator, mlag=1):
+        # for lag time 0 we return a matrix of nan, until the correct solution is implemented
+        if mlag == 0 or model is None:
+            if self.statistics is None:
+                return np.zeros(self.observables.shape[1]) + np.nan
+            else:
+                return np.zeros((self.observables.shape[1], self.statistics.shape[1])) + np.nan
+        else:
+            return model.expectation(statistics=self.statistics, observables=self.observables, lag_multiple=mlag,
+                                     statistics_mean_free=self.statistics_mean_free,
+                                     observables_mean_free=self.observables_mean_free)
+
+    def _compute_observables_conf(self, model, estimator, mlag=1):
+        raise NotImplementedError('estimation of confidence intervals not yet implemented for VAMP')
diff --git a/pyemma/msm/estimators/__init__.py b/pyemma/msm/estimators/__init__.py
index b562a3a9e..56b52eb53 100644
--- a/pyemma/msm/estimators/__init__.py
+++ b/pyemma/msm/estimators/__init__.py
@@ -15,13 +15,13 @@
 #
 # You should have received a copy of the GNU Lesser General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
-
 from __future__ import absolute_import
+
 __author__ = 'noe'
 
 from .maximum_likelihood_msm import MaximumLikelihoodMSM
 from .maximum_likelihood_msm import OOMReweightedMSM
-from .maximum_likelihood_msm import AugmentedMarkovModel 
+from .maximum_likelihood_msm import AugmentedMarkovModel
 from .bayesian_msm import BayesianMSM
 from .maximum_likelihood_hmsm import MaximumLikelihoodHMSM
 from .bayesian_hmsm import BayesianHMSM
diff --git a/pyemma/msm/estimators/lagged_model_validators.py b/pyemma/msm/estimators/lagged_model_validators.py
index a39b3c141..da45352cf 100644
--- a/pyemma/msm/estimators/lagged_model_validators.py
+++ b/pyemma/msm/estimators/lagged_model_validators.py
@@ -17,7 +17,7 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 
 from __future__ import absolute_import
-
+from six.moves import range
 import math
 
 import numpy as np
@@ -81,7 +81,10 @@ def __init__(self, model, estimator, mlags=None, conf=0.95, err_est=False,
         self.test_estimator = estimator
 
         # set mlags
-        maxlength = np.max([len(dtraj) for dtraj in estimator.discrete_trajectories_full])
+        try:
+            maxlength = np.max([len(dtraj) for dtraj in estimator.discrete_trajectories_full])
+        except AttributeError:
+            maxlength = np.max(estimator.trajectory_lengths())
         maxmlag = int(math.floor(maxlength / estimator.lag))
         if mlags is None:
             mlags = maxmlag
diff --git a/pyemma/msm/estimators/maximum_likelihood_hmsm.py b/pyemma/msm/estimators/maximum_likelihood_hmsm.py
index d659e1d3c..5e77ab21c 100644
--- a/pyemma/msm/estimators/maximum_likelihood_hmsm.py
+++ b/pyemma/msm/estimators/maximum_likelihood_hmsm.py
@@ -17,7 +17,7 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 
 from __future__ import absolute_import
-#
+from six.moves import range
 
 from pyemma.util.annotators import alias, aliased, fix_docs
 import numpy as _np
diff --git a/pyemma/msm/estimators/maximum_likelihood_msm.py b/pyemma/msm/estimators/maximum_likelihood_msm.py
index e68b56dd3..3120b1141 100644
--- a/pyemma/msm/estimators/maximum_likelihood_msm.py
+++ b/pyemma/msm/estimators/maximum_likelihood_msm.py
@@ -228,7 +228,7 @@ def score(self, dtrajs, score_method=None, score_k=None):
         score_method : str
             Overwrite scoring method if desired. If `None`, the estimators scoring
             method will be used. See __init__ for documentation.
-        score_k : str
+        score_k : int or None
             Overwrite scoring rank if desired. If `None`, the estimators scoring rank
             will be used. See __init__ for documentation.
         score_method : str, optional, default='VAMP2'
diff --git a/pyemma/util/_config.py b/pyemma/util/_config.py
index 8bfe25849..86dac0225 100644
--- a/pyemma/util/_config.py
+++ b/pyemma/util/_config.py
@@ -17,6 +17,7 @@
 
 from __future__ import absolute_import, print_function
 
+import six
 from six.moves.configparser import ConfigParser
 import os
 import shutil
@@ -32,6 +33,10 @@ class ReadConfigException(Exception):
     pass
 
+if six.PY2:
+    class NotADirectoryError(Exception):
+        pass
+
 __all__ = ('Config', )
@@ -172,10 +177,10 @@ def cfg_dir(self, pyemma_cfg_dir):
         if not os.path.exists(pyemma_cfg_dir):
             try:
                 mkdir_p(pyemma_cfg_dir)
-            except EnvironmentError:
-                raise ConfigDirectoryException("could not create configuration directory '%s'" % pyemma_cfg_dir)
             except NotADirectoryError:  # on Python 3
                 raise ConfigDirectoryException("pyemma cfg dir (%s) is not a directory" % pyemma_cfg_dir)
+            except EnvironmentError:
+                raise ConfigDirectoryException("could not create configuration directory '%s'" % pyemma_cfg_dir)
 
         if not os.path.isdir(pyemma_cfg_dir):
             raise ConfigDirectoryException("%s is no valid directory" % pyemma_cfg_dir)
diff --git a/pyemma/util/annotators.py b/pyemma/util/annotators.py
index 5843d4fc9..ecbe05f37 100644
--- a/pyemma/util/annotators.py
+++ b/pyemma/util/annotators.py
@@ -28,7 +28,6 @@
     'deprecated',
     'shortcut',
     'fix_docs',
-    'estimation_required',
 ]
diff --git a/pyemma/util/types.py b/pyemma/util/types.py
index ca3957134..65a0834f3 100644
--- a/pyemma/util/types.py
+++ b/pyemma/util/types.py
@@ -27,6 +27,8 @@
 import numbers
 import collections
 
+from six import string_types
+
 # ======================================================================================================================
 # BASIC TYPE CHECKS
 # ======================================================================================================================
@@ -137,7 +139,7 @@ def is_float_array(l):
         return False
 
 def is_string(s):
-    return isinstance(s, str)
+    return isinstance(s, string_types)
 
 def is_iterable(I):
     return isinstance(I, collections.Iterable)
@@ -147,7 +149,7 @@ def is_list(S):
     return isinstance(S, (list, tuple))
 
 def is_list_of_string(S):
-    return isinstance(S, (list, tuple)) and (all(isinstance(s, str) for s in S))
+    return isinstance(S, (list, tuple)) and (all(isinstance(s, string_types) for s in S))
 
 def ensure_dtraj(dtraj):
     r"""Makes sure that dtraj is a discrete trajectory (array of int)
@@ -171,8 +173,8 @@ def ensure_dtraj_list(dtrajs):
         if is_list_of_int(dtrajs):
             return [np.array(dtrajs, dtype=int)]
         else:
-            for i in range(len(dtrajs)):
-                dtrajs[i] = ensure_dtraj(dtrajs[i])
+            for i, dtraj in enumerate(dtrajs):
+                dtrajs[i] = ensure_dtraj(dtraj)
             return dtrajs
     else:
         return [ensure_dtraj(dtrajs)]
@@ -476,8 +478,8 @@ def ensure_traj_list(trajs):
         return [np.array(trajs)[:,None]]
     else:
         res = []
-        for i in range(len(trajs)):
-            res.append(ensure_traj(trajs[i]))
+        for traj in trajs:
+            res.append(ensure_traj(traj))
         return res
     else:
         # looks like this is one trajectory
diff --git a/setup.py b/setup.py
index b2f41c2ce..b201bad95 100755
--- a/setup.py
+++ b/setup.py
@@ -59,6 +59,7 @@
 Operating System :: MacOS :: MacOS X
 Operating System :: POSIX
 Operating System :: Microsoft :: Windows
+Programming Language :: Python :: 2.7
 Programming Language :: Python :: 3
 Topic :: Scientific/Engineering :: Bio-Informatics
 Topic :: Scientific/Engineering :: Chemistry
@@ -66,7 +67,12 @@
 Topic :: Scientific/Engineering :: Physics
 """
 
-
+from setup_util import lazy_cythonize
+try:
+    from setuptools import setup, Extension, find_packages
+except ImportError:
+    print("PyEMMA requires setuptools. Please install it with conda or pip.")
+    sys.exit(1)
 
 ###############################################################################
 # Extensions