diff --git a/devtools/ci/travis/install_miniconda.sh b/devtools/ci/travis/install_miniconda.sh index ace5acd15..76b5ac968 100755 --- a/devtools/ci/travis/install_miniconda.sh +++ b/devtools/ci/travis/install_miniconda.sh @@ -30,5 +30,5 @@ else # if it does not exist, we need to install miniconda fi # we want to have an up to date conda-build. -conda install conda-build=3.2 +conda install conda-build=3 conda info -a # for debugging diff --git a/devtools/conda-recipe/meta.yaml b/devtools/conda-recipe/meta.yaml index 6095c74dd..e8f26ef92 100644 --- a/devtools/conda-recipe/meta.yaml +++ b/devtools/conda-recipe/meta.yaml @@ -24,7 +24,7 @@ requirements: - numpy 1.9.* # [not (win and (py35 or py36))] - numpy 1.9.* # [win and py35] - numpy 1.11.* # [win and py36] - - python >=3 + - python - scipy - setuptools - gcc # [ not win ] @@ -42,10 +42,11 @@ requirements: - numpy >=1.11,<1.14 # [win and py36] - pathos - psutil >3.1 - - python >=3 + - python - pyyaml - scipy - setuptools + - six >=1.10 - thermotools >=0.2.6 - tqdm diff --git a/devtools/conda-recipe/run_test.py b/devtools/conda-recipe/run_test.py index 71bef7209..dcece6801 100644 --- a/devtools/conda-recipe/run_test.py +++ b/devtools/conda-recipe/run_test.py @@ -9,7 +9,9 @@ # where to write junit xml junit_xml = os.path.join(os.getenv('CIRCLE_TEST_REPORTS', os.path.expanduser('~')), 'reports', 'junit.xml') -os.makedirs(os.path.dirname(junit_xml), exist_ok=True) +target_dir = os.path.dirname(junit_xml) +if not os.path.exists(target_dir): + os.makedirs(target_dir) print('junit destination:', junit_xml) njobs_args = '-p no:xdist' if os.getenv('TRAVIS') else '-n2' diff --git a/doc/source/CHANGELOG.rst b/doc/source/CHANGELOG.rst index 35a82c182..22d0e1d7c 100644 --- a/doc/source/CHANGELOG.rst +++ b/doc/source/CHANGELOG.rst @@ -5,7 +5,7 @@ Changelog ---------------- As of this version the usage of Python 2.7 is officially deprecated. Please upgrade -your Python installation to at least version 3.5. +your Python installation to at least version 3.5 to receive future updates. **New features**: @@ -13,11 +13,12 @@ your Python installation to at least version 3.5. data into estimation of Markov models from molecular simulations. The method is described in [1]. #1111 - msm: Added mincount_connectivity argument to MSM estimators. This option makes it possible to omit counts below a given threshold. #1106 -- coodinates: selection based features allow alignment to a reference structure. #1184 +- coordinates: selection based features allow alignment to a reference structure. #1184 - coordinates: two new center of mass features: ResidueCOMFeature() and GroupCOMFeature() - coordinates: new configuration variable 'default_chunksize' can be set to limit the size of a fragment extracted per iteration from a data source. This is invariant to the dimension of data sets. #1190 - datasets: added Prinz potential (quadwell). #1226 +- coordinates: added VAMP estimator.
#1237 - References: diff --git a/pyemma/_base/estimator.py b/pyemma/_base/estimator.py index 88564d92f..64a3d1b68 100644 --- a/pyemma/_base/estimator.py +++ b/pyemma/_base/estimator.py @@ -299,7 +299,8 @@ def estimate_param_scan(estimator, X, param_sets, evaluate=None, evaluate_args=N if evaluate is not None and evaluate_args is not None and len(evaluate) != len(evaluate_args): raise ValueError("length mismatch: evaluate ({}) and evaluate_args ({})".format(len(evaluate), len(evaluate_args))) - if progress_reporter is not None: + show_progress = progress_reporter is not None and show_progress + if show_progress: progress_reporter._progress_register(len(estimators), stage=0, description="estimating %s" % str(estimator.__class__.__name__)) @@ -317,8 +318,7 @@ def estimate_param_scan(estimator, X, param_sets, evaluate=None, evaluate_args=N from pathos.multiprocessing import Pool as Parallel pool = Parallel(processes=n_jobs) args = list(task_iter) - if progress_reporter is not None: - progress_reporter._progress_register(len(estimators), stage=0, description="estimating %s" % str(estimator.__class__.__name__)) + if show_progress: from pyemma._base.model import SampledModel for a in args: if isinstance(a[0], SampledModel): @@ -352,7 +352,7 @@ def error_callback(*args, **kw): estimators[0].logger.debug('estimating %s with n_jobs=1 because of the setting or ' 'you do not have a POSIX system', estimator) res = [] - if progress_reporter is not None: + if show_progress: from pyemma._base.model import SampledModel if isinstance(estimator, SampledModel): for e in estimators: @@ -361,10 +361,10 @@ for estimator, param_set in zip(estimators, param_sets): res.append(_estimate_param_scan_worker(estimator, param_set, X, evaluate, evaluate_args, failfast, return_exceptions)) - if progress_reporter is not None and show_progress: + if show_progress: progress_reporter._progress_update(1, stage=0) - if progress_reporter is not None and show_progress: + if show_progress: progress_reporter._progress_force_finish(0) # done diff --git a/pyemma/_ext/variational/solvers/direct.py b/pyemma/_ext/variational/solvers/direct.py index db442aedf..d3c2be57b 100644 --- a/pyemma/_ext/variational/solvers/direct.py +++ b/pyemma/_ext/variational/solvers/direct.py @@ -125,7 +125,7 @@ def spd_inv(W, epsilon=1e-10, method='QR', canonical_signs=False): return Winv -def spd_inv_sqrt(W, epsilon=1e-10, method='QR', canonical_signs=False): +def spd_inv_sqrt(W, epsilon=1e-10, method='QR', canonical_signs=False, return_rank=False): """ Computes :math:`W^{-1/2}` of symmetric positive-definite matrix :math:`W`. @@ -153,14 +153,18 @@ def spd_inv_sqrt(W, epsilon=1e-10, method='QR', canonical_signs=False): Matrix :math:`L` from the decomposition :math:`W^{-1} = L L^T`.
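+    rank : int
+        Rank of W; returned only if `return_rank` is True.
+
+    Examples
+    --------
+    A minimal sketch of the new `return_rank` flag (illustrative values; assumes a
+    well-conditioned input, so no eigenvalue is truncated by `epsilon`):
+
+    >>> import numpy as np
+    >>> W = np.array([[2.0, 0.5], [0.5, 1.0]])  # small SPD matrix
+    >>> L, rank = spd_inv_sqrt(W, return_rank=True)
+    >>> np.allclose(L.dot(L), np.linalg.inv(W))  # W^{-1/2} W^{-1/2} == W^{-1}
+    True
+    >>> rank
+    2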
""" - if (_np.shape(W)[0] == 1): - Winv = 1./_np.sqrt(W[0,0]) + if _np.shape(W)[0] == 1: + Winv = 1./_np.sqrt(W[0, 0]) + sm = _np.ones(1) else: sm, Vm = spd_eig(W, epsilon=epsilon, method=method, canonical_signs=canonical_signs) Winv = _np.dot(Vm, _np.diag(1.0 / _np.sqrt(sm))).dot(Vm.T) # return split - return Winv + if return_rank: + return Winv, sm.shape[0] + else: + return Winv def spd_inv_split(W, epsilon=1e-10, method='QR', canonical_signs=False): diff --git a/pyemma/coordinates/__init__.py b/pyemma/coordinates/__init__.py index c7b5dbfba..0b3adf034 100644 --- a/pyemma/coordinates/__init__.py +++ b/pyemma/coordinates/__init__.py @@ -51,6 +51,7 @@ pca tica + vamp **Clustering Algorithms** @@ -84,6 +85,7 @@ transform.PCA transform.TICA + transform.VAMP **Covariance estimation** diff --git a/pyemma/coordinates/acf.py b/pyemma/coordinates/acf.py index 86eb8d489..746bcde3d 100644 --- a/pyemma/coordinates/acf.py +++ b/pyemma/coordinates/acf.py @@ -18,7 +18,6 @@ -from __future__ import absolute_import, print_function import numpy as np import sys diff --git a/pyemma/coordinates/api.py b/pyemma/coordinates/api.py index e8127c59b..75ad04717 100644 --- a/pyemma/coordinates/api.py +++ b/pyemma/coordinates/api.py @@ -51,6 +51,7 @@ 'save_trajs', 'pca', # transform 'tica', + 'vamp', 'covariance_lagged', 'cluster_regspace', # cluster 'cluster_kmeans', @@ -375,9 +376,9 @@ def source(inp, features=None, top=None, chunksize=None, **kw): # CASE 1: input is a string or list of strings # check: if single string create a one-element list - if isinstance(inp, str) or ( + if isinstance(inp, _string_types) or ( isinstance(inp, (list, tuple)) - and (any(isinstance(item, (list, tuple, str)) for item in inp) or len(inp) is 0)): + and (any(isinstance(item, (list, tuple, _string_types)) for item in inp) or len(inp) is 0)): reader = create_file_reader(inp, top, features, chunksize=cs, **kw) elif isinstance(inp, _np.ndarray) or (isinstance(inp, (list, tuple)) @@ -716,7 +717,7 @@ def save_traj(traj_inp, indexes, outfile, top=None, stride = 1, chunksize=None, # Do we have what we need? if not isinstance(traj_inp, (list, tuple)): raise TypeError("traj_inp has to be of type list, not %s" % type(traj_inp)) - if not isinstance(top, (str, Topology, Trajectory)): + if not isinstance(top, (_string_types, Topology, Trajectory)): raise TypeError("traj_inp cannot be a list of files without an input " "top of type str (eg filename.pdb), mdtraj.Trajectory or mdtraj.Topology. " "Got type %s instead" % type(top)) @@ -1255,10 +1256,160 @@ def tica(data=None, lag=10, dim=-1, var_cutoff=0.95, kinetic_map=True, commute_m return res -def covariance_lagged(data=None, c00=True, c0t=True, ctt=False, remove_constant_mean=None, remove_data_mean=False, - reversible=False, bessel=True, lag=0, weights="empirical", stride=1, skip=0, chunksize=None): +def vamp(data=None, lag=10, dim=None, scaling=None, right=True, ncov_max=float('inf'), + stride=1, skip=0, chunksize=None): + r""" Variational approach for Markov processes (VAMP) [1]_. + + Parameters + ---------- + lag : int + lag time + dim : float or int + Number of dimensions to keep: + + * if dim is not set all available ranks are kept: + `n_components == min(n_samples, n_features)` + * if dim is an integer >= 1, this number specifies the number + of dimensions to keep. By default this will use the kinetic + variance. 
+        * if dim is a float with ``0 < dim < 1``, select the number + of dimensions such that the amount of kinetic variance + that needs to be explained is greater than the percentage + specified by dim. + scaling : None or string + Scaling to be applied to the VAMP order parameters upon transformation + + * None: no scaling will be applied, variance of the order parameters is 1 + * 'kinetic map' or 'km': order parameters are scaled by the corresponding singular value. + Only the left singular functions induce a kinetic map. + Therefore scaling='km' is only effective if `right` is False. + right : boolean + Whether to compute the right singular functions. + If `right==True`, `get_output()` will return the right singular + functions. Otherwise, `get_output()` will return the left singular + functions. + Beware that only `frames[tau:, :]` of each trajectory returned + by `get_output()` contain valid values of the right singular + functions. Conversely, only `frames[0:-tau, :]` of each + trajectory returned by `get_output()` contain valid values of + the left singular functions. The remaining frames might + possibly be interpreted as some extrapolation. + epsilon : float + singular value cutoff. Singular values of :math:`C_{00}` and :math:`C_{11}` with + norms <= epsilon will be cut off. The remaining number of + singular values defines the size of the output. + stride: int, optional, default = 1 + Use only every stride-th time step. By default, every time step is used. + skip : int, default=0 + skip the first initial n frames per trajectory. + ncov_max : int, default=infinity + limit the memory usage of the algorithm from [3]_ to an amount that corresponds + to ncov_max additional copies of each correlation matrix + + Notes + ----- + VAMP is a method for dimensionality reduction of Markov processes. + + The Koopman operator :math:`\mathcal{K}` is an integral operator + that describes conditional future expectation values. Let + :math:`p(\mathbf{x},\,\mathbf{y})` be the conditional probability + density of visiting an infinitesimal phase space volume around + point :math:`\mathbf{y}` at time :math:`t+\tau` given that the phase + space point :math:`\mathbf{x}` was visited at the earlier time + :math:`t`. Then the action of the Koopman operator on a function + :math:`f` can be written as follows: + + .. math:: + + \mathcal{K}f=\int p(\mathbf{x},\,\mathbf{y})f(\mathbf{y})\,\mathrm{dy}=\mathbb{E}\left[f(\mathbf{x}_{t+\tau})\mid\mathbf{x}_{t}=\mathbf{x}\right] + + The Koopman operator is defined without any reference to an + equilibrium distribution. Therefore it is well-defined in + situations where the dynamics is irreversible and/or non-stationary + such that no equilibrium distribution exists. + + If we approximate :math:`f` by a linear superposition of ansatz + functions :math:`\boldsymbol{\chi}` of the conformational + degrees of freedom (features), the operator :math:`\mathcal{K}` + can be approximated by a (finite-dimensional) matrix :math:`\mathbf{K}`. + + The approximation is computed as follows: From the time-dependent + input features :math:`\boldsymbol{\chi}(t)`, we compute the mean + :math:`\boldsymbol{\mu}_{0}` (:math:`\boldsymbol{\mu}_{1}`) from + all data excluding the last (first) :math:`\tau` steps of every + trajectory as follows: + + .. math:: + + \boldsymbol{\mu}_{0} :=\frac{1}{T-\tau}\sum_{t=0}^{T-\tau}\boldsymbol{\chi}(t) + + \boldsymbol{\mu}_{1} :=\frac{1}{T-\tau}\sum_{t=\tau}^{T}\boldsymbol{\chi}(t) + + Next, we compute the instantaneous covariance matrices + :math:`\mathbf{C}_{00}` and :math:`\mathbf{C}_{11}` and the + time-lagged covariance matrix :math:`\mathbf{C}_{01}` as follows: + + .. math:: + + \mathbf{C}_{00} :=\frac{1}{T-\tau}\sum_{t=0}^{T-\tau}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right]\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right]^{\top} + + \mathbf{C}_{11} :=\frac{1}{T-\tau}\sum_{t=\tau}^{T}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{1}\right]\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{1}\right]^{\top} + + \mathbf{C}_{01} :=\frac{1}{T-\tau}\sum_{t=0}^{T-\tau}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right]\left[\boldsymbol{\chi}(t+\tau)-\boldsymbol{\mu}_{1}\right]^{\top} + + The Koopman matrix is then computed as follows: + + .. math:: + + \mathbf{K}=\mathbf{C}_{00}^{-1}\mathbf{C}_{01} + + It can be shown [1]_ that the leading singular functions of the + half-weighted Koopman matrix + + .. math:: + + \bar{\mathbf{K}}:=\mathbf{C}_{00}^{-\frac{1}{2}}\mathbf{C}_{01}\mathbf{C}_{11}^{-\frac{1}{2}} + + encode the best reduced dynamical model for the time series. + + The singular functions can be computed by first performing the + singular value decomposition + + .. math:: + + \bar{\mathbf{K}}=\mathbf{U}^{\prime}\mathbf{S}\mathbf{V}^{\prime\top} + + and then mapping the input conformation to the left singular + functions :math:`\boldsymbol{\psi}` and right singular + functions :math:`\boldsymbol{\phi}` as follows: + + .. math:: + + \boldsymbol{\psi}(t):=\mathbf{U}^{\prime\top}\mathbf{C}_{00}^{-\frac{1}{2}}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right] + + \boldsymbol{\phi}(t):=\mathbf{V}^{\prime\top}\mathbf{C}_{11}^{-\frac{1}{2}}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{1}\right] + + + References + ---------- + .. [1] Wu, H. and Noe, F. 2017. Variational approach for learning Markov processes from time series data. + arXiv:1707.04659v1 + .. [2] Noe, F. and Clementi, C. 2015. Kinetic distance and kinetic maps from molecular dynamics simulation. + J. Chem. Theory. Comput. doi:10.1021/acs.jctc.5b00553 + .. [3] Chan, T. F., Golub G. H., LeVeque R. J. 1979. Updating formulae and pairwise algorithms for + computing sample variances. Technical Report STAN-CS-79-773, Department of Computer Science, Stanford University. """ - Compute lagged covariances between time series. If data is available as an array of size (TxN), where T is the + from pyemma.coordinates.transform.vamp import VAMP + res = VAMP(lag, dim=dim, scaling=scaling, right=right, epsilon=epsilon, skip=skip, ncov_max=ncov_max) + if data is not None: + res.estimate(data, stride=stride, chunksize=chunksize) + return res + + +def covariance_lagged(data=None, c00=True, c0t=True, ctt=False, remove_constant_mean=None, remove_data_mean=False, + reversible=False, bessel=True, lag=0, weights="empirical", stride=1, skip=0, chunksize=None, + ncov_max=float('inf')): + r"""Compute lagged covariances between time series. If data is available as an array of size (TxN), where T is the number of time steps and N the number of dimensions, this function can compute lagged covariances like .. math:: @@ -1306,6 +1457,9 @@ def covariance_lagged(data=None, c00=True, c0t=True, ctt=False, remove_constant_ to optimize thread usage and gain processing speed. If None is passed, use the default value of the underlying reader/data source. Choose zero to disable chunking at all. + ncov_max : int, default=infinity + limit the memory usage of the algorithm from [2]_ to an amount that corresponds + to ncov_max additional copies of each correlation matrix Returns ------- @@ -1314,17 +1468,17 @@ def covariance_lagged(data=None, c00=True, c0t=True, ctt=False, remove_constant_ .. [1] Wu, H., Nueske, F., Paul, F., Klus, S., Koltai, P., and Noe, F. 2016. Bias reduced variational approximation of molecular kinetics from short off-equilibrium simulations. J. Chem. Phys. (submitted) - + .. [2] Chan, T. F., Golub G. H., LeVeque R. J. 1979. Updating formulae and pairwise algorithms for + computing sample variances. Technical Report STAN-CS-79-773, Department of Computer Science, Stanford University. """ - from pyemma.coordinates.estimation.covariance import LaggedCovariance from pyemma.coordinates.estimation.koopman import _KoopmanEstimator import types - if isinstance(weights, str): + if isinstance(weights, _string_types): if weights == "koopman": if data is None: raise ValueError("Data must be supplied for reweighting='koopman'") - koop = _KoopmanEstimator(lag=lag, stride=stride, skip=skip) + koop = _KoopmanEstimator(lag=lag, stride=stride, skip=skip, ncov_max=ncov_max) koop.estimate(data, chunksize=chunksize) weights = koop.weights elif weights == "empirical": @@ -1342,7 +1496,7 @@ def covariance_lagged(data=None, c00=True, c0t=True, ctt=False, remove_constant_ # chunksize is an estimation parameter for now. lc = LaggedCovariance(c00=c00, c0t=c0t, ctt=ctt, remove_constant_mean=remove_constant_mean, remove_data_mean=remove_data_mean, reversible=reversible, bessel=bessel, lag=lag, - weights=weights, stride=stride, skip=skip) + weights=weights, stride=stride, skip=skip, ncov_max=ncov_max) if data is not None: lc.estimate(data, chunksize=chunksize) return lc diff --git a/pyemma/coordinates/data/sources_merger.py b/pyemma/coordinates/data/sources_merger.py index 1e663352f..0b1e7351d 100644 --- a/pyemma/coordinates/data/sources_merger.py +++ b/pyemma/coordinates/data/sources_merger.py @@ -18,10 +18,10 @@ class SourcesMerger(DataSource, SerializableMixIn): sources : list, tuple list of DataSources (Readers, StreamingTransformers etc.) to combine for streaming access. - chunk: int + chunk: int or None chunk size to use for underlying iterators.
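+        If None (the new default), the chunk size presumably falls back to PyEMMA's
+        global 'default_chunksize' configuration value mentioned in the changelog
+        above, instead of the previously hard-coded 5000; pass an integer to
+        override it for this merger only.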
""" - def __init__(self, sources, chunk=5000): + def __init__(self, sources, chunk=None): super(SourcesMerger, self).__init__(chunksize=chunk) self.sources = sources self._is_reader = True diff --git a/pyemma/coordinates/data/util/reader_utils.py b/pyemma/coordinates/data/util/reader_utils.py index db1024639..907d1a6f9 100644 --- a/pyemma/coordinates/data/util/reader_utils.py +++ b/pyemma/coordinates/data/util/reader_utils.py @@ -23,6 +23,8 @@ import numpy as np import os +from six import string_types + def create_file_reader(input_files, topology, featurizer, chunksize=None, **kw): r""" @@ -43,8 +45,6 @@ def create_file_reader(input_files, topology, featurizer, chunksize=None, **kw): from pyemma.coordinates.data.py_csv_reader import PyCSVReader from pyemma.coordinates.data import FeatureReader from pyemma.coordinates.data.fragmented_trajectory_reader import FragmentedTrajectoryReader - import six - str = six.string_types # fragmented trajectories if (isinstance(input_files, (list, tuple)) and len(input_files) > 0 and @@ -52,15 +52,15 @@ def create_file_reader(input_files, topology, featurizer, chunksize=None, **kw): return FragmentedTrajectoryReader(input_files, topology, chunksize, featurizer) # normal trajectories - if (isinstance(input_files, str) + if (isinstance(input_files, string_types) or (isinstance(input_files, (list, tuple)) - and (any(isinstance(item, str) for item in input_files) + and (any(isinstance(item, string_types) for item in input_files) or len(input_files) is 0))): reader = None # check: if single string create a one-element list - if isinstance(input_files, str): + if isinstance(input_files, string_types): input_list = [input_files] - elif len(input_files) > 0 and all(isinstance(item, str) for item in input_files): + elif len(input_files) > 0 and all(isinstance(item, string_types) for item in input_files): input_list = input_files else: if len(input_files) is 0: @@ -177,7 +177,7 @@ def preallocate_empty_trajectory(top, n_frames=1): def enforce_top(top): - if isinstance(top, str): + if isinstance(top, string_types): top = md.load(top).top elif isinstance(top, md.Trajectory): top = top.top diff --git a/pyemma/coordinates/estimation/covariance.py b/pyemma/coordinates/estimation/covariance.py index 2405f3200..5d84726a9 100644 --- a/pyemma/coordinates/estimation/covariance.py +++ b/pyemma/coordinates/estimation/covariance.py @@ -77,14 +77,19 @@ class LaggedCovariance(StreamingEstimator): Use only every stride-th time step. By default, every time step is used. skip : int, optional, default=0 skip the first initial n frames per trajectory. - chunksize : deprecated, default=NoTImplemented - The chunk size can be se during estimation. + chunksize : deprecated, default=NotImplemented + The chunk size should now be set during estimation. """ def __init__(self, c00=True, c0t=False, ctt=False, remove_constant_mean=None, remove_data_mean=False, reversible=False, bessel=True, sparse_mode='auto', modify_data=False, lag=0, weights=None, stride=1, skip=0, chunksize=NotImplemented, ncov_max=float('inf')): super(LaggedCovariance, self).__init__() + if chunksize is not NotImplemented: + import warnings + from pyemma.util.exceptions import PyEMMA_DeprecationWarning + warnings.warn('passed deprecated argument chunksize to LaggedCovariance. 
Will be ignored!', + category=PyEMMA_DeprecationWarning) if (c0t or ctt) and lag == 0: raise ValueError("lag must be positive if c0t=True or ctt=True") diff --git a/pyemma/coordinates/tests/test_vamp.py b/pyemma/coordinates/tests/test_vamp.py new file mode 100644 index 000000000..2d04a8eb1 --- /dev/null +++ b/pyemma/coordinates/tests/test_vamp.py @@ -0,0 +1,283 @@ +# This file is part of PyEMMA. +# +# Copyright (c) 2017 Computational Molecular Biology Group, Freie Universitaet Berlin (GER) +# +# PyEMMA is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see . + + +""" +@author: paul +""" + +from __future__ import absolute_import +import unittest +import numpy as np +from pyemma.coordinates import vamp as pyemma_api_vamp +from pyemma.msm import estimate_markov_model +from logging import getLogger + +from pyemma.msm.estimators._dtraj_stats import cvsplit_dtrajs + +logger = getLogger('pyemma.'+'TestVAMP') + + +def random_matrix(n, rank=None, eps=0.01): + m = np.random.randn(n, n) + u, s, v = np.linalg.svd(m) + if rank is None: + rank = n + if rank > n: + rank = n + s = np.concatenate((np.maximum(s, eps)[0:rank], np.zeros(n-rank))) + return u.dot(np.diag(s)).dot(v) + + +class TestVAMPEstimatorSelfConsistency(unittest.TestCase): + def test_full_rank(self): + self.do_test(20, 20, test_partial_fit=True) + + def test_low_rank(self): + dim = 30 + rank = 15 + self.do_test(dim, rank, test_partial_fit=True) + + def do_test(self, dim, rank, test_partial_fit=False): + # setup + N_frames = [123, 456, 789] + N_trajs = len(N_frames) + A = random_matrix(dim, rank) + trajs = [] + mean = np.random.randn(dim) + for i in range(N_trajs): + # set up data + white = np.random.randn(N_frames[i], dim) + brown = np.cumsum(white, axis=0) + correlated = np.dot(brown, A) + trajs.append(correlated + mean) + + # test + tau = 50 + vamp = pyemma_api_vamp(trajs, lag=tau, scaling=None) + vamp.right = True + + assert vamp.dimension() <= rank + + atol = np.finfo(vamp.output_type()).eps*10.0 + phi_trajs = [ sf[tau:, :] for sf in vamp.get_output() ] + phi = np.concatenate(phi_trajs) + mean_right = phi.sum(axis=0) / phi.shape[0] + cov_right = phi.T.dot(phi) / phi.shape[0] + np.testing.assert_allclose(mean_right, 0.0, atol=atol) + np.testing.assert_allclose(cov_right, np.eye(vamp.dimension()), atol=atol) + + vamp.right = False + psi_trajs = [ sf[0:-tau, :] for sf in vamp.get_output() ] + psi = np.concatenate(psi_trajs) + mean_left = psi.sum(axis=0) / psi.shape[0] + cov_left = psi.T.dot(psi) / psi.shape[0] + np.testing.assert_allclose(mean_left, 0.0, atol=atol) + np.testing.assert_allclose(cov_left, np.eye(vamp.dimension()), atol=atol) + + # compute correlation between left and right + assert phi.shape[0]==psi.shape[0] + C01_psi_phi = psi.T.dot(phi) / phi.shape[0] + n = max(C01_psi_phi.shape) + C01_psi_phi = C01_psi_phi[0:n,:][:, 0:n] + np.testing.assert_allclose(C01_psi_phi, np.diag(vamp.singular_values[0:vamp.dimension()]), atol=atol) + + if test_partial_fit: + vamp2 = 
pyemma_api_vamp(lag=tau, scaling=None) + for t in trajs: + vamp2.partial_fit(t) + + model_params = vamp._model.get_model_params() + model_params2 = vamp2._model.get_model_params() + + atol = 1e-15 + rtol = 1e-6 + + for n in model_params.keys(): + if model_params[n] is not None and model_params2[n] is not None: + if n not in ('U', 'V'): + np.testing.assert_allclose(model_params[n], model_params2[n], rtol=rtol, atol=atol, + err_msg='failed for model param %s' % n) + else: + assert_allclose_ignore_phase(model_params[n], model_params2[n], atol=atol) + + vamp2.singular_values # trigger diagonalization + + vamp2.right = True + for t, ref in zip(trajs, phi_trajs): + assert_allclose_ignore_phase(vamp2.transform(t[tau:]), ref, rtol=rtol, atol=atol) + + vamp2.right = False + for t, ref in zip(trajs, psi_trajs): + assert_allclose_ignore_phase(vamp2.transform(t[0:-tau]), ref, rtol=rtol, atol=atol) + + +def generate(T, N_steps, s0=0): + dtraj = np.zeros(N_steps, dtype=int) + s = s0 + T_cdf = T.cumsum(axis=1) + for t in range(N_steps): + dtraj[t] = s + s = np.searchsorted(T_cdf[s, :], np.random.rand()) + return dtraj + + +def assert_allclose_ignore_phase(A, B, atol, rtol=1e-5): + A = np.atleast_2d(A) + B = np.atleast_2d(B) + assert A.shape == B.shape + for i in range(B.shape[1]): + assert (np.allclose(A[:, i], B[:, i], atol=atol, rtol=rtol) + or np.allclose(A[:, i], -B[:, i], atol=atol, rtol=rtol)) + + +class TestVAMPModel(unittest.TestCase): + @classmethod + def setUpClass(cls): + N_steps = 10000 + N_traj = 20 + lag = 1 + T = np.linalg.matrix_power(np.array([[0.7, 0.2, 0.1], [0.1, 0.8, 0.1], [0.1, 0.1, 0.8]]), lag) + dtrajs = [generate(T, N_steps) for _ in range(N_traj)] + p0 = np.zeros(3) + p1 = np.zeros(3) + trajs = [] + for dtraj in dtrajs: + traj = np.zeros((N_steps, T.shape[0])) + traj[np.arange(len(dtraj)), dtraj] = 1.0 + trajs.append(traj) + p0 += traj[:-lag, :].sum(axis=0) + p1 += traj[lag:, :].sum(axis=0) + vamp = pyemma_api_vamp(trajs, lag=lag, scaling=None, dim=1.0) + msm = estimate_markov_model(dtrajs, lag=lag, reversible=False) + cls.trajs = trajs + cls.dtrajs = dtrajs + cls.lag = lag + cls.msm = msm + cls.vamp = vamp + cls.p0 = p0 / p0.sum() + cls.p1 = p1 / p1.sum() + cls.atol = np.finfo(vamp.output_type()).eps*1000.0 + + def test_K_is_T(self): + m0 = self.vamp.model.mean_0 + mt = self.vamp.model.mean_t + C0 = self.vamp.model.C00 + m0[:, np.newaxis]*m0[np.newaxis, :] + C1 = self.vamp.model.C0t + m0[:, np.newaxis]*mt[np.newaxis, :] + K = np.linalg.inv(C0).dot(C1) + np.testing.assert_allclose(K, self.msm.P, atol=1E-5) + + Tsym = np.diag(self.p0 ** 0.5).dot(self.msm.P).dot(np.diag(self.p1 ** -0.5)) + np.testing.assert_allclose(np.linalg.svd(Tsym)[1][1:], self.vamp.singular_values[0:2], atol=1E-7) + + def test_singular_functions_against_MSM(self): + Tsym = np.diag(self.p0 ** 0.5).dot(self.msm.P).dot(np.diag(self.p1 ** -0.5)) + Up, S, Vhp = np.linalg.svd(Tsym) + Vp = Vhp.T + U = Up * (self.p0 ** -0.5)[:, np.newaxis] + V = Vp * (self.p1 ** -0.5)[:, np.newaxis] + assert_allclose_ignore_phase(U[:, 0], np.ones(3), atol=1E-5) + assert_allclose_ignore_phase(V[:, 0], np.ones(3), atol=1E-5) + U = U[:, 1:] + V = V[:, 1:] + self.vamp.right = True + phi = self.vamp.transform(np.eye(3)) + self.vamp.right = False + psi = self.vamp.transform(np.eye(3)) + assert_allclose_ignore_phase(U, psi, atol=1E-5) + assert_allclose_ignore_phase(V, phi, atol=1E-5) + references_sf = [U.T.dot(np.diag(self.p0)).dot(np.linalg.matrix_power(self.msm.P, k*self.lag)).dot(V).T for k in + range(10-1)] + cktest = 
self.vamp.cktest(n_observables=2, mlags=10) + pred_sf = cktest.predictions + esti_sf = cktest.estimates + for e, p, r in zip(esti_sf[1:], pred_sf[1:], references_sf[1:]): + np.testing.assert_allclose(np.diag(p), np.diag(r), atol=1E-6) + np.testing.assert_allclose(np.abs(p), np.abs(r), atol=1E-6) + + def test_CK_expectation_against_MSM(self): + obs = np.eye(3) # observe every state + cktest = self.vamp.cktest(observables=obs, statistics=None, mlags=4) + pred = cktest.predictions[1:] + est = cktest.estimates[1:] + + for i, (est_, pred_) in enumerate(zip(est, pred)): + msm = estimate_markov_model(dtrajs=self.dtrajs, lag=self.lag*(i+1), reversible=False) + msm_esti = self.p0.T.dot(msm.P).dot(obs) + msm_pred = self.p0.T.dot(np.linalg.matrix_power(self.msm.P, (i+1))).dot(obs) + np.testing.assert_allclose(pred_, msm_pred, atol=self.atol) + np.testing.assert_allclose(est_, msm_esti, atol=self.atol) + np.testing.assert_allclose(est_, pred_, atol=0.006) + + def test_CK_covariances_of_singular_functions(self): + cktest = self.vamp.cktest(n_observables=2, mlags=4) # auto + pred = cktest.predictions[1:] + est = cktest.estimates[1:] + error = np.max(np.abs(np.array(pred) - np.array(est))) / max(np.max(pred), np.max(est)) + assert error < 0.05 + + def test_CK_covariances_against_MSM(self): + obs = np.eye(3) # observe every state + sta = np.eye(3) # restrict p0 to every state + cktest = self.vamp.cktest(observables=obs, statistics=sta, mlags=4, show_progress=True) + pred = cktest.predictions[1:] + est = cktest.estimates[1:] + + for i, (est_, pred_) in enumerate(zip(est, pred)): + msm = estimate_markov_model(dtrajs=self.dtrajs, lag=self.lag*(i+1), reversible=False) + msm_esti = (self.p0 * sta).T.dot(msm.P).dot(obs).T + msm_pred = (self.p0 * sta).T.dot(np.linalg.matrix_power(self.msm.P, (i+1))).dot(obs).T + np.testing.assert_allclose(np.diag(pred_), np.diag(msm_pred), atol=self.atol) + np.testing.assert_allclose(np.diag(est_), np.diag(msm_esti), atol=self.atol) + np.testing.assert_allclose(np.diag(est_), np.diag(pred_), atol=0.006) + + def test_self_score_with_MSM(self): + T = self.msm.P + Tadj = np.diag(1./self.p1).dot(T.T).dot(np.diag(self.p0)) + NFro = np.trace(T.dot(Tadj)) + s2 = self.vamp.score(score_method='VAMP2') + np.testing.assert_allclose(s2, NFro) + + Tsym = np.diag(self.p0**0.5).dot(T).dot(np.diag(self.p1**-0.5)) + Nnuc = np.linalg.norm(Tsym, ord='nuc') + s1 = self.vamp.score(score_method='VAMP1') + np.testing.assert_allclose(s1, Nnuc) + + # TODO: check why this is not equal + sE = self.vamp.score(score_method='VAMPE') + np.testing.assert_allclose(sE, NFro) # see paper appendix H.2 + + def test_score_vs_MSM(self): + from pyemma.util.contexts import numpy_random_seed + with numpy_random_seed(32): + trajs_test, trajs_train = cvsplit_dtrajs(self.trajs) + with numpy_random_seed(32): + dtrajs_test, dtrajs_train = cvsplit_dtrajs(self.dtrajs) + + methods = ('VAMP1', 'VAMP2', 'VAMPE') + + for m in methods: + msm_train = estimate_markov_model(dtrajs=dtrajs_train, lag=self.lag, reversible=False) + score_msm = msm_train.score(dtrajs_test, score_method=m, score_k=None) + + vamp_train = pyemma_api_vamp(data=trajs_train, lag=self.lag, dim=1.0) + score_vamp = vamp_train.score(test_data=trajs_test, score_method=m) + + self.assertAlmostEqual(score_msm, score_vamp, places=2 if m == 'VAMPE' else 3, msg=m) + +if __name__ == "__main__": + unittest.main() diff --git a/pyemma/coordinates/transform/__init__.py b/pyemma/coordinates/transform/__init__.py index de8366d13..b7f976ceb 100644 --- 
a/pyemma/coordinates/transform/__init__.py +++ b/pyemma/coordinates/transform/__init__.py @@ -28,7 +28,11 @@ PCA - principal components TICA - time independent components + VAMP - Variational approach for Markov processes + VAMPModel - Kinetic model from the Variational approach for Markov processes + VAMPChapmanKolmogorovValidator - Chapman-Kolmogorov test for the Variational approach for Markov processes """ from .pca import * from .tica import * +from .vamp import * diff --git a/pyemma/coordinates/transform/vamp.py b/pyemma/coordinates/transform/vamp.py new file mode 100644 index 000000000..981e1080b --- /dev/null +++ b/pyemma/coordinates/transform/vamp.py @@ -0,0 +1,1018 @@ +# This file is part of PyEMMA. +# +# Copyright (c) 2017 Computational Molecular Biology Group, Freie Universitaet Berlin (GER) +# +# PyEMMA is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see . +''' +@author: paul, marscher, wu, noe +''' + +from __future__ import absolute_import + +import numpy as np + +from pyemma._base.model import Model +from pyemma._base.serialization.serialization import SerializableMixIn +from pyemma.util.annotators import fix_docs +from pyemma.util.types import ensure_ndarray_or_None, ensure_ndarray +from pyemma._ext.variational.solvers.direct import spd_inv_sqrt +from pyemma.coordinates.estimation.covariance import LaggedCovariance +from pyemma.coordinates.data._base.transformer import StreamingEstimationTransformer +from pyemma.msm.estimators.lagged_model_validators import LaggedModelValidator +from pyemma.util.linalg import mdot + +import warnings + +__all__ = ['VAMP', 'VAMPModel', 'VAMPChapmanKolmogorovValidator'] + + +class VAMPModel(Model, SerializableMixIn): + __serialize_version = 0 + __serialize_fields = ('_U', '_V', '_svd_performed') + + def set_model_params(self, mean_0, mean_t, C00, Ctt, C0t, U, V, singular_values, cumvar, dim, epsilon): + self.mean_0 = mean_0 + self.mean_t = mean_t + self.C00 = C00 + self.Ctt = Ctt + self.C0t = C0t + self._svd_performed = False + self._U = U + self._V = V + self._singular_values = singular_values + self.cumvar = cumvar + self.dim = dim + self.epsilon = epsilon + + @property + def U(self): + "Transformation matrix that represents the linear map from mean-free feature space to the space of left singular functions." + if not self._svd_performed: + self._diagonalize() + return self._U + + @property + def V(self): + "Transformation matrix that represents the linear map from mean-free feature space to the space of right singular functions."
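+        # note: U, V and the singular values are computed lazily; the SVD of the
+        # half-weighted Koopman matrix is deferred until first access (cf. _diagonalize()).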
+        if not self._svd_performed: + self._diagonalize() + return self._V + + @property + def singular_values(self): + "The singular values of the half-weighted Koopman matrix" + if not self._svd_performed: + self._diagonalize() + return self._singular_values + + @property + def C00(self): + return self._C00 + + @C00.setter + def C00(self, val): + self._svd_performed = False + self._C00 = val + + @property + def C0t(self): + return self._C0t + + @C0t.setter + def C0t(self, val): + self._svd_performed = False + self._C0t = val + + @property + def Ctt(self): + return self._Ctt + + @Ctt.setter + def Ctt(self, val): + self._svd_performed = False + self._Ctt = val + + def dimension(self): + """ output dimension """ + if self.dim is None or (isinstance(self.dim, float) and self.dim == 1.0): + if hasattr(self, '_rank0'): + return min(self._rank0, self._rankt) + else: + raise RuntimeError('Requested dimension, but the dimension depends on the singular values of C00 and C11' + ' and the transformer has not yet been estimated. Call estimate() before.') + if isinstance(self.dim, float): + if hasattr(self, 'cumvar') and self.cumvar is not None: + return np.count_nonzero(self.cumvar >= self.dim) + else: + raise RuntimeError('Requested dimension, but the dimension depends on the cumulative variance and the ' + 'transformer has not yet been estimated. Call estimate() before.') + else: + if hasattr(self, '_rank0'): + return np.min([self._rank0, self._rankt, self.dim]) + else: + warnings.warn( + RuntimeWarning('Requested dimension, but the dimension depends on the singular values of C00 and C11' + ' and the transformer has not yet been estimated. Result is only an approximation.')) + return self.dim + + def expectation(self, observables, statistics, lag_multiple=1, observables_mean_free=False, statistics_mean_free=False): + r"""Compute future expectation of observable or covariance using the approximated Koopman operator. + + Parameters + ---------- + observables : np.ndarray((input_dimension, n_observables)) + Coefficients that express one or multiple observables in + the basis of the input features. + + statistics : np.ndarray((input_dimension, n_statistics)), optional + Coefficients that express one or multiple statistics in + the basis of the input features. + This parameter can be None. In that case, this method + returns the future expectation value of the observable(s). + + lag_multiple : int + If > 1, extrapolate to a multiple of the estimator's lag + time by assuming Markovianity of the approximated Koopman + operator. + + observables_mean_free : bool, default=False + If true, coefficients in `observables` refer to the input + features with feature means removed. + If false, coefficients in `observables` refer to the + unmodified input features. + + statistics_mean_free : bool, default=False + If true, coefficients in `statistics` refer to the input + features with feature means removed. + If false, coefficients in `statistics` refer to the + unmodified input features. + + Notes + ----- + A "future expectation" of an observable g is the average of g computed + over a time window that has the same total length as the input data + from which the Koopman operator was estimated but is shifted + by lag_multiple*tau time steps into the future (where tau is the lag + time). + + It is computed with the equation: + + .. math:: + + \mathbb{E}[g]_{\rho_{n}}=\mathbf{q}^{T}\mathbf{P}^{n-1}\mathbf{e}_{1} + + where + + .. math:: + + P_{ij}=\sigma_{i}\langle\psi_{i},\phi_{j}\rangle_{\rho_{1}} + + and + + .. math:: + + q_{i}=\langle g,\phi_{i}\rangle_{\rho_{1}} + + and :math:`\mathbf{e}_{1}` is the first canonical unit vector. + + + A model prediction of time-lagged covariances between the + observable f and the statistic g at a lag-time of lag_multiple*tau + is computed with the equation: + + .. math:: + + \mathrm{cov}[g,\,f;n\tau]=\mathbf{q}^{T}\mathbf{P}^{n-1}\boldsymbol{\Sigma}\mathbf{r} + + where :math:`r_{i}=\langle\psi_{i},f\rangle_{\rho_{0}}` and + :math:`\boldsymbol{\Sigma}=\mathrm{diag(\boldsymbol{\sigma})}` . + """ + # TODO: implement the case lag_multiple=0 + + dim = self.dimension() + + S = np.diag(np.concatenate(([1.0], self.singular_values[0:dim]))) + V = self.V[:, 0:dim] + U = self.U[:, 0:dim] + m_0 = self.mean_0 + m_t = self.mean_t + + assert lag_multiple >= 1, 'lag_multiple = 0 not implemented' + + if lag_multiple == 1: + P = S + else: + p = np.zeros((dim + 1, dim + 1)) + p[0, 0] = 1.0 + p[1:, 0] = U.T.dot(m_t - m_0) + p[1:, 1:] = U.T.dot(self.Ctt).dot(V) + P = np.linalg.matrix_power(S.dot(p), lag_multiple - 1).dot(S) + + Q = np.zeros((observables.shape[1], dim + 1)) + if not observables_mean_free: + Q[:, 0] = observables.T.dot(m_t) + Q[:, 1:] = observables.T.dot(self.Ctt).dot(V) + + if statistics is not None: + # compute covariance + R = np.zeros((statistics.shape[1], dim + 1)) + if not statistics_mean_free: + R[:, 0] = statistics.T.dot(m_0) + R[:, 1:] = statistics.T.dot(self.C00).dot(U) + + if statistics is not None: + # compute lagged covariance + return Q.dot(P).dot(R.T) + # TODO: discuss whether we want to return this or the transpose + # TODO: from MSMs one might expect to first index to refer to the statistics, here it is the other way round + else: + # compute future expectation + return Q.dot(P)[:, 0] + + def _diagonalize(self, scaling=None): + """Performs SVD on covariance matrices and saves the left and right singular vectors and values in the model. + + Parameters + ---------- + scaling : None or string, default=None + Scaling to be applied to the VAMP modes upon transformation + * None: no scaling will be applied, variance of the singular + functions is 1 + * 'kinetic map' or 'km': singular functions are scaled by + the corresponding singular value. Note that only the left singular functions + induce a kinetic map. + """ + + L0, self._rank0 = spd_inv_sqrt(self.C00, epsilon=self.epsilon, return_rank=True) + Lt, self._rankt = spd_inv_sqrt(self.Ctt, epsilon=self.epsilon, return_rank=True) + A = L0.T.dot(self.C0t).dot(Lt) + + Uprime, s, Vprimeh = np.linalg.svd(A, compute_uv=True) + self._singular_values = s + + # compute cumulative variance + cumvar = np.cumsum(s ** 2) + cumvar /= cumvar[-1] + self.cumvar = cumvar + + self._L0 = L0 + self._Lt = Lt + + m = self.dimension() + + U = L0.dot(Uprime[:, :m]) # U in the paper singular_vectors_left + V = Lt.dot(Vprimeh[:m, :].T) # V in the paper singular_vectors_right + + # scale vectors + if scaling is None: + pass + elif scaling in ['km', 'kinetic map']: + U *= s[np.newaxis, 0:m] + else: + raise ValueError('unexpected value (%s) of "scaling"' % scaling) + + self._U = U + self._V = V + self._svd_performed = True + + def score(self, test_model=None, score_method='VAMP2'): + """Compute the VAMP score for this model or the cross-validation score between self and a second model. + + Parameters + ---------- + test_model : VAMPModel, optional, default=None + + If `test_model` is not None, this method computes the cross-validation score + between self and `test_model`. It is assumed that self was estimated from + the "training" data and `test_model` was estimated from the "test" data. The + score is computed for one realization of self and `test_model`. Estimation + of the average cross-validation score and partitioning of data into test and + training part is not performed by this method. + + If `test_model` is None, this method computes the VAMP score for the model + contained in self. + + score_method : str, optional, default='VAMP2' + Available scores are based on the variational approach for Markov processes [1]_: + + * 'VAMP1' Sum of singular values of the half-weighted Koopman matrix [1]_ . + If the model is reversible, this is equal to the sum of + Koopman matrix eigenvalues, also called Rayleigh quotient [1]_. + * 'VAMP2' Sum of squared singular values of the half-weighted Koopman matrix [1]_ . + If the model is reversible, this is equal to the kinetic variance [2]_ . + * 'VAMPE' Approximation error of the estimated Koopman operator with respect to + the true Koopman operator up to an additive constant [1]_ . + + Returns + ------- + score : float + If `test_model` is not None, returns the cross-validation VAMP score between + self and `test_model`. Otherwise return the selected VAMP-score of self. + + References + ---------- + .. [1] Wu, H. and Noe, F. 2017. Variational approach for learning Markov processes from time series data. + arXiv:1707.04659v1 + .. [2] Noe, F. and Clementi, C. 2015. Kinetic distance and kinetic maps from molecular dynamics simulation. + J. Chem. Theory. Comput. doi:10.1021/acs.jctc.5b00553 + """ + # TODO: implement for TICA too + if test_model is None: + test_model = self + Uk = self.U[:, 0:self.dimension()] + Vk = self.V[:, 0:self.dimension()] + res = None + if score_method == 'VAMP1' or score_method == 'VAMP2': + A = spd_inv_sqrt(Uk.T.dot(test_model.C00).dot(Uk)) + B = Uk.T.dot(test_model.C0t).dot(Vk) + C = spd_inv_sqrt(Vk.T.dot(test_model.Ctt).dot(Vk)) + ABC = mdot(A, B, C) + if score_method == 'VAMP1': + res = np.linalg.norm(ABC, ord='nuc') + elif score_method == 'VAMP2': + res = np.linalg.norm(ABC, ord='fro')**2 + elif score_method == 'VAMPE': + Sk = np.diag(self.singular_values[0:self.dimension()]) + res = np.trace(2.0 * mdot(Vk, Sk, Uk.T, test_model.C0t) - mdot(Vk, Sk, Uk.T, test_model.C00, Uk, Sk, Vk.T, test_model.Ctt)) + else: + raise ValueError('"score" should be one of VAMP1, VAMP2 or VAMPE') + # add the contribution (+1) of the constant singular functions to the result + assert res is not None + return res + 1 + + +@fix_docs +class VAMP(StreamingEstimationTransformer, SerializableMixIn): + r"""Variational approach for Markov processes (VAMP)""" + + __serialize_version = 0 + + def describe(self): + return "[VAMP, lag = %i; max. output dim. = %s]" % (self._lag, str(self.dim)) + + def __init__(self, lag, dim=None, scaling=None, right=True, epsilon=1e-6, + stride=1, skip=0, ncov_max=float('inf')): + r""" Variational approach for Markov processes (VAMP) [1]_. + + Parameters + ---------- + lag : int + lag time + dim : float or int + Number of dimensions to keep: + + * if dim is not set all available ranks are kept: + `n_components == min(n_samples, n_features)` + * if dim is an integer >= 1, this number specifies the number + of dimensions to keep. + * if dim is a float with ``0 < dim < 1``, select the number + of dimensions such that the amount of kinetic variance + that needs to be explained is greater than the percentage + specified by dim.
+        scaling : None or string + Scaling to be applied to the VAMP order parameters upon transformation + + * None: no scaling will be applied, variance of the order parameters is 1 + * 'kinetic map' or 'km': order parameters are scaled by the corresponding singular value. + Only the left singular functions induce a kinetic map. + Therefore scaling='km' is only effective if `right` is False. + right : boolean + Whether to compute the right singular functions. + If `right==True`, `get_output()` will return the right singular + functions. Otherwise, `get_output()` will return the left singular + functions. + Beware that only `frames[tau:, :]` of each trajectory returned + by `get_output()` contain valid values of the right singular + functions. Conversely, only `frames[0:-tau, :]` of each + trajectory returned by `get_output()` contain valid values of + the left singular functions. The remaining frames might + possibly be interpreted as some extrapolation. + epsilon : float + singular value cutoff. Singular values of :math:`C_{00}` and :math:`C_{11}` with + norms <= epsilon will be cut off. The remaining number of + singular values defines the size of the output. + stride: int, optional, default = 1 + Use only every stride-th time step. By default, every time step is used. + skip : int, default=0 + skip the first initial n frames per trajectory. + ncov_max : int, default=infinity + limit the memory usage of the algorithm from [3]_ to an amount that corresponds + to ncov_max additional copies of each correlation matrix + + Notes + ----- + VAMP is a method for dimensionality reduction of Markov processes. + + The Koopman operator :math:`\mathcal{K}` is an integral operator + that describes conditional future expectation values. Let + :math:`p(\mathbf{x},\,\mathbf{y})` be the conditional probability + density of visiting an infinitesimal phase space volume around + point :math:`\mathbf{y}` at time :math:`t+\tau` given that the phase + space point :math:`\mathbf{x}` was visited at the earlier time + :math:`t`. Then the action of the Koopman operator on a function + :math:`f` can be written as follows: + + .. math:: + + \mathcal{K}f=\int p(\mathbf{x},\,\mathbf{y})f(\mathbf{y})\,\mathrm{dy}=\mathbb{E}\left[f(\mathbf{x}_{t+\tau})\mid\mathbf{x}_{t}=\mathbf{x}\right] + + The Koopman operator is defined without any reference to an + equilibrium distribution. Therefore it is well-defined in + situations where the dynamics is irreversible and/or non-stationary + such that no equilibrium distribution exists. + + If we approximate :math:`f` by a linear superposition of ansatz + functions :math:`\boldsymbol{\chi}` of the conformational + degrees of freedom (features), the operator :math:`\mathcal{K}` + can be approximated by a (finite-dimensional) matrix :math:`\mathbf{K}`. + + The approximation is computed as follows: From the time-dependent + input features :math:`\boldsymbol{\chi}(t)`, we compute the mean + :math:`\boldsymbol{\mu}_{0}` (:math:`\boldsymbol{\mu}_{1}`) from + all data excluding the last (first) :math:`\tau` steps of every + trajectory as follows: + + .. math:: + + \boldsymbol{\mu}_{0} :=\frac{1}{T-\tau}\sum_{t=0}^{T-\tau}\boldsymbol{\chi}(t) + + \boldsymbol{\mu}_{1} :=\frac{1}{T-\tau}\sum_{t=\tau}^{T}\boldsymbol{\chi}(t) + + Next, we compute the instantaneous covariance matrices + :math:`\mathbf{C}_{00}` and :math:`\mathbf{C}_{11}` and the + time-lagged covariance matrix :math:`\mathbf{C}_{01}` as follows: + + .. math:: + + \mathbf{C}_{00} :=\frac{1}{T-\tau}\sum_{t=0}^{T-\tau}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right]\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right]^{\top} + + \mathbf{C}_{11} :=\frac{1}{T-\tau}\sum_{t=\tau}^{T}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{1}\right]\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{1}\right]^{\top} + + \mathbf{C}_{01} :=\frac{1}{T-\tau}\sum_{t=0}^{T-\tau}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right]\left[\boldsymbol{\chi}(t+\tau)-\boldsymbol{\mu}_{1}\right]^{\top} + + The Koopman matrix is then computed as follows: + + .. math:: + + \mathbf{K}=\mathbf{C}_{00}^{-1}\mathbf{C}_{01} + + It can be shown [1]_ that the leading singular functions of the + half-weighted Koopman matrix + + .. math:: + + \bar{\mathbf{K}}:=\mathbf{C}_{00}^{-\frac{1}{2}}\mathbf{C}_{01}\mathbf{C}_{11}^{-\frac{1}{2}} + + encode the best reduced dynamical model for the time series. + + The singular functions can be computed by first performing the + singular value decomposition + + .. math:: + + \bar{\mathbf{K}}=\mathbf{U}^{\prime}\mathbf{S}\mathbf{V}^{\prime\top} + + and then mapping the input conformation to the left singular + functions :math:`\boldsymbol{\psi}` and right singular + functions :math:`\boldsymbol{\phi}` as follows: + + .. math:: + + \boldsymbol{\psi}(t):=\mathbf{U}^{\prime\top}\mathbf{C}_{00}^{-\frac{1}{2}}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right] + + \boldsymbol{\phi}(t):=\mathbf{V}^{\prime\top}\mathbf{C}_{11}^{-\frac{1}{2}}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{1}\right] + + + References + ---------- + .. [1] Wu, H. and Noe, F. 2017. Variational approach for learning Markov processes from time series data. + arXiv:1707.04659v1 + .. [2] Noe, F. and Clementi, C. 2015. Kinetic distance and kinetic maps from molecular dynamics simulation. + J. Chem. Theory. Comput. doi:10.1021/acs.jctc.5b00553 + .. [3] Chan, T. F., Golub G. H., LeVeque R. J. 1979. Updating formulae and pairwise algorithms for + computing sample variances. Technical Report STAN-CS-79-773, Department of Computer Science, Stanford University. + """ + StreamingEstimationTransformer.__init__(self) + + # empty dummy model instance + self._model = VAMPModel() + self.set_params(lag=lag, dim=dim, scaling=scaling, right=right, + epsilon=epsilon, stride=stride, skip=skip, ncov_max=ncov_max) + self._covar = None + self._model.update_model_params(dim=dim, epsilon=epsilon) + + def _estimate(self, iterable, **kw): + self._covar = LaggedCovariance(c00=True, c0t=True, ctt=True, remove_data_mean=True, reversible=False, + lag=self.lag, bessel=False, stride=self.stride, skip=self.skip, weights=None, + ncov_max=self.ncov_max) + indim = iterable.dimension() + + if isinstance(self.dim, int) and not self.dim <= indim: + raise RuntimeError("requested more output dimensions (%i) than dimension" + " of input data (%i)" % (self.dim, indim)) + + if self._logger_is_active(self._loglevel_DEBUG): + self._logger.debug("Running VAMP with tau=%i; Estimating two covariance matrices" + " with dimension (%i, %i)" % (self._lag, indim, indim)) + + self._covar.estimate(iterable, **kw) + self._model.update_model_params(mean_0=self._covar.mean, + mean_t=self._covar.mean_tau, + C00=self._covar.C00_, + C0t=self._covar.C0t_, + Ctt=self._covar.Ctt_) + self._diagonalize() + + return self._model + + def partial_fit(self, X): + """ incrementally update the covariances and mean. + + Parameters + ---------- + X: array, list of arrays, PyEMMA reader + input data.
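+
+        Examples
+        --------
+        An illustrative sketch with random data (three hypothetical trajectories;
+        any input accepted by :func:`pyemma.coordinates.source` works the same way):
+
+        >>> import numpy as np
+        >>> v = VAMP(lag=10)
+        >>> for traj in [np.random.randn(1000, 5) for _ in range(3)]:
+        ...     v = v.partial_fit(traj)
+        >>> sv = v.singular_values  # first access triggers diagonalization of the accumulated covariances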
+
+        Notes
+        -----
+        The projection matrix is calculated upon its first access. + """ + from pyemma.coordinates import source + iterable = source(X) + + if isinstance(self.dim, int): + indim = iterable.dimension() + if not self.dim <= indim: + raise RuntimeError("requested more output dimensions (%i) than dimension" + " of input data (%i)" % (self.dim, indim)) + + if self._covar is None: + self._covar = LaggedCovariance(c00=True, c0t=True, ctt=True, remove_data_mean=True, reversible=False, + lag=self.lag, bessel=False, stride=self.stride, skip=self.skip, weights=None, + ncov_max=self.ncov_max) + self._covar.partial_fit(iterable) + self._model.update_model_params(mean_0=self._covar.mean, # TODO: inefficient, fixme + mean_t=self._covar.mean_tau, + C00=self._covar.C00_, + C0t=self._covar.C0t_, + Ctt=self._covar.Ctt_) + + # self._used_data = self._covar._used_data + self._estimated = False + + return self + + def _diagonalize(self): + # diagonalize with low rank approximation + self._logger.debug("diagonalize covariance matrices") + self.model._diagonalize(self.scaling) + self._logger.debug("finished diagonalization.") + self._estimated = True + + def dimension(self): + return self._model.dimension() + + def _transform_array(self, X): + r"""Projects the data onto the dominant singular functions. + + Parameters + ---------- + X : ndarray(n, m) + the input data + + Returns + ------- + Y : ndarray((n, d)) + the projected data, with d = self.dimension(). + If `self.right` is True, projection will be on the right singular + functions. Otherwise, projection will be on the left singular + functions. + """ + # TODO: in principle get_output should not return data for *all* frames! + # TODO: implement our own iterators? This would also include random access to be complete... + if self.right: + X_meanfree = X - self._model.mean_t + Y = np.dot(X_meanfree, self._model.V[:, 0:self.dimension()]) + else: + X_meanfree = X - self._model.mean_0 + Y = np.dot(X_meanfree, self._model.U[:, 0:self.dimension()]) + + return Y.astype(self.output_type()) + + @property + def singular_values(self): + r"""Singular values of the half-weighted Koopman matrix (usually denoted :math:`\sigma`) + + Returns + ------- + singular values: 1-D np.array + """ + return self._model.singular_values + + @property + def singular_vectors_right(self): + r"""Transformation matrix that represents the linear map from feature space to the space of right singular functions. + + Notes + ----- + Right "singular vectors" V of the VAMP problem (equation 13 in [1]_), columnwise + + Returns + ------- + vectors: 2-D ndarray + Coefficients that express the right singular functions in the + basis of mean-free input features. + + References + ---------- + .. [1] Wu, H. and Noe, F. 2017. Variational approach for learning Markov processes from time series data. + arXiv:1707.04659v1 + """ + return self._model.V + + @property + def singular_vectors_left(self): + r"""Transformation matrix that represents the linear map from feature space to the space of left singular functions. + + Notes + ----- + Left "singular vectors" U of the VAMP problem (equation 13 in [1]_), columnwise + + Returns + ------- + vectors: 2-D ndarray + Coefficients that express the left singular functions in the + basis of mean-free input features. + + References + ---------- + .. [1] Wu, H. and Noe, F. 2017. Variational approach for learning Markov processes from time series data. + arXiv:1707.04659v1 + """ + return self._model.U + + @property + def cumvar(self): + r"""Cumulative sum of the squared and normalized singular values + + Returns + ------- + cumvar: 1D np.array + """ + return self._model.cumvar + + @property + def show_progress(self): + if self._covar is None: + return False + else: + return self._covar.show_progress + + @show_progress.setter + def show_progress(self, value): + if self._covar is not None: + self._covar.show_progress = value + + def expectation(self, observables, statistics, lag_multiple=1, observables_mean_free=False, + statistics_mean_free=False): + r"""Compute future expectation of observable or covariance using the approximated Koopman operator. + + Parameters + ---------- + observables : np.ndarray((input_dimension, n_observables)) + Coefficients that express one or multiple observables in + the basis of the input features. + + statistics : np.ndarray((input_dimension, n_statistics)), optional + Coefficients that express one or multiple statistics in + the basis of the input features. + This parameter can be None. In that case, this method + returns the future expectation value of the observable(s). + + lag_multiple : int + If > 1, extrapolate to a multiple of the estimator's lag + time by assuming Markovianity of the approximated Koopman + operator. + + observables_mean_free : bool, default=False + If true, coefficients in `observables` refer to the input + features with feature means removed. + If false, coefficients in `observables` refer to the + unmodified input features. + + statistics_mean_free : bool, default=False + If true, coefficients in `statistics` refer to the input + features with feature means removed. + If false, coefficients in `statistics` refer to the + unmodified input features. + + Notes + ----- + A "future expectation" of an observable g is the average of g computed + over a time window that has the same total length as the input data + from which the Koopman operator was estimated but is shifted + by lag_multiple*tau time steps into the future (where tau is the lag + time). + + It is computed with the equation: + + .. math:: + + \mathbb{E}[g]_{\rho_{n}}=\mathbf{q}^{T}\mathbf{P}^{n-1}\mathbf{e}_{1} + + where + + .. math:: + + P_{ij}=\sigma_{i}\langle\psi_{i},\phi_{j}\rangle_{\rho_{1}} + + and + + .. math:: + + q_{i}=\langle g,\phi_{i}\rangle_{\rho_{1}} + + and :math:`\mathbf{e}_{1}` is the first canonical unit vector. + + + A model prediction of time-lagged covariances between the + observable f and the statistic g at a lag-time of lag_multiple*tau + is computed with the equation: + + .. math:: + + \mathrm{cov}[g,\,f;n\tau]=\mathbf{q}^{T}\mathbf{P}^{n-1}\boldsymbol{\Sigma}\mathbf{r} + + where :math:`r_{i}=\langle\psi_{i},f\rangle_{\rho_{0}}` and + :math:`\boldsymbol{\Sigma}=\mathrm{diag(\boldsymbol{\sigma})}` . + """ + return self._model.expectation(observables, statistics, lag_multiple=lag_multiple, + statistics_mean_free=statistics_mean_free, + observables_mean_free=observables_mean_free) + + def cktest(self, n_observables=None, observables='phi', statistics='psi', mlags=10, n_jobs=1, show_progress=True, + iterable=None): + r"""Do the Chapman-Kolmogorov test by computing predictions for higher lag times and by performing estimations at higher lag times. + + Notes + ----- + + This method computes two sets of time-lagged covariance matrices + + * estimates at higher lag times : + + ..
+        """
+        return self._model.expectation(observables, statistics, lag_multiple=lag_multiple,
+                                       statistics_mean_free=statistics_mean_free,
+                                       observables_mean_free=observables_mean_free)
+
+    def cktest(self, n_observables=None, observables='phi', statistics='psi', mlags=10, n_jobs=1, show_progress=True,
+               iterable=None):
+        r"""Do the Chapman-Kolmogorov test by comparing predictions for higher lag times with estimations at higher lag times.
+
+        Notes
+        -----
+        This method computes two sets of time-lagged covariance matrices:
+
+        * estimates at higher lag times:
+
+          .. math::
+
+              \left\langle \mathbf{K}(n\tau)g_{i},f_{j}\right\rangle_{\rho_{0}}
+
+          where :math:`\rho_{0}` is the empirical distribution implicitly defined
+          by all data points from time steps 0 to T-tau in all trajectories,
+          :math:`\mathbf{K}(n\tau)` is a rank-reduced Koopman matrix estimated
+          at the lag-time n*tau and g and f are some functions of the data.
+          Rank-reduction of the Koopman matrix is controlled by the `dim`
+          parameter of :func:`vamp <pyemma.coordinates.vamp>`.
+
+        * predictions at higher lag times:
+
+          .. math::
+
+              \left\langle \mathbf{K}^{n}(\tau)g_{i},f_{j}\right\rangle_{\rho_{0}}
+
+          where :math:`\mathbf{K}^{n}` is the n-th power of the rank-reduced
+          Koopman matrix contained in self.
+
+        The Chapman-Kolmogorov test is to compare the predictions to the
+        estimates.
+
+        Parameters
+        ----------
+        n_observables : int, optional, default=None
+            Limit the number of default observables (and of default statistics)
+            to this number.
+            Only used if `observables` is 'phi' or `statistics` is 'psi'.
+
+        observables : np.ndarray((input_dimension, n_observables)) or 'phi'
+            Coefficients that express one or multiple observables :math:`g`
+            in the basis of the input features.
+            This parameter can be 'phi'. In that case, the dominant
+            right singular functions of the Koopman operator estimated
+            at the smallest lag time are used as default observables.
+
+        statistics : np.ndarray((input_dimension, n_statistics)) or 'psi'
+            Coefficients that express one or multiple statistics :math:`f`
+            in the basis of the input features.
+            This parameter can be 'psi'. In that case, the dominant
+            left singular functions of the Koopman operator estimated
+            at the smallest lag time are used as default statistics.
+
+        mlags : int or int-array, default=10
+            multiples of lag times for testing the model, e.g. range(10).
+            A single int will trigger a range, i.e. mlags=10 maps to
+            mlags=range(10).
+            Note that you need to be able to do a model prediction for each
+            of these lag time multiples, e.g. the value 0 only makes sense
+            if model.expectation(lag_multiple=0) will work.
+
+        n_jobs : int, default=1
+            how many jobs to use during calculation
+
+        show_progress : bool, default=True
+            Show progressbars for calculation?
+
+        iterable : any data format that `pyemma.coordinates.vamp()` accepts as input, optional
+            If `iterable` is None, the same data source with which VAMP
+            was initialized will be used for all estimations.
+            Otherwise, all estimates (not predictions) will be computed
+            from the data contained in `iterable`.
+
+        Returns
+        -------
+        vckv : :class:`VAMPChapmanKolmogorovValidator`
+            Contains the estimated and the predicted covariance matrices.
+            The object can be plotted with :func:`plot_cktest <pyemma.plots.plot_cktest>`
+            with the option `y01=False`.
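+
+        Examples
+        --------
+        A minimal sketch with made-up data (array shape and lag times are
+        illustrative only):
+
+        >>> import numpy as np
+        >>> from pyemma.coordinates import vamp
+        >>> v = vamp(np.random.randn(1000, 5), lag=5)    # doctest: +SKIP
+        >>> cktest = v.cktest(mlags=4)                   # doctest: +SKIP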
+        """
+        if n_observables is not None:
+            if n_observables > self.dimension():
+                warnings.warn('Selected singular functions as observables but dimension '
+                              'is lower than requested number of observables.')
+                n_observables = self.dimension()
+        else:
+            n_observables = self.dimension()
+
+        if isinstance(observables, str) and observables == 'phi':
+            observables = self.singular_vectors_right[:, 0:n_observables]
+            observables_mean_free = True
+        else:
+            ensure_ndarray(observables, ndim=2)
+            observables_mean_free = False
+
+        if isinstance(statistics, str) and statistics == 'psi':
+            statistics = self.singular_vectors_left[:, 0:n_observables]
+            statistics_mean_free = True
+        else:
+            ensure_ndarray_or_None(statistics, ndim=2)
+            statistics_mean_free = False
+
+        ck = VAMPChapmanKolmogorovValidator(self, self, observables, statistics, observables_mean_free,
+                                            statistics_mean_free, mlags=mlags, n_jobs=n_jobs,
+                                            show_progress=show_progress)
+
+        if iterable is None:
+            iterable = self.data_producer
+
+        ck.estimate(iterable)
+        return ck
+
+    def score(self, test_data=None, score_method='VAMP2'):
+        """Compute the VAMP score for this model, or the cross-validation score between self and a second model estimated from different data.
+
+        Parameters
+        ----------
+        test_data : any data format that `pyemma.coordinates.vamp()` accepts as input
+
+            If `test_data` is not None, this method computes the cross-validation score
+            between self and a VAMP model estimated from `test_data`. It is assumed that
+            self was estimated from the "training" data and `test_data` is the test data.
+            The score is computed for one realization of self and `test_data`. Estimation
+            of the average cross-validation score and partitioning of data into test and
+            training parts is not performed by this method.
+
+            If `test_data` is None, this method computes the VAMP score for the model
+            contained in self.
+
+            The model that is estimated from `test_data` will inherit all hyperparameters
+            from self.
+
+        score_method : str, optional, default='VAMP2'
+            Available scores are based on the variational approach for Markov processes [1]_:
+
+            * 'VAMP1'  Sum of singular values of the half-weighted Koopman matrix [1]_ .
+                       If the model is reversible, this is equal to the sum of
+                       Koopman matrix eigenvalues, also called Rayleigh quotient [1]_.
+            * 'VAMP2'  Sum of squared singular values of the half-weighted Koopman matrix [1]_ .
+                       If the model is reversible, this is equal to the kinetic variance [2]_ .
+            * 'VAMPE'  Approximation error of the estimated Koopman operator with respect to
+                       the true Koopman operator up to an additive constant [1]_ .
+
+        Returns
+        -------
+        score : float
+            If `test_data` is not None, returns the cross-validation VAMP score between
+            self and the model estimated from `test_data`. Otherwise, returns the selected
+            VAMP score of self.
+
+        References
+        ----------
+        .. [1] Wu, H. and Noe, F. 2017. Variational approach for learning Markov processes from time series data.
+            arXiv:1707.04659v1
+        .. [2] Noe, F. and Clementi, C. 2015. Kinetic distance and kinetic maps from molecular dynamics simulation.
+            J. Chem. Theory Comput. doi:10.1021/acs.jctc.5b00553
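+
+        Examples
+        --------
+        Cross-validation sketch (the even/odd split of a hypothetical list
+        of trajectories ``data`` is illustrative only):
+
+        >>> from pyemma.coordinates import vamp
+        >>> v = vamp(data[0::2], lag=5)              # doctest: +SKIP
+        >>> score = v.score(test_data=data[1::2])    # doctest: +SKIP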
+        """
+        if test_data is None:
+            return self.model.score(None, score_method=score_method)
+        else:
+            from pyemma._ext.sklearn.base import clone as clone_estimator
+            # clone only when a test model actually needs to be estimated
+            est = clone_estimator(self)
+            est.estimate(test_data)
+            return self.model.score(est.model, score_method=score_method)
+
+
+class VAMPChapmanKolmogorovValidator(LaggedModelValidator):
+    __serialize_version = 0
+    __serialize_fields = ('nsets', 'statistics', 'observables', 'observables_mean_free', 'statistics_mean_free')
+
+    def __init__(self, model, estimator, observables, statistics, observables_mean_free, statistics_mean_free,
+                 mlags=10, n_jobs=1, show_progress=True):
+        r"""
+        Note
+        ----
+        It is recommended that you create this object by calling the
+        `cktest` method of a VAMP object created with
+        :func:`vamp <pyemma.coordinates.vamp>`.
+
+        Parameters
+        ----------
+        model : Model
+            Model with the smallest lag time. Is used to make predictions
+            for larger lag times.
+
+        estimator : Estimator
+            Parametrized Estimator that has produced the model.
+            Is used as a prototype for estimating models at higher lag times.
+
+        observables : np.ndarray((input_dimension, n_observables))
+            Coefficients that express one or multiple observables in
+            the basis of the input features.
+
+        statistics : np.ndarray((input_dimension, n_statistics))
+            Coefficients that express one or multiple statistics in
+            the basis of the input features.
+
+        observables_mean_free : bool, default=False
+            If true, coefficients in `observables` refer to the input
+            features with feature means removed.
+            If false, coefficients in `observables` refer to the
+            unmodified input features.
+
+        statistics_mean_free : bool, default=False
+            If true, coefficients in `statistics` refer to the input
+            features with feature means removed.
+            If false, coefficients in `statistics` refer to the
+            unmodified input features.
+
+        mlags : int or int-array, default=10
+            multiples of lag times for testing the model, e.g. range(10).
+            A single int will trigger a range, i.e. mlags=10 maps to
+            mlags=range(10).
+            Note that you need to be able to do a model prediction for each
+            of these lag time multiples, e.g. the value 0 only makes sense
+            if model.expectation(lag_multiple=0) will work.
+
+        n_jobs : int, default=1
+            how many jobs to use during calculation
+
+        show_progress : bool, default=True
+            Show progressbars for calculation?
+
+        Notes
+        -----
+        The object can be plotted with :func:`plot_cktest <pyemma.plots.plot_cktest>`
+        with the option `y01=False`.
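+
+        Examples
+        --------
+        Recommended construction goes through the estimator and the result
+        can then be plotted (sketch; ``v`` is a hypothetical estimated VAMP
+        object):
+
+        >>> from pyemma.plots import plot_cktest     # doctest: +SKIP
+        >>> validator = v.cktest(mlags=4)            # doctest: +SKIP
+        >>> plot_cktest(validator, y01=False)        # doctest: +SKIP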
+        """
+        LaggedModelValidator.__init__(self, model, estimator, mlags=mlags,
+                                      n_jobs=n_jobs, show_progress=show_progress)
+        self.statistics = statistics
+        self.observables = observables
+        self.observables_mean_free = observables_mean_free
+        self.statistics_mean_free = statistics_mean_free
+        if self.statistics is not None:
+            self.nsets = min(self.observables.shape[1], self.statistics.shape[1])
+
+    def _compute_observables(self, model, estimator, mlag=1):
+        # for lag time 0 we return a matrix of nan, until the correct solution is implemented
+        if mlag == 0 or model is None:
+            if self.statistics is None:
+                return np.zeros(self.observables.shape[1]) + np.nan
+            else:
+                return np.zeros((self.observables.shape[1], self.statistics.shape[1])) + np.nan
+        else:
+            return model.expectation(statistics=self.statistics, observables=self.observables, lag_multiple=mlag,
+                                     statistics_mean_free=self.statistics_mean_free,
+                                     observables_mean_free=self.observables_mean_free)
+
+    def _compute_observables_conf(self, model, estimator, mlag=1):
+        raise NotImplementedError('estimation of confidence intervals not yet implemented for VAMP')
diff --git a/pyemma/msm/estimators/__init__.py b/pyemma/msm/estimators/__init__.py
index b562a3a9e..56b52eb53 100644
--- a/pyemma/msm/estimators/__init__.py
+++ b/pyemma/msm/estimators/__init__.py
@@ -15,13 +15,13 @@
 #
 # You should have received a copy of the GNU Lesser General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
-
 from __future__ import absolute_import
+
 __author__ = 'noe'
 
 from .maximum_likelihood_msm import MaximumLikelihoodMSM
 from .maximum_likelihood_msm import OOMReweightedMSM
-from .maximum_likelihood_msm import AugmentedMarkovModel 
+from .maximum_likelihood_msm import AugmentedMarkovModel
 from .bayesian_msm import BayesianMSM
 from .maximum_likelihood_hmsm import MaximumLikelihoodHMSM
 from .bayesian_hmsm import BayesianHMSM
diff --git a/pyemma/msm/estimators/lagged_model_validators.py b/pyemma/msm/estimators/lagged_model_validators.py
index a39b3c141..da45352cf 100644
--- a/pyemma/msm/estimators/lagged_model_validators.py
+++ b/pyemma/msm/estimators/lagged_model_validators.py
@@ -17,7 +17,7 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 
 from __future__ import absolute_import
-
+from six.moves import range
 import math
 
 import numpy as np
@@ -81,7 +81,10 @@ def __init__(self, model, estimator, mlags=None, conf=0.95, err_est=False,
         self.test_estimator = estimator
 
         # set mlags
-        maxlength = np.max([len(dtraj) for dtraj in estimator.discrete_trajectories_full])
+        try:
+            maxlength = np.max([len(dtraj) for dtraj in estimator.discrete_trajectories_full])
+        except AttributeError:
+            maxlength = np.max(estimator.trajectory_lengths())
         maxmlag = int(math.floor(maxlength / estimator.lag))
         if mlags is None:
             mlags = maxmlag
diff --git a/pyemma/msm/estimators/maximum_likelihood_hmsm.py b/pyemma/msm/estimators/maximum_likelihood_hmsm.py
index d659e1d3c..5e77ab21c 100644
--- a/pyemma/msm/estimators/maximum_likelihood_hmsm.py
+++ b/pyemma/msm/estimators/maximum_likelihood_hmsm.py
@@ -17,7 +17,7 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 
 from __future__ import absolute_import
-#
+from six.moves import range
 
 from pyemma.util.annotators import alias, aliased, fix_docs
 import numpy as _np
diff --git a/pyemma/msm/estimators/maximum_likelihood_msm.py b/pyemma/msm/estimators/maximum_likelihood_msm.py
index e68b56dd3..3120b1141 100644
--- a/pyemma/msm/estimators/maximum_likelihood_msm.py
+++ b/pyemma/msm/estimators/maximum_likelihood_msm.py
@@ -228,7 +228,7 @@ def score(self, dtrajs, score_method=None, score_k=None):
         score_method : str
             Overwrite scoring method if desired. If `None`, the estimators scoring
             method will be used. See __init__ for documentation.
-        score_k : str
+        score_k : int or None
             Overwrite scoring rank if desired. If `None`, the estimators scoring rank
             will be used. See __init__ for documentation.
         score_method : str, optional, default='VAMP2'
diff --git a/pyemma/util/_config.py b/pyemma/util/_config.py
index 8bfe25849..86dac0225 100644
--- a/pyemma/util/_config.py
+++ b/pyemma/util/_config.py
@@ -17,6 +17,7 @@
 
 from __future__ import absolute_import, print_function
 
+import six
 from six.moves.configparser import ConfigParser
 import os
 import shutil
@@ -32,6 +33,10 @@ class ReadConfigException(Exception):
     pass
 
+if six.PY2:
+    class NotADirectoryError(Exception):
+        pass
+
 __all__ = ('Config', )
@@ -172,10 +177,10 @@ def cfg_dir(self, pyemma_cfg_dir):
         if not os.path.exists(pyemma_cfg_dir):
             try:
                 mkdir_p(pyemma_cfg_dir)
-            except EnvironmentError:
-                raise ConfigDirectoryException("could not create configuration directory '%s'" % pyemma_cfg_dir)
             except NotADirectoryError:  # on Python 3
                 raise ConfigDirectoryException("pyemma cfg dir (%s) is not a directory" % pyemma_cfg_dir)
+            except EnvironmentError:
+                raise ConfigDirectoryException("could not create configuration directory '%s'" % pyemma_cfg_dir)
 
         if not os.path.isdir(pyemma_cfg_dir):
             raise ConfigDirectoryException("%s is no valid directory" % pyemma_cfg_dir)
diff --git a/pyemma/util/annotators.py b/pyemma/util/annotators.py
index 5843d4fc9..ecbe05f37 100644
--- a/pyemma/util/annotators.py
+++ b/pyemma/util/annotators.py
@@ -28,7 +28,6 @@
     'deprecated',
     'shortcut',
     'fix_docs',
-    'estimation_required',
 ]
diff --git a/pyemma/util/types.py b/pyemma/util/types.py
index ca3957134..65a0834f3 100644
--- a/pyemma/util/types.py
+++ b/pyemma/util/types.py
@@ -27,6 +27,8 @@
 import numbers
 import collections
 
+from six import string_types
+
 # ======================================================================================================================
 # BASIC TYPE CHECKS
 # ======================================================================================================================
@@ -137,7 +139,7 @@ def is_float_array(l):
         return False
 
 def is_string(s):
-    return isinstance(s, str)
+    return isinstance(s, string_types)
 
 def is_iterable(I):
     return isinstance(I, collections.Iterable)
@@ -147,7 +149,7 @@ def is_list(S):
     return isinstance(S, (list, tuple))
 
 def is_list_of_string(S):
-    return isinstance(S, (list, tuple)) and (all(isinstance(s, str) for s in S))
+    return isinstance(S, (list, tuple)) and (all(isinstance(s, string_types) for s in S))
 
 def ensure_dtraj(dtraj):
     r"""Makes sure that dtraj is a discrete trajectory (array of int)
@@ -171,8 +173,8 @@ def ensure_dtraj_list(dtrajs):
         if is_list_of_int(dtrajs):
             return [np.array(dtrajs, dtype=int)]
         else:
-            for i in range(len(dtrajs)):
-                dtrajs[i] = ensure_dtraj(dtrajs[i])
+            for i, dtraj in enumerate(dtrajs):
+                dtrajs[i] = ensure_dtraj(dtraj)
             return dtrajs
     else:
         return [ensure_dtraj(dtrajs)]
@@ -476,8 +478,8 @@ def ensure_traj_list(trajs):
         return [np.array(trajs)[:,None]]
     else:
         res = []
-        for i in range(len(trajs)):
-            res.append(ensure_traj(trajs[i]))
+        for traj in trajs:
+            res.append(ensure_traj(traj))
         return res
     else:
         # looks like this is one trajectory
diff --git a/setup.py b/setup.py
index b2f41c2ce..b201bad95 100755
--- a/setup.py
+++ b/setup.py
@@ -59,6 +59,7 @@
 Operating System :: MacOS :: MacOS X
 Operating System :: POSIX
 Operating System :: Microsoft :: Windows
+Programming Language :: Python :: 2.7
 Programming Language :: Python :: 3
 Topic :: Scientific/Engineering :: Bio-Informatics
 Topic :: Scientific/Engineering :: Chemistry
@@ -66,7 +67,12 @@
 Topic :: Scientific/Engineering :: Physics
 """
 
-
+from setup_util import lazy_cythonize
+try:
+    from setuptools import setup, Extension, find_packages
+except ImportError:
+    print("PyEMMA requires setuptools. Please install it with conda or pip.")
+    sys.exit(1)
 
 ###############################################################################
 # Extensions