From 93aa602260c818467d287465c0c6d1679199c572 Mon Sep 17 00:00:00 2001 From: edaub Date: Fri, 22 Nov 2019 09:32:54 +0000 Subject: [PATCH] merge input derivative bugfix into devel --- mogp_emulator/GaussianProcess.py | 379 ++++---- mogp_emulator/Kernel.py | 406 +++++---- mogp_emulator/tests/test_Kernel.py | 1308 +++++++++++++++++++--------- setup.py | 2 +- 4 files changed, 1344 insertions(+), 751 deletions(-) diff --git a/mogp_emulator/GaussianProcess.py b/mogp_emulator/GaussianProcess.py index 7ad3cc3e..7e737b62 100644 --- a/mogp_emulator/GaussianProcess.py +++ b/mogp_emulator/GaussianProcess.py @@ -10,21 +10,21 @@ class GaussianProcess(object): """ Implementation of a Gaussian Process Emulator. - + This class provides an interface to fit a Gaussian Process Emulator to a set of training data. The class can be initialized from either a pair of inputs/targets arrays, or a file holding data saved from a previous emulator instance (saved via the ``save_emulator`` method). Once the emulator has been created, the class provides methods for fitting optimal hyperparameters, changing hyperparameter values, making predictions, and other calculations associated with fitting and making predictions. - + The internal emulator structure involves arrays for the inputs, targets, and hyperparameters. Other useful information are the number of training examples ``n`` and the number of input parameters ``D``. These parameters are available externally through the ``get_n`` and ``get_D`` methods - + Example: :: - + >>> import numpy as np >>> from mogp_emulator import GaussianProcess >>> x = np.array([[1., 2., 3.], [4., 5., 6.]]) @@ -44,38 +44,38 @@ class GaussianProcess(object): (array([4.74687618, 6.84934016]), array([0.01639298, 1.05374973]), array([[8.91363045e-05, 7.18827798e-01, 3.74439445e-16], [4.64005897e-06, 3.74191346e-02, 1.94917337e-17]])) - + """ - + def __init__(self, *args): """ Create a new GP Emulator - + Creates a new GP Emulator from either the input data and targets to be fit or a file holding the input/targets and (optionally) learned parameter values. - + Arguments passed to the ``__init__`` method must be either two arguments which are numpy arrays ``inputs`` and ``targets``, described below, three arguments which are the same ``inputs`` and ``targets`` arrays plus a float representing the ``nugget`` parameter, or a single argument which is the filename (string or file handle) of a previously saved emulator. - + ``inputs`` is a 2D array-like object holding the input data, whose shape is ``n`` by ``D``, where ``n`` is the number of training examples to be fit and ``D`` is the number of input variables to each simulation. - + ``targets`` is the target data to be fit by the emulator, also held in an array-like object. This must be a 1D array of length ``n``. - + ``nugget`` is the additional noise added to the emulator targets when fitting. This can take on values ``None`` (in which case, noise will be added adaptively to stabilize fitting), or a non-negative float (in which case, a fixed noise level will be used). If no value is specified for the ``nugget`` parameter, ``None`` is the default. - + If two or three input arguments ``inputs``, ``targets``, and optionally ``nugget`` are given: - + :param inputs: Numpy array holding emulator input parameters. Must be 2D with shape ``n`` by ``D``, where ``n`` is the number of training examples and ``D`` is the number of input parameters for each output. 
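(Editor's aside: a minimal usage sketch of the behaviour this patch touches — derivative predictions now routed through the kernel's input-derivative method — assuming the same toy data as the class docstring above; the exact fitted values and outputs will vary with the optimiser's random starts.)

>>> import numpy as np
>>> from mogp_emulator import GaussianProcess
>>> x = np.array([[1., 2., 3.], [4., 5., 6.]])
>>> y = np.array([4., 6.])
>>> gp = GaussianProcess(x, y)
>>> np.random.seed(47)
>>> loglike, theta = gp.learn_hyperparameters()
>>> x_predict = np.array([[2., 3., 4.], [7., 8., 9.]])
>>> mean, variance, deriv = gp.predict(x_predict, do_deriv=True, do_unc=True)
>>> deriv.shape  # (n_predict, D): one input derivative per prediction point and input dimension
(2, 3)
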
@@ -86,22 +86,22 @@ def __init__(self, *args): noise level explicitly, while if ``None`` is given, the noise will set to be as small as possible to ensure stable inversion of the covariance matrix. Optional, default is ``None``. - + If one input argument ``emulator_file`` is given: - + :param emulator_file: Filename or file object for saved emulator parameters (using the ``save_emulator`` method) - + :type emulator_file: str or file :returns: New ``GaussianProcess`` instance :rtype: GaussianProcess - + """ - + emulator_file = None theta = None nugget = None - + if len(args) == 1: emulator_file = args[0] inputs, targets, theta, nugget = self._load_emulator(emulator_file) @@ -127,19 +127,19 @@ def __init__(self, *args): self.inputs = np.array(inputs) self.targets = np.array(targets) - + self.n = self.inputs.shape[0] self.D = self.inputs.shape[1] - + self.nugget = nugget - + self.kernel = SquaredExponential() - + if not (emulator_file is None or theta is None): self._set_params(theta) else: self.theta = None - + self.mle_theta = None self.samples = None @@ -154,12 +154,12 @@ def train_model(Cls, *init_args): def _load_emulator(self, filename): """ Load saved emulator and parameter values from file - + Method takes the filename of a saved emulator (using the ``save_emulator`` method). The saved emulator may or may not contain the fitted parameters. If there are no parameters found in the emulator file, the method returns ``None`` for the parameters. - + :param filename: File where the emulator parameters are saved. Can be a string filename or a file object. :type filename: str or file @@ -168,15 +168,15 @@ def _load_emulator(self, filename): :rtype: tuple containing 3 ndarrays and a float or 2 ndarrays, a None type, and a float (if no theta values are found in the emulator file) """ - + emulator_file = np.load(filename, allow_pickle=True) - + try: inputs = np.array(emulator_file['inputs']) targets = np.array(emulator_file['targets']) except KeyError: raise KeyError("Emulator file does not contain emulator inputs and targets") - + try: if emulator_file['theta'] is None: theta = None @@ -184,7 +184,7 @@ def _load_emulator(self, filename): theta = np.array(emulator_file['theta']) except KeyError: theta = None - + try: if emulator_file['nugget'] == None: nugget = None @@ -192,92 +192,92 @@ def _load_emulator(self, filename): nugget = float(emulator_file['nugget']) except KeyError: nugget = None - + return inputs, targets, theta, nugget def save_emulator(self, filename): """ Write emulators to disk - + Method saves the emulator to disk using the given filename or file handle. The method writes the inputs and targets arrays to file. If the model has been assigned parameters, either manually or by fitting, those parameters are saved as well. Once saved, the emulator can be read by passing the file name or handle to the one-argument ``__init__`` method. - + :param filename: Name of file (or file handle) to which the emulator will be saved. 
:type filename: str or file :returns: None """ - + emulator_dict = {} emulator_dict['targets'] = self.targets emulator_dict['inputs'] = self.inputs emulator_dict['nugget'] = self.nugget emulator_dict['theta'] = self.theta - + np.savez(filename, **emulator_dict) def get_n(self): """ Returns number of training examples for the emulator - + :returns: Number of training examples for the emulator object :rtype: int """ - + return self.n - + def get_D(self): """ Returns number of inputs for the emulator - + :returns: Number of inputs for the emulator object :rtype: int """ - + return self.D - + def get_params(self): """ Returns emulator parameters - + Returns current parameters for the emulator as a numpy array if they have been fit. If no parameters have been fit, returns None. - + :returns: Current parameter values (numpy array of length ``D + 1``), or ``None`` if the parameters have not been fit. :rtype: ndarray or None """ - + return self.theta - + def get_nugget(self): """ Returns emulator nugget parameter - + Returns current value of the nugget parameter. If the nugget is selected adaptively, returns None. - + :returns: Current nugget value, either a float or ``None`` :rtype: float or None """ - + return self.nugget - + def set_nugget(self, nugget): """ Set the nugget parameter for the emulator - + Method for changing the ``nugget`` parameter for the emulator. When a new emulator is initilized, this is set to None. - + The ``nugget`` parameter controls how noise is added to the covariance matrix in order to stabilize the inversion or smooth the emulator predictions. If ``nugget`` is a non-negative float, then that particular value is used for the nugget. Note that setting this parameter to be zero enforces that the emulator strictly interpolates between points. Alternatively, if ``nugget`` is set to be ``None``, the fitting routine will adaptively make the noise parameter as large as is needed to ensure that the emulator can be fit. - + :param nugget: Controls how noise is added to the emulator. If ``nugget`` is a nonnegative float, then this manually sets the noise parameter (if negative, this will lead to an error), with ``nugget = 0`` resulting in interpolation with no @@ -288,19 +288,19 @@ def set_nugget(self, nugget): :returns: None :rtype: None """ - + if not nugget == None: nugget = float(nugget) assert nugget >= 0., "noise parameter must be nonnegative" self.nugget = nugget - + def _jit_cholesky(self, Q, maxtries = 5): """ Performs Jittered Cholesky Decomposition - + Performs a Jittered Cholesky decomposition, adding noise to the diagonal of the matrix as needed in order to ensure that the matrix can be inverted. Adapted from code in GPy. - + On occasion, the matrix that needs to be inverted in fitting a GP is nearly singular. This arises when the training samples are very close to one another, and can be averted by adding a noise term to the diagonal of the matrix. This routine performs an exact Cholesky decomposition if it can @@ -308,7 +308,7 @@ def _jit_cholesky(self, Q, maxtries = 5): the mean of the diagonal and incrementing by a factor of 10 each time) until the matrix can be decomposed or the algorithm reaches ``maxtries`` attempts. The routine returns the lower triangular matrix and the amount of noise necessary to stabilize the decomposition. - + :param Q: The matrix to be inverted as an array of shape ``(n,n)``. Must be a symmetric positive definite matrix. :type Q: ndarray @@ -319,9 +319,9 @@ def _jit_cholesky(self, Q, maxtries = 5): the diagonal to achieve that result. 
:rtype: tuple containing an ndarray and a float """ - + assert int(maxtries) > 0, "maxtries must be a positive integer" - + Q = np.ascontiguousarray(Q) L, info = lapack.dpotrf(Q, lower = 1) if info == 0: @@ -347,76 +347,76 @@ def _jit_cholesky(self, Q, maxtries = 5): logging.warning('\n'.join(['Added jitter of {:.10e}'.format(jitter), ' in '+traceback.format_list(traceback.extract_stack(limit=3)[-2:-1])[0][2:]])) return L, jitter - + def _prepare_likelihood(self): """ Pre-calculates matrices needed for fitting and making predictions - + Pre-calculates the matrices needed to compute the log-likelihood and make subsequent predictions. This is called any time the hyperparameter values are changed in order to ensure that all the information is needed to evaluate the log-likelihood and its derivatives, which are needed when fitting the optimal hyperparameters. - - The method computes the covariance matrix (assuming a squared exponential kernel) + + The method computes the covariance matrix (assuming a squared exponential kernel) and inverts it using the jittered cholesky decomposition. Some additional information is also pre-computed and stored. This method has no inputs and no return value, but it does modify the state of the object. - + :returns: None """ assert not self.theta is None, "Must set a parameter value to fit a GP" self.Q = self.kernel.kernel_f(self.inputs, self.inputs, self.theta) - + if self.nugget == None: L, nugget = self._jit_cholesky(self.Q) self.Z = self.Q + nugget*np.eye(self.n) else: self.Z = self.Q + self.nugget*np.eye(self.n) L = linalg.cholesky(self.Z, lower=True) - + self.invQ = np.linalg.inv(L.T).dot(np.linalg.inv(L)) self.invQt = np.dot(self.invQ, self.targets) self.logdetQ = 2.0 * np.sum(np.log(np.diag(L))) - + def _set_params(self, theta): """ Method for setting the hyperparameters for the emulator - + This method is used to reset the value of the hyperparameters for the emulator and update the log-likelihood. It is used after fitting the hyperparameters or when loading an emulator from file. Input ``theta`` must be array-like with shape ``(D + 1,)``, where ``D`` is the number of input parameters. - + :param theta: Parameter values to be used for the emulator. Must be array-like and have shape ``(D + 1,)`` :type theta: ndarray :returns: None """ - + theta = np.array(theta) assert theta.shape == (self.D + 1,), "Parameter vector must have length number of inputs + 1" - + self.theta = theta self._prepare_likelihood() - + def loglikelihood(self, theta): """ Calculate the negative log-likelihood at a particular value of the hyperparameters - + Calculate the negative log-likelihood for the given set of parameters. Calling this method sets the parameter values and computes the needed inverse matrices in order to evaluate the log-likelihood and its derivatives. In addition to returning the log-likelihood value, it stores the current value of the hyperparameters and log-likelihood in attributes of the object. - + :param theta: Value of the hyperparameters. Must be array-like with shape ``(D + 1,)`` :type theta: ndarray :returns: negative log-likelihood :rtype: float """ - + self._set_params(theta) loglikelihood = (0.5 * self.logdetQ + @@ -424,15 +424,15 @@ def loglikelihood(self, theta): 0.5 * self.n * np.log(2. * np.pi)) return loglikelihood - + def partial_devs(self, theta): """ Calculate the partial derivatives of the negative log-likelihood - + Calculate the partial derivatives of the negative log-likelihood with respect to the hyperparameters. 
Note that this function is normally used only when fitting the hyperparameters, and it is not needed to make predictions. - + During normal use, the ``partial_devs`` method is called after evaluating the ``loglikelihood`` method. The implementation takes advantage of this by storing the inverse of the covariance matrix, which is expensive to compute and is used @@ -441,7 +441,7 @@ def partial_devs(self, theta): the log-likelihood, the method calls ``_set_params`` to compute the needed information. However, caling ``partial_devs`` does not evaluate the log-likelihood, so it does not change the cached values of the parameters or log-likelihood. - + :param theta: Value of the hyperparameters. Must be array-like with shape ``(D + 1,)`` :type theta: ndarray :returns: partial derivatives of the negative log-likelihood (array with shape @@ -450,29 +450,29 @@ def partial_devs(self, theta): """ assert theta.shape == (self.D + 1,), "Parameter vector must have length number of inputs + 1" - + if not np.allclose(np.array(theta), self.theta): self._set_params(theta) - + partials = np.zeros(self.D + 1) - + dKdtheta = self.kernel.kernel_deriv(self.inputs, self.inputs, self.theta) - + for d in range(self.D + 1): partials[d] = -0.5 * (np.dot(self.invQt, np.dot(dKdtheta[d], self.invQt)) - np.sum(self.invQ * dKdtheta[d])) - + return partials - + def hessian(self, theta): """ Calculate the Hessian of the negative log-likelihood - + Calculate the Hessian of the negative log-likelihood with respect to the hyperparameters. Note that this function is normally used only when fitting the hyperparameters, and it is not needed to make predictions. It is also used to estimate an appropriate step size when fitting hyperparameters using the lognormal approximation or MCMC sampling. - + When used in an optimization routine, the ``hessian`` method is called after evaluating the ``loglikelihood`` method. The implementation takes advantage of this by storing the inverse of the covariance matrix, which is expensive to @@ -481,7 +481,7 @@ def hessian(self, theta): used to set the log-likelihood, the method calls ``_set_params`` to compute the needed information. However, caling ``hessian`` does not evaluate the log-likelihood, so it does not change the cached values of the parameters or log-likelihood. - + :param theta: Value of the hyperparameters. Must be array-like with shape ``(D + 1,)`` :type theta: ndarray :returns: Hessian of the negative log-likelihood (array with shape @@ -490,44 +490,44 @@ def hessian(self, theta): """ assert theta.shape == (self.D + 1,), "Parameter vector must have length number of inputs + 1" - + if not np.allclose(np.array(theta), self.theta): self._set_params(theta) - + hessian = np.zeros((self.D + 1, self.D + 1)) - + dKdtheta = self.kernel.kernel_deriv(self.inputs, self.inputs, self.theta) d2Kdtheta2 = self.kernel.kernel_hessian(self.inputs, self.inputs, self.theta) - + for d1 in range(self.D + 1): for d2 in range(self.D + 1): - hessian[d1, d2] = 0.5*(np.linalg.multi_dot([self.invQt, + hessian[d1, d2] = 0.5*(np.linalg.multi_dot([self.invQt, 2.*np.linalg.multi_dot([dKdtheta[d1], self.invQ, dKdtheta[d2]])-d2Kdtheta2[d1, d2], self.invQt])- np.trace(np.linalg.multi_dot([self.invQ, dKdtheta[d1], self.invQ, dKdtheta[d2]]) -np.dot(self.invQ, d2Kdtheta2[d1, d2]))) - + return hessian - + def _learn(self, theta0, method = 'L-BFGS-B', **kwargs): """ Minimize log-likelihood function wrt the hyperparameters - + Minimize the negative log-likelihood function, with a starting value given by ``theta0``. 
This is done via any gradient-based method available through scipy (see the scipy documentation for details), and options can be passed to the minimization routine. The default minimization routine is ``L-BFGS-B'``, but this can be specified. - + The method is dumb and returns the last value returned from the minimization routine irrespective of any error flags returned by the minimization function. This is not necessarily a cause for concern, as (1) the parent routine to this function is configured to do a certain number of attempts, taking the best result and (2) application of the GP does not require that the hyperparameters are at a true minimum of the log-likelihood function, just at a value that leads to predictions that are good enough. - - The method returns the hyperparameter values as an array with shape ``(D + 1,)`` + + The method returns the hyperparameter values as an array with shape ``(D + 1,)`` and the minimimum negative log-likelihood value found. - + :param theta0: Starting value for the minimization routine. Must be an array with shape ``(D + 1,)`` :type theta0: ndarray @@ -540,41 +540,41 @@ def _learn(self, theta0, method = 'L-BFGS-B', **kwargs): minimum negative log-likelihood value :rtype: tuple containing a ndarray and a float """ - + self._set_params(theta0) - - fmin_dict = minimize(self.loglikelihood, theta0, method = method, jac = self.partial_devs, + + fmin_dict = minimize(self.loglikelihood, theta0, method = method, jac = self.partial_devs, options = kwargs) - + return fmin_dict['x'], fmin_dict['fun'] - + def learn_hyperparameters(self, n_tries = 15, theta0 = None, method = 'L-BFGS-B', **kwargs): """ Fit hyperparameters by attempting to minimize the negative log-likelihood - + Fits the hyperparameters by attempting to minimize the negative log-likelihood multiple times from a given starting location and using a particular minimization method. The best result found among all of the attempts is returned, unless all attempts to fit the parameters result in an error (see below). - + If the method encounters an overflow (this can result because the parameter values stored are the logarithm of the actual hyperparameters to enforce positivity) or a linear algebra error (occurs when the covariance matrix cannot be inverted, even with the addition of additional noise added along the diagonal if adaptive noise was selected by setting the nugget parameter to be None), the iteration is skipped. If all attempts to find optimal hyperparameters result in an error, then the method raises an exception. - + The ``theta0`` parameter is the point at which the first iteration will start. If more than one attempt is made, subsequent attempts will use random starting points. - + The user can specify the details of the minimization method, using any of the gradient-based optimizers available in ``scipy.optimize.minimize``. Any additional parameters beyond the method specification can be passed as keyword arguments. - + The method returns the minimum negative log-likelihood found and the parameter values at which that minimum was obtained. The method also sets the current values of the hyperparameters to these optimal values and pre-computes the matrices needed to make predictions. - + :param n_tries: Number of attempts to minimize the negative log-likelihood function. Must be a positive integer (optional, default is 15) :type n_tries: int @@ -593,15 +593,15 @@ def learn_hyperparameters(self, n_tries = 15, theta0 = None, method = 'L-BFGS-B' to make predictions. 
:rtype: tuple containing a float and an ndarray """ - + n_tries = int(n_tries) assert n_tries > 0, "number of attempts must be positive" - + np.seterr(divide = 'raise', over = 'raise', invalid = 'raise') - + loglikelihood_values = [] theta_values = [] - + theta_startvals = 5.*(np.random.rand(n_tries, self.D + 1) - 0.5) if not theta0 is None: theta0 = np.array(theta0) @@ -617,29 +617,29 @@ def learn_hyperparameters(self, n_tries = 15, theta0 = None, method = 'L-BFGS-B' print("Matrix not positive definite, skipping this iteration") except FloatingPointError: print("Floating point error in optimization routine, skipping this iteration") - + if len(loglikelihood_values) == 0: raise RuntimeError("Minimization routine failed to return a value") - + loglikelihood_values = np.array(loglikelihood_values) idx = np.argmin(loglikelihood_values) - + self._set_params(theta_values[idx]) self.mle_theta = theta_values[idx] - + return loglikelihood_values[idx], theta_values[idx] - + def compute_local_covariance(self): """ Estimate local covariance matrix around the MLE parameters - + This method inverts the hessian matrix to get the local covariance matrix around the MLE parameters. Note that if the MLE parameters have not been estimated, they will be found first prior to inverting the Hessian. The local Hessian should be positive definite if the MLE parameters are at a local minimum of the negative log-likelihood, so if the routine encounters a non-positive definite matrix it will raise an error. Returns the inverse of the Hessian matrix evaluated at the MLE parameter values. - + :returns: Inverse of the Hessian matrix evaluated at the MLE parameter values. This is a 2D array with shape ``(D + 1, D + 1)``. :rtype: ndarray @@ -649,48 +649,48 @@ def compute_local_covariance(self): self.learn_hyperparameters() hess = self.hessian(self.mle_theta) - + assert hess.ndim == 2 assert hess.shape[0] == hess.shape[1] - + try: L = np.linalg.cholesky(hess) cov = np.linalg.inv(L.T).dot(np.linalg.inv(L)) except linalg.LinAlgError: raise linalg.LinAlgError("Hessian matrix is not symmetric positive definite, optimization may not have converged") - + return cov - + def learn_hyperparameters_MLE(self, n_tries = 15, theta0 = None, method = 'L-BFGS-B', **kwargs): """ Fit hyperparameters by attempting to minimize the negative log-likelihood - + This method an alias for ``learn_hyperparameters`` to distinguish it from other methods for estimating hyperparameters. - + Fits the hyperparameters by attempting to minimize the negative log-likelihood multiple times from a given starting location and using a particular minimization method. The best result found among all of the attempts is returned, unless all attempts to fit the parameters result in an error (see below). - + If the method encounters an overflow (this can result because the parameter values stored are the logarithm of the actual hyperparameters to enforce positivity) or a linear algebra error (occurs when the covariance matrix cannot be inverted, even with the addition of additional noise added along the diagonal if adaptive noise was selected by setting the nugget parameter to be None), the iteration is skipped. If all attempts to find optimal hyperparameters result in an error, then the method raises an exception. - + The ``theta0`` parameter is the point at which the first iteration will start. If more than one attempt is made, subsequent attempts will use random starting points. 
- + The user can specify the details of the minimization method, using any of the gradient-based optimizers available in ``scipy.optimize.minimize``. Any additional parameters beyond the method specification can be passed as keyword arguments. - + The method returns the minimum negative log-likelihood found and the parameter values at which that minimum was obtained. The method also sets the current values of the hyperparameters to these optimal values and pre-computes the matrices needed to make predictions. - + :param n_tries: Number of attempts to minimize the negative log-likelihood function. Must be a positive integer (optional, default is 15) :type n_tries: int @@ -709,13 +709,13 @@ def learn_hyperparameters_MLE(self, n_tries = 15, theta0 = None, method = 'L-BFG to make predictions. :rtype: tuple containing a float and an ndarray """ - + return self.learn_hyperparameters(n_tries, theta0, method, **kwargs) - + def learn_hyperparameters_normalapprox(self, n_samples = 1000): """ Sample hyperparameters via a normal approximation around MLE solution - + Sample hyperparameters via a multivariate normal approximation around the MLE parameters. This method first obtains an MLE estimate of the hyperparameters, and then draws samples assuming the posterior follows an approximate normal distribution around the MLE @@ -726,28 +726,28 @@ def learn_hyperparameters_normalapprox(self, n_samples = 1000): but sets the ``samples`` class attribute to a 2D array with shape ``(n_samples, D + 1)``, where the first dimension indicates the different samples and the second dimension specifies the different hyperparameters. - + :param n_samples: Number of samples to be drawn. Must be a positive integer. :type n_samples: int :returns: None """ - + n_samples = int(n_samples) assert n_samples > 0 - + n_params = self.D + 1 - + if self.mle_theta is None: self.learn_hyperparameters() - + cov = self.compute_local_covariance() - + self.samples = np.random.multivariate_normal(self.mle_theta, cov, size=n_samples) - + def learn_hyperparameters_MCMC(self, n_samples = 1000, thin = 0): """ Sample hyperparameters via MCMC estimation - + Sample hyperparameters via MCMC estimation. Parameters are found by doing a random walk in parameter space, choosing new points via the Metropolis-Hastings algorithm. Steps are drawn from a multivariate normal distribution around the current parameters, @@ -757,7 +757,7 @@ def learn_hyperparameters_MCMC(self, n_samples = 1000, thin = 0): does not require a "burn-in" phase. Optional parameters specify the number of MCMC steps to take (must be a positive integer, default is 1000) and information about how to thin the MCMC chain to obtain uncorrelated samples. - + Thinning may be specified with a non-negative integer. If a positive integer is given, the chain will be thinned by only keeping every ``thin`` steps. Note that ``thin = 1`` means that the chain will not be thinned. If ``thin = 0`` is given @@ -767,14 +767,14 @@ def learn_hyperparameters_MCMC(self, n_samples = 1000, thin = 0): (usually occurrs if the posterior is multimodal), the chain will not be thinned and a warning will be given. More details on the autothinning procedure are described in the corresponding function. - + Does not return a value, but sets the ``samples`` class attribute to a 2D array with shape ``(n_chain, D + 1)``, where the first dimension indicates the different samples and the second dimension specifies the different hyperparameters. 
Note that ``n_chain`` will only be the same as ``n_samples`` if ``thin = 1`` is specified or if autothinning fails. If you wish to obtain a specific number of samples in the thinned chain, you will need to modify ``n_samples`` and ``thin`` appropriately. - + Note that at present, the return information from the MCMC sampler is not returned or cached. The code does give a warning if a problem arises, in particular if the acceptance rate is not within the target range of 20% to 60% or if the final MCMC @@ -782,11 +782,11 @@ def learn_hyperparameters_MCMC(self, n_samples = 1000, thin = 0): If either of these warnings occur, the MCMC chain may require further inspection. At the moment, this can only be done by re-running the MCMC samples using the function ``sample_MCMC`` in the ``MCMC`` submodule manually. - + :param n_samples: Number of MCMC steps to be taken. Must be a positive integer. :type n_samples: int :param thin: Specifies how the chain is thinned to remove correlations. Must be - a non-negative integer. If a positive integer ``k`` is used, it will + a non-negative integer. If a positive integer ``k`` is used, it will keep every ``k`` samples. Note that ``thin = 1`` indicates that the chain will not be thinned. ``thin = 0`` will attempt to autothin the chain using the autocorrelation of the MCMC chain. Default is 0. @@ -801,7 +801,7 @@ def learn_hyperparameters_MCMC(self, n_samples = 1000, thin = 0): assert thin >= 0 n_params = self.D + 1 - + if self.mle_theta is None: self.learn_hyperparameters() @@ -813,7 +813,7 @@ def learn_hyperparameters_MCMC(self, n_samples = 1000, thin = 0): if acceptance < 0.2 or acceptance > 0.6: warnings.warn("acceptance rate of "+str(100.*acceptance)+"% not within bounds") - + if np.max(first_lag) > 3./np.sqrt(len(self.samples)): warnings.warn("autocorrelation of "+str(np.max(first_lag))+ " not within bounds. posterior may be multimodal or require thinning.") @@ -821,23 +821,23 @@ def learn_hyperparameters_MCMC(self, n_samples = 1000, thin = 0): def _predict_single(self, testing, do_deriv = True, do_unc = True): """ Make a prediction for a set of input vectors for a single set of hyperparameters - + Note that the class provides a public ``predict`` method which calls this method for the appropriate case, so this should not need to be used in ordinary circumstances. - + Makes predictions for the emulator on a given set of input vectors. The input vectors must be passed as a ``(n_predict, D)`` or ``(D,)`` shaped array-like object, where ``n_predict`` is the number of different prediction points under consideration and ``D`` is the number of inputs to the emulator. If the prediction inputs array has shape ``(D,)``, then the method assumes ``n_predict == 1``. The prediction is returned as an ``(n_predict, )`` shaped numpy array as the first return value from the method. - - Optionally, the emulator can also calculate the variances in the predictions + + Optionally, the emulator can also calculate the variances in the predictions and the derivatives with respect to each input parameter. If the uncertainties are computed, they are returned as the second output from the method as an ``(n_predict,)`` shaped numpy array. If the derivatives are computed, they are returned as the third output from the method as an ``(n_predict, D)`` shaped numpy array. - + :param testing: Array-like object holding the points where predictions will be made. 
Must have shape ``(n_predict, D)`` or ``(D,)`` (for a single prediction) :type testing: ndarray @@ -856,33 +856,32 @@ def _predict_single(self, testing, do_deriv = True, do_unc = True): ``None``. :rtype: tuple """ - + testing = np.array(testing) if len(testing.shape) == 1: testing = np.reshape(testing, (1, len(testing))) assert len(testing.shape) == 2 - + n_testing, D = np.shape(testing) assert D == self.D - + exp_theta = np.exp(self.theta) Ktest = self.kernel.kernel_f(self.inputs, testing, self.theta) mu = np.dot(Ktest.T, self.invQt) - + var = None if do_unc: var = np.maximum(exp_theta[self.D] - np.sum(Ktest * np.dot(self.invQ, Ktest), axis=0), 0.) - + deriv = None if do_deriv: deriv = np.zeros((n_testing, self.D)) + kern_deriv = self.kernel.kernel_inputderiv(testing, self.inputs, self.theta) for d in range(self.D): - aa = (self.inputs[:, d].flatten()[None, :] - testing[:, d].flatten()[:, None]) - c = Ktest * aa.T - deriv[:, d] = exp_theta[d] * np.dot(c.T, self.invQt) - + deriv[:, d] = np.dot(kern_deriv[d], self.invQt) + return mu, var, deriv @@ -894,27 +893,27 @@ def __call__(self, testing): """ return (self.predict(testing, do_deriv=False, do_unc=False)[0]) - + def _predict_samples(self, testing, do_deriv = True, do_unc = True): """ Make a prediction for a set of input vectors for a set of hyperparameter posterior samples - + Note that the class provides a public ``predict`` method which calls this method for the appropriate case, so this should not need to be used in ordinary circumstances. - + Makes predictions for the emulator on a given set of input vectors. The input vectors must be passed as a ``(n_predict, D)`` or ``(D,)`` shaped array-like object, where ``n_predict`` is the number of different prediction points under consideration and ``D`` is the number of inputs to the emulator. If the prediction inputs array has shape ``(D,)``, then the method assumes ``n_predict == 1``. The prediction is returned as an ``(n_predict, )`` shaped numpy array as the first return value from the method. - - Optionally, the emulator can also calculate the variances in the predictions + + Optionally, the emulator can also calculate the variances in the predictions and the derivatives with respect to each input parameter. If the uncertainties are computed, they are returned as the second output from the method as an ``(n_predict,)`` shaped numpy array. If the derivatives are computed, they are returned as the third output from the method as an ``(n_predict, D)`` shaped numpy array. - + For this method to work, hyperparameter samples must have been drawn via the ``learn_hyperparameters_normalapprox`` or ``learn_hyperparameters_MCMC`` methods. If samples have not been drawn, predictions fall back onto using the MLE @@ -923,7 +922,7 @@ def _predict_samples(self, testing, do_deriv = True, do_unc = True): expensive for large numbers of samples or large numbers of inputs as the matrix inverse must be computed for each hyperparameter samples. Predictions from a single set of parameters used the cached matrix inverse, so these predictions are much more efficient. - + :param testing: Array-like object holding the points where predictions will be made. Must have shape ``(n_predict, D)`` or ``(D,)`` (for a single prediction) :type testing: ndarray @@ -942,19 +941,19 @@ def _predict_samples(self, testing, do_deriv = True, do_unc = True): ``None``. 
:rtype: tuple """ - + testing = np.array(testing) if len(testing.shape) == 1: testing = np.reshape(testing, (1, len(testing))) assert len(testing.shape) == 2 - + n_testing, D = np.shape(testing) assert D == self.D - + if self.samples is None: warnings.warn("hyperparameter samples have not been drawn, trying single parameter predictions") return self.predict(testing, do_deriv, do_unc, predict_from_samples = False) - + n_samples = self.samples.shape[0] mu = np.zeros((n_samples, n_testing)) @@ -964,7 +963,7 @@ def _predict_samples(self, testing, do_deriv = True, do_unc = True): for i in range(n_samples): self._set_params(self.samples[i]) mu[i], var[i], deriv[i] = self._predict_single(testing, do_deriv, do_unc) - + mu_mean = np.mean(mu, axis = 0) if do_unc: var_mean = np.mean(var, axis = 0)+np.var(mu, axis = 0) @@ -974,26 +973,26 @@ def _predict_samples(self, testing, do_deriv = True, do_unc = True): deriv_mean = np.mean(deriv, axis = 0) else: deriv_mean = None - + return mu_mean, var_mean, deriv_mean - + def predict(self, testing, do_deriv = True, do_unc = True, predict_from_samples = False): """ Make a prediction for a set of input vectors - + Makes predictions for the emulator on a given set of input vectors. The input vectors must be passed as a ``(n_predict, D)`` or ``(D,)`` shaped array-like object, where ``n_predict`` is the number of different prediction points under consideration and ``D`` is the number of inputs to the emulator. If the prediction inputs array has shape ``(D,)``, then the method assumes ``n_predict == 1``. The prediction is returned as an ``(n_predict, )`` shaped numpy array as the first return value from the method. - - Optionally, the emulator can also calculate the variances in the predictions + + Optionally, the emulator can also calculate the variances in the predictions and the derivatives with respect to each input parameter. If the uncertainties are computed, they are returned as the second output from the method as an ``(n_predict,)`` shaped numpy array. If the derivatives are computed, they are returned as the third output from the method as an ``(n_predict, D)`` shaped numpy array. - + If predictions based on samples of the hyperparameters (drawn by either assuming a normal posterior or using MCMC sampling) are desired, hyperparameter samples must have been drawn via the ``learn_hyperparameters_normalapprox`` or ``learn_hyperparameters_MCMC`` @@ -1005,14 +1004,14 @@ def predict(self, testing, do_deriv = True, do_unc = True, predict_from_samples large numbers of inputs as the matrix inverse must be computed for each hyperparameter samples. Predictions from a single set of parameters used the cached matrix inverse, so these predictions are much more efficient. - + If predictions from a single set of parameters are desired, and the GP does not have a current set of parameters, the code raises an error. If the code does have a current set of parameters but the MLE parameters have not been estimated, it gives a warning but continues with the predictions using the current parameters. If it has current parameters as well as MLE parameters but they differ, the code issues a warning but continues with the predictions using the current parameters. - + :param testing: Array-like object holding the points where predictions will be made. Must have shape ``(n_predict, D)`` or ``(D,)`` (for a single prediction) :type testing: ndarray @@ -1034,9 +1033,9 @@ def predict(self, testing, do_deriv = True, do_unc = True, predict_from_samples ``None``. 
:rtype: tuple """ - + assert not self.theta is None, "Must set a parameter value to make predictions" - + if predict_from_samples: return self._predict_samples(testing, do_deriv, do_unc) else: @@ -1045,15 +1044,15 @@ def predict(self, testing, do_deriv = True, do_unc = True, predict_from_samples elif not np.allclose(self.mle_theta, self.theta): warnings.warn("Warning: Current parameters are not MLE values") return self._predict_single(testing, do_deriv, do_unc) - + def __str__(self): """ Returns a string representation of the model - + :returns: A string representation of the model (indicates number of training examples and inputs) :rtype: str """ - + return "Gaussian Process with "+str(self.n)+" training examples and "+str(self.D)+" input variables" diff --git a/mogp_emulator/Kernel.py b/mogp_emulator/Kernel.py index 17155bf9..dcea194c 100644 --- a/mogp_emulator/Kernel.py +++ b/mogp_emulator/Kernel.py @@ -1,4 +1,4 @@ -""" +r""" Kernel module, implements a few standard stationary kernels for use with the ``GaussianProcess`` class. At present, kernels can only be selected manually by setting the ``kernel`` attribute of the GP. The default is to use the ``SquaredExponential`` @@ -9,16 +9,16 @@ from scipy.spatial.distance import cdist class Kernel(object): - """ + r""" Generic class representing a stationary kernel - + This base class implements the necessary scaffolding for defining a stationary kernel. - Stationary kernels are only dependent on a distance measure between any two points, so + Stationary kernels are only dependent on a distance measure between any two points, so the base class holds all the necessary information for doing the distance computation. Individual subclasses will implement the functional dependence of the kernel on the distance, plus first and second derivatives (if desired) to compute the gradient or Hessian of the kernel with respect to the hyperparameters. - + This implementation uses a scaled euclidean distance metric. Each individual parameter has a hyperparameter scale associated with it that is used in the distance computation. If a different metric is to be defined, a new base class needs to be defined that @@ -26,36 +26,36 @@ class Kernel(object): methods if gradient or Hessian computation is desired. The methods ``kernel_f``, ``kernel_gradient``, and ``kernel_hessian`` can then be used to compute the appropriate quantities with no further modification. - + Note that the Kernel object just collates all of the methods together; the class itself does not hold any information on the data point or hyperparamters, which are passed directly to the appropriate methods. Thus, no information needs to be provided when creating a new ``Kernal`` instance. - """ + """ def __str__(self): - """ + r""" Defines a string representation of the kernel - + Returns a string representation of the kernel. Note that since the kernel just collects methods for kernel evaluations together with no data, this is just a basic string that will not change for different instances of the class. - + :returns: String representation of the kernel :rtype: str """ return "Stationary Kernel" def _check_inputs(self, x1, x2, params): - """ + r""" Common function for checking dimensions of inputs - + This function checks the inputs to any kernel evaluation for consistency and ensures that all input arrays have the correct dimensionality. It returns the reformatted arrays, the number of inputs, and the number of hyperparameters. 
If the method determines that the array dimensions are not all consistent with one another, it will raise an ``AssertionError``. This method is called internally whenever the kernel is evaluated. - + :param x1: First parameter array. Should be a 1-D or 2-D array (1-D is acceptable if either there is only a single point, or each point has only a single parameter). If there is more than one parameter, the last dimension @@ -74,16 +74,16 @@ def _check_inputs(self, x1, x2, params): ``(n2, D - 1)``, and ``params`` will be an array with dimensions ``(D,)``. ``n1``, ``n2``, and ``D`` will be integers. """ - + params = np.array(params) assert params.ndim == 1, "parameters must be a vector" D = len(params) assert D >= 2, "minimum number of parameters in a covariance kernel is 2" - + x1 = np.array(x1) - + assert x1.ndim == 1 or x1.ndim == 2, "bad number of dimensions in input x1" - + if x1.ndim == 2: assert x1.shape[1] == D - 1, "bad shape for x1" else: @@ -91,13 +91,13 @@ def _check_inputs(self, x1, x2, params): x1 = np.reshape(x1, (len(x1), 1)) else: x1 = np.reshape(x1, (1, D - 1)) - + n1 = x1.shape[0] - + x2 = np.array(x2) - + assert x2.ndim == 1 or x2.ndim == 2, "bad number of dimensions in input x2" - + if x2.ndim == 2: assert x2.shape[1] == D - 1, "bad shape for x2" else: @@ -105,22 +105,22 @@ def _check_inputs(self, x1, x2, params): x2 = np.reshape(x2, (len(x2), 1)) else: x2 = np.reshape(x2, (1, D - 1)) - + n2 = x2.shape[0] - + return x1, n1, x2, n2, params, D - + def calc_r(self, x1, x2, params): - """ + r""" Calculate distance between all pairs of points - + This method computes the scaled Euclidean distance between all pairs of points in ``x1`` and ``x2``. Each component distance is multiplied by the corresponding hyperparameter prior to summing and taking the square root. For example, if ``x1 = [1.]``, ``x2`` = [2.], and ``params = [2., 2.]`` then ``calc_r`` would return :math:`{\sqrt{2(1 - 2)^2}=\sqrt{2}}` as an array with shape ``(1,1)``. - + :param x1: First input array. Must be a 1-D or 2-D array, with the length of the last dimension matching the last dimension of ``x2`` and one less than the length of ``params``. ``x1`` may be 1-D if either @@ -140,20 +140,20 @@ def calc_r(self, x1, x2, params): of the first axis of ``x2``. :rtype: ndarray """ - + x1, n1, x2, n2, params, D = self._check_inputs(x1, x2, params) - + exp_theta = np.exp(-params[:(D - 1)]) - + r_matrix = cdist(x1, x2, "seuclidean", V = exp_theta) - + return r_matrix def calc_drdtheta(self, x1, x2, params): - """ + r""" Calculate the first derivative of the distance between all pairs of points with respect to the hyperparameters - + This method computes the derivative of the scaled Euclidean distance between all pairs of points in ``x1`` and ``x2`` with respect to the hyperparameters. The gradient is held in an array with shape ``(D, n1, n2)``, where ``D`` is @@ -161,7 +161,7 @@ def calc_drdtheta(self, x1, x2, params): and ``n2`` is the length of the first axis of ``x2``. This is used in the computation of the gradient and Hessian of the kernel. The first index represents the different derivatives with respect to each hyperparameter. - + :param x1: First input array. Must be a 1-D or 2-D array, with the length of the last dimension matching the last dimension of ``x2`` and one less than the length of ``params``. ``x1`` may be 1-D if either @@ -184,27 +184,28 @@ def calc_drdtheta(self, x1, x2, params): with respect to the first parameter is [0,:,:], etc.) 
:rtype: ndarray """ - + x1, n1, x2, n2, params, D = self._check_inputs(x1, x2, params) - + exp_theta = np.exp(-params[:(D - 1)]) - + drdtheta = np.zeros((D - 1, n1, n2)) - + r_matrix = self.calc_r(x1, x2, params) r_matrix[(r_matrix == 0.)] = 1. - + for d in range(D - 1): - drdtheta[d] = 0.5 * np.exp(params[d]) / r_matrix * cdist(np.reshape(x1[:,d], (n1, 1)), - np.reshape(x2[:,d], (n2, 1)), "sqeuclidean") - + drdtheta[d] = (0.5 * np.exp(params[d]) / r_matrix * + cdist(np.reshape(x1[:,d], (n1, 1)), + np.reshape(x2[:,d], (n2, 1)), "sqeuclidean")) + return drdtheta - + def calc_d2rdtheta2(self, x1, x2, params): - """ + r""" Calculate all second derivatives of the distance between all pairs of points with respect to the hyperparameters - + This method computes all second derivatives of the scaled Euclidean distance between all pairs of points in ``x1`` and ``x2`` with respect to the hyperparameters. The gradient is held in an array with shape ``(D, D, n1, n2)``, @@ -212,7 +213,7 @@ def calc_d2rdtheta2(self, x1, x2, params): of ``x1``, and ``n2`` is the length of the first axis of ``x2``. This is used in the computation of the gradient and Hessian of the kernel. The first two indices represents the different derivatives with respect to each hyperparameter. - + :param x1: First input array. Must be a 1-D or 2-D array, with the length of the last dimension matching the last dimension of ``x2`` and one less than the length of ``params``. ``x1`` may be 1-D if either @@ -236,38 +237,92 @@ def calc_d2rdtheta2(self, x1, x2, params): respect to the first and second parameters is [0,1,:,:] or [1,0,:,:], etc.) :rtype: ndarray """ - + x1, n1, x2, n2, params, D = self._check_inputs(x1, x2, params) - + exp_theta = np.exp(-params[:(D - 1)]) - + d2rdtheta2 = np.zeros((D - 1, D - 1, n1, n2)) - + r_matrix = self.calc_r(x1, x2, params) r_matrix[(r_matrix == 0.)] = 1. - + for d1 in range(D - 1): for d2 in range(D - 1): if d1 == d2: d2rdtheta2[d1, d2] = (0.5*np.exp(params[d1]) / r_matrix * cdist(np.reshape(x1[:,d1], (n1, 1)), np.reshape(x2[:,d1], (n2, 1)), "sqeuclidean")) - d2rdtheta2[d1, d2] -= (0.25 * np.exp(params[d1]) * np.exp(params[d2]) / r_matrix**3 * - cdist(np.reshape(x1[:,d1], (n1, 1)), np.reshape(x2[:,d1], (n2, 1)), "sqeuclidean")* - cdist(np.reshape(x1[:,d2], (n1, 1)), np.reshape(x2[:,d2], (n2, 1)), "sqeuclidean")) - + d2rdtheta2[d1, d2] -= (0.25 * np.exp(params[d1]) * + np.exp(params[d2]) / r_matrix**3 * + cdist(np.reshape(x1[:,d1], (n1, 1)), + np.reshape(x2[:,d1], (n2, 1)), "sqeuclidean")* + cdist(np.reshape(x1[:,d2], (n1, 1)), + np.reshape(x2[:,d2], (n2, 1)), "sqeuclidean")) + return d2rdtheta2 - - def kernel_f(self, x1, x2, params): + + def calc_drdx(self, x1, x2, params): + r""" + Calculate the first derivative of the distance between all pairs of points with + respect to the first set of inputs + + This method computes the derivative of the scaled Euclidean distance between + all pairs of points in ``x1`` and ``x2`` with respect to the first input ``x1``. + The gradient is held in an array with shape ``(D - 1, n1, n2)``, where ``D`` is the + length of ``params``, ``n1`` is the length of the first axis of + ``x1``, and ``n2`` is the length of the first axis of ``x2``. This is used in the + computation of the derivative of the kernel with respect to the inputs. The first + index represents the different derivatives with respect to each input dimension. + + :param x1: First input array. 
Must be a 1-D or 2-D array, with the length of + the last dimension matching the last dimension of ``x2`` and + one less than the length of ``params``. ``x1`` may be 1-D if either + each point consists of a single parameter (and ``params`` has length + 2) or the array only contains a single point (in which case, the array + will be reshaped to ``(1, D - 1)``). + :type x1: array-like + :param x2: Second input array. The same restrictions that apply to ``x1`` also + apply here. + :type x2: array-like + :param params: Hyperparameter array. Must be 1-D with length one greater than + the last dimension of ``x1`` and ``x2``. + :type params: array-like + :returns: Array holding the derivative of the pair-wise distances between + points in arrays ``x1`` and ``x2`` with respect to ``x1``. + Will be an array with shape ``(D, n1, n2)``, where ``D`` is the length + of ``params``, ``n1`` is the length of the first axis + of ``x1`` and ``n2`` is the length of the first axis of ``x2``. The first + axis indicates the different derivative components (i.e. the derivative + with respect to the first input parameter is [0,:,:], etc.) + :rtype: ndarray """ + + x1, n1, x2, n2, params, D = self._check_inputs(x1, x2, params) + + drdx = np.zeros((D - 1, n1, n2)) + + exp_theta = np.exp(params[:(D - 1)]) + + r_matrix = self.calc_r(x1, x2, params) + r_matrix[(r_matrix == 0.)] = 1. + + for d in range(D - 1): + drdx[d] = exp_theta[d]*(x1[:, d].flatten()[ :, None ] - + x2[:, d].flatten()[ None, : ])/r_matrix + + return drdx + + def kernel_f(self, x1, x2, params): + r""" Compute kernel values for a set of inputs - + Returns the value of the kernel for two sets of input points and a choice of hyperparameters. This function should not need to be modified for different choices of the kernel function or distance metric, as after checking the inputs it simply calls the routine to compute the distance metric and then evaluates the kernel function for those distances. - + :param x1: First input array. Must be a 1-D or 2-D array, with the length of the last dimension matching the last dimension of ``x2`` and one less than the length of ``params``. ``x1`` may be 1-D if either @@ -287,21 +342,21 @@ def kernel_f(self, x1, x2, params): of the first axis of ``x2``. :rtype: ndarray """ - + x1, n1, x2, n2, params, D = self._check_inputs(x1, x2, params) - + return np.exp(params[D - 1]) * self.calc_K(self.calc_r(x1, x2, params)) - + def kernel_deriv(self, x1, x2, params): - """ + r""" Compute kernel gradient for a set of inputs - + Returns the value of the kernel gradient for two sets of input points and a choice of hyperparameters. This function should not need to be modified for different choices of the kernel function or distance metric, as after checking the inputs it simply calls the routine to compute the distance metric, kernel function, and the appropriate derivative functions of the distance and kernel functions. - + :param x1: First input array. Must be a 1-D or 2-D array, with the length of the last dimension matching the last dimension of ``x2`` and one less than the length of ``params``. ``x1`` may be 1-D if either @@ -323,33 +378,33 @@ def kernel_deriv(self, x1, x2, params): (i.e. the derivative with respect to the first parameter is [0,:,:], etc.) 
:rtype: ndarray """ - + x1, n1, x2, n2, params, D = self._check_inputs(x1, x2, params) - + dKdtheta = np.zeros((D, n1, n2)) - + dKdtheta[-1] = self.kernel_f(x1, x2, params) - + dKdr = self.calc_dKdr(self.calc_r(x1, x2, params)) - + drdtheta = self.calc_drdtheta(x1, x2, params) - + for d in range(D - 1): dKdtheta[d] = np.exp(params[-1]) * dKdr * drdtheta[d] - + return dKdtheta - + def kernel_hessian(self, x1, x2, params): - """ + r""" Calculate the Hessian of the kernel evaluated for all pairs of points with respect to the hyperparameters - + Returns the value of the kernel Hessian for two sets of input points and a choice of hyperparameters. This function should not need to be modified for different choices of the kernel function or distance metric, as after checking the inputs it simply calls the routine to compute the distance metric, kernel function, and the appropriate derivative functions of the distance and kernel functions. - + :param x1: First input array. Must be a 1-D or 2-D array, with the length of the last dimension matching the last dimension of ``x2`` and one less than the length of ``params``. ``x1`` may be 1-D if either @@ -373,248 +428,297 @@ def kernel_hessian(self, x1, x2, params): or [1,0,:,:], etc.) :rtype: ndarray """ - + x1, n1, x2, n2, params, D = self._check_inputs(x1, x2, params) - + d2Kdtheta2 = np.zeros((D, D, n1, n2)) - + d2Kdtheta2[-1, :] = self.kernel_deriv(x1, x2, params) d2Kdtheta2[:, -1] = d2Kdtheta2[-1, :] - + r_matrix = self.calc_r(x1, x2, params) dKdr = self.calc_dKdr(r_matrix) d2Kdr2 = self.calc_d2Kdr2(r_matrix) - + drdtheta = self.calc_drdtheta(x1, x2, params) d2rdtheta2 = self.calc_d2rdtheta2(x1, x2, params) - + for d1 in range(D - 1): for d2 in range(D - 1): - d2Kdtheta2[d1, d2] = np.exp(params[-1]) * (d2Kdr2 * drdtheta[d1] * drdtheta[d2] + dKdr * d2rdtheta2[d1, d2]) - + d2Kdtheta2[d1, d2] = np.exp(params[-1]) * (d2Kdr2 * + drdtheta[d1] * drdtheta[d2] + + dKdr * d2rdtheta2[d1, d2]) + return d2Kdtheta2 - def calc_K(self, r): + def kernel_inputderiv(self, x1, x2, params): + r""" + Compute derivative of Kernel with respect to inputs x1 + + Returns the value of the kernel derivative with respect to the first set of input + points given inputs and a choice of hyperparameters. This function should not need + to be modified for different choices of the kernel function or distance metric, as + after checking the inputs it simply calls the routine to compute the distance metric, + kernel function, and the appropriate derivative functions of the distance and kernel + functions. + + :param x1: First input array. Must be a 1-D or 2-D array, with the length of + the last dimension matching the last dimension of ``x2`` and + one less than the length of ``params``. ``x1`` may be 1-D if either + each point consists of a single parameter (and ``params`` has length + 2) or the array only contains a single point (in which case, the array + will be reshaped to ``(1, D - 1)``). + :type x1: array-like + :param x2: Second input array. The same restrictions that apply to ``x1`` also + apply here. + :type x2: array-like + :param params: Hyperparameter array. Must be 1-D with length one greater than + the last dimension of ``x1`` and ``x2``. + :type params: array-like + :returns: Array holding the derivative of the kernel function between points in arrays + ``x1`` and ``x2`` with respect to the first inputs ``x1``. 
Will be an array with + shape ``(D, n1, n2)``, where ``D`` is the length of ``params``, + ``n1`` is the length of the first axis of ``x1`` and ``n2`` is the length of the + first axis of ``x2``. The first axis indicates the different derivative components + (i.e. the derivative with respect to the first input dimension is [0,:,:], etc.) + :rtype: ndarray """ + + x1, n1, x2, n2, params, D = self._check_inputs(x1, x2, params) + + dKdx = np.zeros((D - 1, n1, n2)) + + r_matrix = self.calc_r(x1, x2, params) + dKdr = self.calc_dKdr(r_matrix) + + drdx = self.calc_drdx(x1, x2, params) + + for d in range(D - 1): + dKdx[d] = np.exp(params[-1]) * dKdr * drdx[d] + + return dKdx + + def calc_K(self, r): + r""" Calculate kernel as a function of distance - + This method implements the kernel function as a function of distance. Given an array of distances, this function evaluates the kernel function of those values, returning an array of the same shape. Note that this is not implemented for the base class, as this must be defined for a specific kernel. - + :param r: Array holding distances between all points. All values in this array must be non-negative. :type r: array-like :returns: Array holding kernel evaluations, with the same shape as the input ``r`` :rtype: ndarray """ - + raise NotImplementedError("base Kernel class does not implement a kernel function") - + def calc_dKdr(self, r): - """ + r""" Calculate first derivative of kernel as a function of distance - + This method implements the first derivative of the kernel function as a function of distance. Given an array of distances, this function evaluates the derivative function of those values, returning an array of the same shape. Note that this is not implemented for the base class, as this must be defined for a specific kernel. - + :param r: Array holding distances between all points. All values in this array must be non-negative. :type r: array-like :returns: Array holding kernel derivatives, with the same shape as the input ``r`` :rtype: ndarray """ - + raise NotImplementedError("base Kernel class does not implement a kernel derivative function") - + def calc_d2Kdr2(self, r): - """ + r""" Calculate second derivative of kernel as a function of distance - + This method implements the second derivative of the kernel function as a function of distance. Given an array of distances, this function evaluates the second derivative function of those values, returning an array of the same shape. Note that this is not implemented for the base class, as this must be defined for a specific kernel. - + :param r: Array holding distances between all points. All values in this array must be non-negative. :type r: array-like :returns: Array holding kernel second derivatives, with the same shape as the input ``r`` :rtype: ndarray """ - + raise NotImplementedError("base Kernel class does not implement kernel derivatives") class SquaredExponential(Kernel): - """ + r""" Implementation of the squared exponential kernel - + Class representing a squared exponential kernel. It derives from the base class for a stationary kernel, using the scaled Euclidean distance metric. The subclass then just defines the kernel function and its derivatives. """ def calc_K(self, r): - """ + r""" Compute K(r) for the squared exponential kernel - + This method implements the squared exponential kernel function as a function of distance. Given an array of distances, this function evaluates the kernel function of those values, returning an array of the same shape. 
- + :param r: Array holding distances between all points. All values in this array must be non-negative. :type r: array-like :returns: Array holding kernel evaluations, with the same shape as the input ``r`` :rtype: ndarray """ - + assert np.all(r >= 0.), "kernel distances must be positive" - + r = np.array(r) - + return np.exp(-0.5*r**2) - + def calc_dKdr(self, r): - """ + r""" Calculate first derivative of the squared exponential kernel as a function of distance - + This method implements the first derivative of the squared exponential kernel function as a function of distance. Given an array of distances, this function evaluates the derivative function of those values, returning an array of the same shape. - + :param r: Array holding distances between all points. All values in this array must be non-negative. :type r: array-like :returns: Array holding kernel derivatives, with the same shape as the input ``r`` :rtype: ndarray """ - + assert np.all(r >= 0.), "kernel distances must be positive" - + r = np.array(r) - + return -r*np.exp(-0.5*r**2) - + def calc_d2Kdr2(self, r): - """ + r""" Calculate second derivative of the squared exponential kernel as a function of distance - + This method implements the second derivative of the squared exponential kernel function as a function of distance. Given an array of distances, this function evaluates the second derivative function of those values, returning an array of the same shape. - + :param r: Array holding distances between all points. All values in this array must be non-negative. :type r: array-like :returns: Array holding kernel second derivatives, with the same shape as the input ``r`` :rtype: ndarray """ - + assert np.all(r >= 0.), "kernel distances must be positive" - + r = np.array(r) - + return (r**2 - 1.)*np.exp(-0.5*r**2) - + def __str__(self): - """ + r""" Defines a string representation of the squared exponential kernel - + Returns a string representation of the squared exponential kernel. Note that since the kernel just collects methods for kernel evaluations together with no data, this is just a basic string that will not change for different instances of the class. - + :returns: String representation of the kernel :rtype: str """ return "Squared Exponential Kernel" class Matern52(Kernel): - """ + r""" Implementation of the Matern 5/2 kernel - + Class representing the Matern 5/2 kernel. It derives from the base class for a stationary kernel, using the scaled Euclidean distance metric. The subclass then just defines the kernel function and its derivatives. """ def calc_K(self, r): - """ + r""" Compute K(r) for the Matern 5/2 kernel - + This method implements the Matern 5/2 kernel function as a function of distance. Given an array of distances, this function evaluates the kernel function of those values, returning an array of the same shape. - + :param r: Array holding distances between all points. All values in this array must be non-negative. :type r: array-like :returns: Array holding kernel evaluations, with the same shape as the input ``r`` :rtype: ndarray """ - + assert np.all(r >= 0.), "kernel distances must be positive" - + r = np.array(r) - + return (1.+np.sqrt(5.)*r+5./3.*r**2)*np.exp(-np.sqrt(5.)*r) - + def calc_dKdr(self, r): - """ + r""" Calculate first derivative of the Matern 5/2 kernel as a function of distance - + This method implements the first derivative of the Matern 5/2 kernel function as a function of distance. 
Given an array of distances, this function evaluates the derivative function of those values, returning an array of the same shape. - + :param r: Array holding distances between all points. All values in this array must be non-negative. :type r: array-like :returns: Array holding kernel derivatives, with the same shape as the input ``r`` :rtype: ndarray """ - + assert np.all(r >= 0.), "kernel distances must be positive" - + r = np.array(r) - + return -5./3.*r*(1.+np.sqrt(5.)*r)*np.exp(-np.sqrt(5.)*r) - + def calc_d2Kdr2(self, r): - """ + r""" Calculate second derivative of the squared exponential kernel as a function of distance - + This method implements the second derivative of the squared exponential kernel function as a function of distance. Given an array of distances, this function evaluates the second derivative function of those values, returning an array of the same shape. - + :param r: Array holding distances between all points. All values in this array must be non-negative. :type r: array-like :returns: Array holding kernel second derivatives, with the same shape as the input ``r`` :rtype: ndarray """ - + assert np.all(r >= 0.), "kernel distances must be positive" - + r = np.array(r) - + return 5./3.*(5.*r**2-np.sqrt(5.)*r-1.)*np.exp(-np.sqrt(5.)*r) - + def __str__(self): - """ + r""" Defines a string representation of the Matern 5/2 kernel - + Returns a string representation of the Matern 5/2 kernel. Note that since the kernel just collects methods for kernel evaluations together with no data, this is just a basic string that will not change for different instances of the class. - + :returns: String representation of the kernel :rtype: str """ return "Matern 5/2 Kernel" - + diff --git a/mogp_emulator/tests/test_Kernel.py b/mogp_emulator/tests/test_Kernel.py index 9ed9dcb4..ca10ed8d 100644 --- a/mogp_emulator/tests/test_Kernel.py +++ b/mogp_emulator/tests/test_Kernel.py @@ -5,77 +5,79 @@ def test_calc_r(): "test function for calc_r function for kernels" - + k = Kernel() - + x = np.array([[1.], [2.]]) y = np.array([[2.], [3.]]) params = np.array([0., 0.]) - + assert_allclose(k.calc_r(x, y, params), np.array([[1., 2.], [0., 1.]])) - + x = np.array([[1., 2.], [2., 3.]]) y = np.array([[2., 4.], [3., 1.]]) params = np.array([0., 0., 0.]) - - assert_allclose(k.calc_r(x, y, params), np.array([[np.sqrt(5.), np.sqrt(5.)], [1., np.sqrt(5.)]])) - + + assert_allclose(k.calc_r(x, y, params), + np.array([[np.sqrt(5.), np.sqrt(5.)], [1., np.sqrt(5.)]])) + x = np.array([[1., 2.], [2., 3.]]) y = np.array([[2., 4.], [3., 1.]]) params = np.array([np.log(2.), np.log(4.), 0.]) - + assert_allclose(k.calc_r(x, y, params), - np.array([[np.sqrt(1.*2.+4.*4.), np.sqrt(4.*2.+1.*4.)], [np.sqrt(1.*4.), np.sqrt(1.*2.+4.*4.)]])) - + np.array([[np.sqrt(1.*2.+4.*4.), np.sqrt(4.*2.+1.*4.)], + [np.sqrt(1.*4.), np.sqrt(1.*2.+4.*4.)]])) + x = np.array([1., 2.]) y = np.array([2., 3.]) params = np.array([0., 0.]) - + assert_allclose(k.calc_r(x, y, params), np.array([[1., 2.], [0., 1.]])) - - + + def test_calc_r_failures(): "test scenarios where calc_r should raise an exception" - + k = Kernel() - + x = np.array([[1.], [2.]]) y = np.array([[2.], [3.]]) params = np.array([0.]) - + with pytest.raises(AssertionError): k.calc_r(x, y, params) - + params = np.array([[0., 0.], [0., 0.]]) - + with pytest.raises(AssertionError): k.calc_r(x, y, params) - + x = np.array([[1.], [2.]]) y = np.array([[2., 4.], [3., 2.]]) params = np.array([0., 0.]) - + with pytest.raises(AssertionError): k.calc_r(x, y, params) - + x = 
np.array([[1.], [2.]]) y = np.array([[[2.], [4.]], [[3.], [2.]]]) params = np.array([0., 0.]) - + with pytest.raises(AssertionError): k.calc_r(x, y, params) - + x = np.array([[2., 4.], [3., 2.]]) y = np.array([[1.], [2.]]) params = np.array([0., 0.]) - + with pytest.raises(AssertionError): k.calc_r(x, y, params) - + x = np.array([[[2.], [4.]], [[3.], [2.]]]) y = np.array([[1.], [2.]]) params = np.array([0., 0.]) - + with pytest.raises(AssertionError): k.calc_r(x, y, params) @@ -89,136 +91,145 @@ def test_calc_drdtheta(): x = np.array([[1.], [2.]]) y = np.array([[2.], [3.]]) params = np.array([0., 0.]) - + r = np.array([[1., 2.], [1., 1.]]) - + deriv = np.zeros((1, 2, 2)) deriv[0] = 0.5*np.array([[1., 4.], [0., 1.]])/r deriv_fd = np.zeros((1, 2, 2)) - deriv_fd[0] = (k.calc_r(x, y, params) - k.calc_r(x, y, params - np.array([dx, 0.])))/dx - + deriv_fd[0] = (k.calc_r(x, y, params) - + k.calc_r(x, y, params - np.array([dx, 0.])))/dx + assert_allclose(k.calc_drdtheta(x, y, params), deriv) assert_allclose(k.calc_drdtheta(x, y, params), deriv_fd, rtol = 1.e-5) - + x = np.array([[1., 2.], [2., 3.]]) y = np.array([[2., 4.], [3., 1.]]) params = np.array([0., 0., 0.]) - + r = np.array([[np.sqrt(5.), np.sqrt(5.)], [1., np.sqrt(5.)]]) - + deriv = np.zeros((2, 2, 2)) deriv[0] = 0.5*np.array([[1., 4.], [0., 1.]])/r deriv[1] = 0.5*np.array([[4., 1.], [1., 4.]])/r deriv_fd = np.zeros((2, 2, 2)) - deriv_fd[0] = (k.calc_r(x, y, params) - k.calc_r(x, y, params - np.array([dx, 0., 0.])))/dx - deriv_fd[1] = (k.calc_r(x, y, params) - k.calc_r(x, y, params - np.array([0., dx, 0.])))/dx - + deriv_fd[0] = (k.calc_r(x, y, params) - + k.calc_r(x, y, params - np.array([dx, 0., 0.])))/dx + deriv_fd[1] = (k.calc_r(x, y, params) - + k.calc_r(x, y, params - np.array([0., dx, 0.])))/dx + assert_allclose(k.calc_drdtheta(x, y, params), deriv) assert_allclose(k.calc_drdtheta(x, y, params), deriv_fd, rtol = 1.e-5) - + deriv = np.zeros((2, 2, 2)) x = np.array([[1., 2.], [2., 3.]]) y = np.array([[2., 4.], [3., 1.]]) params = np.array([np.log(2.), np.log(4.), 0.]) - - r = np.array([[np.sqrt(1.*2.+4.*4.), np.sqrt(4.*2.+1.*4.)], [np.sqrt(1.*4.), np.sqrt(1.*2.+4.*4.)]]) - + + r = np.array([[np.sqrt(1.*2.+4.*4.), np.sqrt(4.*2.+1.*4.)], + [np.sqrt(1.*4.), np.sqrt(1.*2.+4.*4.)]]) + deriv = np.zeros((2, 2, 2)) deriv[0] = 0.5*2.*np.array([[1., 4.], [0., 1.]])/r deriv[1] = 0.5*4.*np.array([[4., 1.], [1., 4.]])/r deriv_fd = np.zeros((2, 2, 2)) - deriv_fd[0] = (k.calc_r(x, y, params) - k.calc_r(x, y, params - np.array([dx, 0., 0.])))/dx - deriv_fd[1] = (k.calc_r(x, y, params) - k.calc_r(x, y, params - np.array([0., dx, 0.])))/dx - + deriv_fd[0] = (k.calc_r(x, y, params) - + k.calc_r(x, y, params - np.array([dx, 0., 0.])))/dx + deriv_fd[1] = (k.calc_r(x, y, params) - + k.calc_r(x, y, params - np.array([0., dx, 0.])))/dx + assert_allclose(k.calc_drdtheta(x, y, params), deriv) assert_allclose(k.calc_drdtheta(x, y, params), deriv_fd, rtol = 1.e-5) - + x = np.array([1., 2.]) y = np.array([2., 3.]) params = np.array([0., 0.]) - + r = np.array([[1., 2.], [1., 1.]]) - + deriv = np.zeros((1, 2, 2)) deriv[0] = 0.5*np.array([[1., 4.], [0., 1.]])/r deriv_fd = np.zeros((1, 2, 2)) - deriv_fd[0] = (k.calc_r(x, y, params) - k.calc_r(x, y, params - np.array([dx, 0.])))/dx - + deriv_fd[0] = (k.calc_r(x, y, params) - + k.calc_r(x, y, params - np.array([dx, 0.])))/dx + assert_allclose(k.calc_drdtheta(x, y, params), deriv) assert_allclose(k.calc_drdtheta(x, y, params), deriv_fd, rtol = 1.e-5) - + def test_calc_drdtheta_failures(): "test situations where 
calc_drdtheta should fail" - + k = Kernel() - + x = np.array([[1.], [2.]]) y = np.array([[2.], [3.]]) params = np.array([0.]) - + with pytest.raises(AssertionError): k.calc_drdtheta(x, y, params) - + params = np.array([[0., 0.], [0., 0.]]) - + with pytest.raises(AssertionError): k.calc_drdtheta(x, y, params) - + x = np.array([[1.], [2.]]) y = np.array([[2., 4.], [3., 2.]]) params = np.array([0., 0.]) - + with pytest.raises(AssertionError): k.calc_drdtheta(x, y, params) - + x = np.array([[1.], [2.]]) y = np.array([[[2.], [4.]], [[3.], [2.]]]) params = np.array([0., 0.]) - + with pytest.raises(AssertionError): k.calc_drdtheta(x, y, params) - + x = np.array([[2., 4.], [3., 2.]]) y = np.array([[1.], [2.]]) params = np.array([0., 0.]) - + with pytest.raises(AssertionError): k.calc_drdtheta(x, y, params) - + x = np.array([[[2.], [4.]], [[3.], [2.]]]) y = np.array([[1.], [2.]]) params = np.array([0., 0.]) - + with pytest.raises(AssertionError): k.calc_drdtheta(x, y, params) - + def test_calc_d2rdtheta2(): "test calc_d2rdtheta2 function" - + k = Kernel() - + dx = 1.e-6 - + x = np.array([[1.], [2.]]) y = np.array([[2.], [3.]]) params = np.array([0., 0.]) - + r = np.array([[1., 2.], [1., 1.]]) - + deriv = np.zeros((1, 1, 2, 2)) - deriv[0, 0] = 0.5*np.array([[1., 4.], [0., 1.]])/r - 0.25*np.array([[1., 4.], [0., 1.]])**2/r**3 + deriv[0, 0] = (0.5*np.array([[1., 4.], [0., 1.]])/r - + 0.25*np.array([[1., 4.], [0., 1.]])**2/r**3) deriv_fd = np.zeros((1, 1, 2, 2)) - deriv_fd[0, 0] = (k.calc_drdtheta(x, y, params)[0] - k.calc_drdtheta(x, y, params - np.array([dx, 0.]))[0])/dx - + deriv_fd[0, 0] = (k.calc_drdtheta(x, y, params)[0] - + k.calc_drdtheta(x, y, params - np.array([dx, 0.]))[0])/dx + assert_allclose(k.calc_d2rdtheta2(x, y, params), deriv) assert_allclose(k.calc_d2rdtheta2(x, y, params), deriv_fd, rtol = 1.e-5) - + x = np.array([[1., 2.], [2., 3.]]) y = np.array([[2., 4.], [3., 1.]]) params = np.array([0., 0., 0.]) - + r = np.array([[np.sqrt(5.), np.sqrt(5.)], [1., np.sqrt(5.)]]) - + deriv = np.zeros((2, 2, 2, 2)) x12 = np.array([[1., 4.], [0., 1.]]) x22 = np.array([[4., 1.], [1., 4.]]) @@ -227,20 +238,25 @@ def test_calc_d2rdtheta2(): deriv[1, 0] = -0.25*x22*x12/r**3 deriv[1, 1] = 0.5*x22/r-0.25*x22*x22/r**3 deriv_fd = np.zeros((2, 2, 2, 2)) - deriv_fd[0, 0] = (k.calc_drdtheta(x, y, params)[0] - k.calc_drdtheta(x, y, params - np.array([dx, 0., 0.]))[0])/dx - deriv_fd[0, 1] = (k.calc_drdtheta(x, y, params)[1] - k.calc_drdtheta(x, y, params - np.array([dx, 0., 0.]))[1])/dx - deriv_fd[1, 0] = (k.calc_drdtheta(x, y, params)[0] - k.calc_drdtheta(x, y, params - np.array([0., dx, 0.]))[0])/dx - deriv_fd[1, 1] = (k.calc_drdtheta(x, y, params)[1] - k.calc_drdtheta(x, y, params - np.array([0., dx, 0.]))[1])/dx - + deriv_fd[0, 0] = (k.calc_drdtheta(x, y, params)[0] - + k.calc_drdtheta(x, y, params - np.array([dx, 0., 0.]))[0])/dx + deriv_fd[0, 1] = (k.calc_drdtheta(x, y, params)[1] - + k.calc_drdtheta(x, y, params - np.array([dx, 0., 0.]))[1])/dx + deriv_fd[1, 0] = (k.calc_drdtheta(x, y, params)[0] - + k.calc_drdtheta(x, y, params - np.array([0., dx, 0.]))[0])/dx + deriv_fd[1, 1] = (k.calc_drdtheta(x, y, params)[1] - + k.calc_drdtheta(x, y, params - np.array([0., dx, 0.]))[1])/dx + assert_allclose(k.calc_d2rdtheta2(x, y, params), deriv) assert_allclose(k.calc_d2rdtheta2(x, y, params), deriv_fd, rtol = 1.e-5) - + x = np.array([[1., 2.], [2., 3.]]) y = np.array([[2., 4.], [3., 1.]]) params = np.array([np.log(2.), np.log(4.), 0.]) - - r = np.array([[np.sqrt(1.*2.+4.*4.), np.sqrt(4.*2.+1.*4.)], [np.sqrt(1.*4.), 
np.sqrt(1.*2.+4.*4.)]]) - + + r = np.array([[np.sqrt(1.*2.+4.*4.), np.sqrt(4.*2.+1.*4.)], + [np.sqrt(1.*4.), np.sqrt(1.*2.+4.*4.)]]) + deriv = np.zeros((2, 2, 2, 2)) x12 = np.array([[1., 4.], [0., 1.]]) x22 = np.array([[4., 1.], [1., 4.]]) @@ -249,92 +265,219 @@ def test_calc_d2rdtheta2(): deriv[1, 0] = -4.*2.*0.25*x22*x12/r**3 deriv[1, 1] = 4.*0.5*x22/r-4.*4.*0.25*x22*x22/r**3 deriv_fd = np.zeros((2, 2, 2, 2)) - deriv_fd[0, 0] = (k.calc_drdtheta(x, y, params)[0] - k.calc_drdtheta(x, y, params - np.array([dx, 0., 0.]))[0])/dx - deriv_fd[0, 1] = (k.calc_drdtheta(x, y, params)[1] - k.calc_drdtheta(x, y, params - np.array([dx, 0., 0.]))[1])/dx - deriv_fd[1, 0] = (k.calc_drdtheta(x, y, params)[0] - k.calc_drdtheta(x, y, params - np.array([0., dx, 0.]))[0])/dx - deriv_fd[1, 1] = (k.calc_drdtheta(x, y, params)[1] - k.calc_drdtheta(x, y, params - np.array([0., dx, 0.]))[1])/dx - + deriv_fd[0, 0] = (k.calc_drdtheta(x, y, params)[0] - + k.calc_drdtheta(x, y, params - np.array([dx, 0., 0.]))[0])/dx + deriv_fd[0, 1] = (k.calc_drdtheta(x, y, params)[1] - + k.calc_drdtheta(x, y, params - np.array([dx, 0., 0.]))[1])/dx + deriv_fd[1, 0] = (k.calc_drdtheta(x, y, params)[0] - + k.calc_drdtheta(x, y, params - np.array([0., dx, 0.]))[0])/dx + deriv_fd[1, 1] = (k.calc_drdtheta(x, y, params)[1] - + k.calc_drdtheta(x, y, params - np.array([0., dx, 0.]))[1])/dx + assert_allclose(k.calc_d2rdtheta2(x, y, params), deriv) assert_allclose(k.calc_d2rdtheta2(x, y, params), deriv_fd, rtol = 1.e-5) - + x = np.array([1., 2.]) y = np.array([2., 3.]) params = np.array([0., 0.]) - + r = np.array([[1., 2.], [1., 1.]]) - + deriv = np.zeros((1, 1, 2, 2)) - deriv[0, 0] = 0.5*np.array([[1., 4.], [0., 1.]])/r - 0.25*np.array([[1., 4.], [0., 1.]])**2/r**3 + deriv[0, 0] = (0.5*np.array([[1., 4.], [0., 1.]])/r - + 0.25*np.array([[1., 4.], [0., 1.]])**2/r**3) deriv_fd = np.zeros((1, 1, 2, 2)) - deriv_fd[0, 0] = (k.calc_drdtheta(x, y, params)[0] - k.calc_drdtheta(x, y, params - np.array([dx, 0.]))[0])/dx - + deriv_fd[0, 0] = (k.calc_drdtheta(x, y, params)[0] - + k.calc_drdtheta(x, y, params - np.array([dx, 0.]))[0])/dx + assert_allclose(k.calc_d2rdtheta2(x, y, params), deriv) assert_allclose(k.calc_d2rdtheta2(x, y, params), deriv_fd, rtol = 1.e-5) - + def test_calc_d2rdtheta2_failures(): "test situations where calc_d2rdtheta2 should fail" - + k = Kernel() - + x = np.array([[1.], [2.]]) y = np.array([[2.], [3.]]) params = np.array([0.]) - + with pytest.raises(AssertionError): k.calc_d2rdtheta2(x, y, params) - + params = np.array([[0., 0.], [0., 0.]]) - + with pytest.raises(AssertionError): k.calc_d2rdtheta2(x, y, params) - + x = np.array([[1.], [2.]]) y = np.array([[2., 4.], [3., 2.]]) params = np.array([0., 0.]) - + with pytest.raises(AssertionError): k.calc_d2rdtheta2(x, y, params) - + x = np.array([[1.], [2.]]) y = np.array([[[2.], [4.]], [[3.], [2.]]]) params = np.array([0., 0.]) - + with pytest.raises(AssertionError): k.calc_d2rdtheta2(x, y, params) - + x = np.array([[2., 4.], [3., 2.]]) y = np.array([[1.], [2.]]) params = np.array([0., 0.]) - + with pytest.raises(AssertionError): k.calc_d2rdtheta2(x, y, params) - + x = np.array([[[2.], [4.]], [[3.], [2.]]]) y = np.array([[1.], [2.]]) params = np.array([0., 0.]) - + with pytest.raises(AssertionError): k.calc_d2rdtheta2(x, y, params) +def test_kernel_calc_drdx(): + "test the calc_drdx method of the kernel class" + + k = Kernel() + + dx = 1.e-6 + + x = np.array([[1.], [2.]]) + y = np.array([[2.], [3.]]) + params = np.array([0., 0.]) + + r = np.array([[1., 2.], [1., 1.]]) + + deriv = 
np.zeros((1, 2, 2)) + deriv[0] = -np.array([[1., 2.], [0., 1.]])/r + deriv_fd = np.zeros((1, 2, 2)) + # need to use central differences here as derivative is discontiuous at zero + deriv_fd[0] = (k.calc_r(x + dx, y, params) - k.calc_r(x - dx, y, params))/dx/2. + + assert_allclose(k.calc_drdx(x, y, params), deriv) + assert_allclose(k.calc_drdx(x, y, params), deriv_fd, rtol = 1.e-5, atol = 1.e-8) + + x = np.array([[1., 2.], [2., 3.]]) + y = np.array([[2., 4.], [3., 1.]]) + params = np.array([0., 0., 0.]) + + r = np.array([[np.sqrt(5.), np.sqrt(5.)], [1., np.sqrt(5.)]]) + + deriv = np.zeros((2, 2, 2)) + deriv[0] = -np.array([[1., 2.], [0., 1.]])/r + deriv[1] = np.array([[-2., 1.], [-1., 2.]])/r + deriv_fd = np.zeros((2, 2, 2)) + deriv_fd[0] = (k.calc_r(x + np.array([[dx, 0.], [dx, 0.]]), y, params) - + k.calc_r(x - np.array([[dx, 0.], [dx, 0.]]), y, params))/dx/2. + deriv_fd[1] = (k.calc_r(x + np.array([[0., dx], [0., dx]]), y, params) - + k.calc_r(x - np.array([[0., dx], [0., dx]]), y, params))/dx/2. + + assert_allclose(k.calc_drdx(x, y, params), deriv) + assert_allclose(k.calc_drdx(x, y, params), deriv_fd, rtol = 1.e-5, atol = 1.e-7) + + deriv = np.zeros((2, 2, 2)) + x = np.array([[1., 2.], [2., 3.]]) + y = np.array([[2., 4.], [3., 1.]]) + params = np.array([np.log(2.), np.log(4.), 0.]) + + r = np.array([[np.sqrt(1.*2.+4.*4.), np.sqrt(4.*2.+1.*4.)], + [np.sqrt(1.*4.), np.sqrt(1.*2.+4.*4.)]]) + + deriv = np.zeros((2, 2, 2)) + deriv[0] = 2.*np.array([[-1., -2.], [0., -1.]])/r + deriv[1] = 4.*np.array([[-2., 1.], [-1., 2.]])/r + deriv_fd = np.zeros((2, 2, 2)) + deriv_fd[0] = (k.calc_r(x + np.array([[dx, 0.], [dx, 0.]]), y, params) - + k.calc_r(x - np.array([[dx, 0.], [dx, 0.]]), y, params))/dx/2. + deriv_fd[1] = (k.calc_r(x + np.array([[0., dx], [0., dx]]), y, params) - + k.calc_r(x - np.array([[0., dx], [0., dx]]), y, params))/dx/2. + + assert_allclose(k.calc_drdx(x, y, params), deriv) + assert_allclose(k.calc_drdx(x, y, params), deriv_fd, rtol = 1.e-5, atol = 1.e-7) + + x = np.array([1., 2.]) + y = np.array([2., 3.]) + params = np.array([0., 0.]) + + r = np.array([[1., 2.], [1., 1.]]) + + deriv = np.zeros((1, 2, 2)) + deriv[0] = -np.array([[1., 2.], [0., 1.]])/r + deriv_fd = np.zeros((1, 2, 2)) + # need to use central differences here as derivative is discontiuous at zero + deriv_fd[0] = (k.calc_r(x + dx, y, params) - k.calc_r(x - dx, y, params))/dx/2. 
+ + assert_allclose(k.calc_drdx(x, y, params), deriv) + assert_allclose(k.calc_drdx(x, y, params), deriv_fd, rtol = 1.e-5, atol = 1.e-8) + +def test_kernel_calc_drdx_failures(): + "test situations where calc_drdx should fail" + + k = Kernel() + + x = np.array([[1.], [2.]]) + y = np.array([[2.], [3.]]) + params = np.array([0.]) + + with pytest.raises(AssertionError): + k.calc_drdx(x, y, params) + + params = np.array([[0., 0.], [0., 0.]]) + + with pytest.raises(AssertionError): + k.calc_drdx(x, y, params) + + x = np.array([[1.], [2.]]) + y = np.array([[2., 4.], [3., 2.]]) + params = np.array([0., 0.]) + + with pytest.raises(AssertionError): + k.calc_drdx(x, y, params) + + x = np.array([[1.], [2.]]) + y = np.array([[[2.], [4.]], [[3.], [2.]]]) + params = np.array([0., 0.]) + + with pytest.raises(AssertionError): + k.calc_drdx(x, y, params) + + x = np.array([[2., 4.], [3., 2.]]) + y = np.array([[1.], [2.]]) + params = np.array([0., 0.]) + + with pytest.raises(AssertionError): + k.calc_drdx(x, y, params) + + x = np.array([[[2.], [4.]], [[3.], [2.]]]) + y = np.array([[1.], [2.]]) + params = np.array([0., 0.]) + + with pytest.raises(AssertionError): + k.calc_drdx(x, y, params) + def test_squared_exponential_K(): "test squared exponential K(r) function" - + k = SquaredExponential() - + assert_allclose(k.calc_K(1.), np.exp(-0.5)) - assert_allclose(k.calc_K(np.array([[1., 2.], [3., 4.]])), np.exp(-0.5*np.array([[1., 4.], [9., 16.]]))) + assert_allclose(k.calc_K(np.array([[1., 2.], [3., 4.]])), + np.exp(-0.5*np.array([[1., 4.], [9., 16.]]))) with pytest.raises(AssertionError): k.calc_K(-1.) - + def test_squared_exponential_dKdr(): "test squared exponential dK/dr function" - + k = SquaredExponential() - + dx = 1.e-6 - + assert_allclose(k.calc_dKdr(1.), -np.exp(-0.5)) assert_allclose(k.calc_dKdr(1.), (k.calc_K(1.)-k.calc_K(1.-dx))/dx, rtol = 1.e-5) @@ -345,69 +488,76 @@ def test_squared_exponential_dKdr(): with pytest.raises(AssertionError): k.calc_dKdr(-1.) - + def test_squared_exponential_d2Kdr2(): "test squared exponential d2K/dr2 function" - + k = SquaredExponential() - + dx = 1.e-6 - + assert_allclose(k.calc_d2Kdr2(1.), 0.) - assert_allclose(k.calc_d2Kdr2(1.), (k.calc_dKdr(1.)-k.calc_dKdr(1.-dx))/dx, atol = 1.e-5) + assert_allclose(k.calc_d2Kdr2(1.), + (k.calc_dKdr(1.)-k.calc_dKdr(1.-dx))/dx, atol = 1.e-5) r = np.array([[1., 2.], [3., 4.]]) assert_allclose(k.calc_d2Kdr2(r), (r**2 - 1.)*np.exp(-0.5*r**2)) - assert_allclose(k.calc_d2Kdr2(r), (k.calc_dKdr(r)-k.calc_dKdr(r-dx))/dx, rtol = 1.e-5, atol = 1.e-5) + assert_allclose(k.calc_d2Kdr2(r), + (k.calc_dKdr(r)-k.calc_dKdr(r-dx))/dx, rtol = 1.e-5, atol = 1.e-5) with pytest.raises(AssertionError): k.calc_d2Kdr2(-1.) 
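The ``calc_drdx`` checks above, and the ``kernel_inputderiv`` tests further below, all follow the same convention of comparing the analytic derivative against a central finite difference of the kernel. As a minimal usage sketch (illustrative only, not part of the patch; the import path ``mogp_emulator.Kernel`` is assumed from the file being modified), the new input derivative of a kernel can be cross-checked the same way:

import numpy as np
from mogp_emulator.Kernel import SquaredExponential

k = SquaredExponential()
x = np.array([[1., 2.], [2., 3.]])
y = np.array([[2., 4.], [3., 1.]])
params = np.array([0., 0., 0.])   # last entry is the log covariance scale
dx = 1.e-6

# analytic derivative with respect to the inputs x; slice 0 is the first input dimension
dKdx = k.kernel_inputderiv(x, y, params)

# central finite difference of kernel_f in that same dimension
shift = np.zeros_like(x)
shift[:, 0] = dx
fd = (k.kernel_f(x + shift, y, params) - k.kernel_f(x - shift, y, params))/(2.*dx)

assert np.allclose(dKdx[0], fd, rtol=1.e-5)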
def test_squared_exponential(): "test squared exponential covariance kernel" - + k = SquaredExponential() - + x = np.array([[1.], [2.]]) y = np.array([[2.], [3.]]) params = np.array([0., 0.]) - - assert_allclose(k.kernel_f(x, y, params), np.exp(-0.5*np.array([[1., 2.], [0., 1.]])**2)) - + + assert_allclose(k.kernel_f(x, y, params), + np.exp(-0.5*np.array([[1., 2.], [0., 1.]])**2)) + x = np.array([[1., 2.], [2., 3.]]) y = np.array([[2., 4.], [3., 1.]]) params = np.array([0., 0., 0.]) - - assert_allclose(k.kernel_f(x, y, params), np.exp(-0.5*np.array([[np.sqrt(5.), np.sqrt(5.)], [1., np.sqrt(5.)]])**2)) - + + assert_allclose(k.kernel_f(x, y, params), + np.exp(-0.5*np.array([[np.sqrt(5.), np.sqrt(5.)], + [1., np.sqrt(5.)]])**2)) + x = np.array([[1., 2.], [2., 3.]]) y = np.array([[2., 4.], [3., 1.]]) params = np.array([np.log(2.), np.log(4.), np.log(2.)]) - + assert_allclose(k.kernel_f(x, y, params), - 2.*np.exp(-0.5*np.array([[np.sqrt(1.*2.+4.*4.), np.sqrt(4.*2.+1.*4.)], [np.sqrt(1.*4.), np.sqrt(1.*2.+4.*4.)]])**2)) - + 2.*np.exp(-0.5*np.array([[np.sqrt(1.*2.+4.*4.), np.sqrt(4.*2.+1.*4.)], + [np.sqrt(1.*4.), np.sqrt(1.*2.+4.*4.)]])**2)) + x = np.array([1., 2.]) y = np.array([2., 3.]) params = np.array([0., 0.]) - - assert_allclose(k.kernel_f(x, y, params), np.exp(-0.5*np.array([[1., 2.], [0., 1.]])**2)) + + assert_allclose(k.kernel_f(x, y, params), + np.exp(-0.5*np.array([[1., 2.], [0., 1.]])**2)) def test_squared_exponential_failures(): "test scenarios where squared_exponential should raise an exception" - + k = SquaredExponential() - + x = np.array([[1.], [2.]]) y = np.array([[2.], [3.]]) params = np.array([0.]) - + with pytest.raises(AssertionError): k.kernel_f(x, y, params) - + params = np.array([[0., 0.], [0., 0.]]) - + with pytest.raises(AssertionError): k.kernel_f(x, y, params) @@ -422,123 +572,137 @@ def test_squared_exponential_deriv(): x = np.array([[1.], [2.]]) y = np.array([[2.], [3.]]) params = np.array([0., 0.]) - + deriv = np.zeros((2, 2, 2)) - + deriv[-1] = np.exp(-0.5*np.array([[1., 2.], [0., 1.]])**2) - deriv[0] = -0.5*np.array([[1., 4.],[0., 1.]])*np.exp(-0.5*np.array([[1., 2.], [0., 1.]])**2) + deriv[0] = (-0.5*np.array([[1., 4.], [0., 1.]])* + np.exp(-0.5*np.array([[1., 2.], [0., 1.]])**2)) deriv_fd = np.zeros((2, 2, 2)) - deriv_fd[0] = (k.kernel_f(x, y, params)-k.kernel_f(x, y, params - np.array([dx, 0.])))/dx - deriv_fd[1] = (k.kernel_f(x, y, params)-k.kernel_f(x, y, params - np.array([0., dx])))/dx - + deriv_fd[0] = (k.kernel_f(x, y, params) - + k.kernel_f(x, y, params - np.array([dx, 0.])))/dx + deriv_fd[1] = (k.kernel_f(x, y, params) - + k.kernel_f(x, y, params - np.array([0., dx])))/dx + assert_allclose(k.kernel_deriv(x, y, params), deriv) assert_allclose(k.kernel_deriv(x, y, params), deriv_fd, rtol = 1.e-5) - + x = np.array([[1., 2.], [2., 3.]]) y = np.array([[2., 4.], [3., 1.]]) params = np.array([0., 0., 0.]) - + deriv = np.zeros((3, 2, 2)) - - deriv[-1] = np.exp(-0.5*np.array([[np.sqrt(5.), np.sqrt(5.)], [1., np.sqrt(5.)]])**2) - deriv[0] = -0.5*np.array([[1., 4.],[0., 1.]])*deriv[-1] - deriv[1] = -0.5*np.array([[4., 1.],[1., 4.]])*deriv[-1] + + deriv[-1] = np.exp(-0.5*np.array([[np.sqrt(5.), np.sqrt(5.)], + [1., np.sqrt(5.)]])**2) + deriv[0] = -0.5*np.array([[1., 4.], [0., 1.]])*deriv[-1] + deriv[1] = -0.5*np.array([[4., 1.], [1., 4.]])*deriv[-1] deriv_fd = np.zeros((3, 2, 2)) - deriv_fd[0] = (k.kernel_f(x, y, params)-k.kernel_f(x, y, params - np.array([dx, 0., 0.])))/dx - deriv_fd[1] = (k.kernel_f(x, y, params)-k.kernel_f(x, y, params - np.array([0., dx, 
0.])))/dx - deriv_fd[2] = (k.kernel_f(x, y, params)-k.kernel_f(x, y, params - np.array([0., 0., dx])))/dx - + deriv_fd[0] = (k.kernel_f(x, y, params) - + k.kernel_f(x, y, params - np.array([dx, 0., 0.])))/dx + deriv_fd[1] = (k.kernel_f(x, y, params) - + k.kernel_f(x, y, params - np.array([0., dx, 0.])))/dx + deriv_fd[2] = (k.kernel_f(x, y, params) - + k.kernel_f(x, y, params - np.array([0., 0., dx])))/dx + assert_allclose(k.kernel_deriv(x, y, params), deriv) assert_allclose(k.kernel_deriv(x, y, params), deriv_fd, rtol = 1.e-5) - + x = np.array([[1., 2.], [2., 3.]]) y = np.array([[2., 4.], [3., 1.]]) params = np.array([np.log(2.), np.log(4.), np.log(2.)]) - + deriv = np.zeros((3, 2, 2)) - - deriv[-1] = 2.*np.exp(-0.5*np.array([[np.sqrt(1.*2.+4.*4.), np.sqrt(4.*2.+1.*4.)], [np.sqrt(1.*4.), np.sqrt(1.*2.+4.*4.)]])**2) - deriv[0] = -0.5*np.array([[2., 8.],[0., 2.]])*deriv[-1] - deriv[1] = -0.5*np.array([[16., 4.],[4., 16.]])*deriv[-1] + + deriv[-1] = 2.*np.exp(-0.5*np.array([[np.sqrt(1.*2.+4.*4.), np.sqrt(4.*2.+1.*4.)], + [np.sqrt(1.*4.), np.sqrt(1.*2.+4.*4.)]])**2) + deriv[0] = -0.5*np.array([[2., 8.], [0., 2.]])*deriv[-1] + deriv[1] = -0.5*np.array([[16., 4.], [4., 16.]])*deriv[-1] deriv_fd = np.zeros((3, 2, 2)) - deriv_fd[0] = (k.kernel_f(x, y, params)-k.kernel_f(x, y, params - np.array([dx, 0., 0.])))/dx - deriv_fd[1] = (k.kernel_f(x, y, params)-k.kernel_f(x, y, params - np.array([0., dx, 0.])))/dx - deriv_fd[2] = (k.kernel_f(x, y, params)-k.kernel_f(x, y, params - np.array([0., 0., dx])))/dx - + deriv_fd[0] = (k.kernel_f(x, y, params) - + k.kernel_f(x, y, params - np.array([dx, 0., 0.])))/dx + deriv_fd[1] = (k.kernel_f(x, y, params) - + k.kernel_f(x, y, params - np.array([0., dx, 0.])))/dx + deriv_fd[2] = (k.kernel_f(x, y, params) - + k.kernel_f(x, y, params - np.array([0., 0., dx])))/dx + assert_allclose(k.kernel_deriv(x, y, params), deriv) assert_allclose(k.kernel_deriv(x, y, params), deriv_fd, rtol = 1.e-5) - + x = np.array([1., 2.]) y = np.array([2., 3.]) params = np.array([0., 0.]) - + deriv = np.zeros((2, 2, 2)) - + deriv[-1] = np.exp(-0.5*np.array([[1., 2.], [0., 1.]])**2) - deriv[0] = -0.5*np.array([[1., 4.],[0., 1.]])*np.exp(-0.5*np.array([[1., 2.], [0., 1.]])**2) + deriv[0] = (-0.5*np.array([[1., 4.],[0., 1.]])* + np.exp(-0.5*np.array([[1., 2.], [0., 1.]])**2)) deriv_fd = np.zeros((2, 2, 2)) - deriv_fd[0] = (k.kernel_f(x, y, params)-k.kernel_f(x, y, params - np.array([dx, 0.])))/dx - deriv_fd[1] = (k.kernel_f(x, y, params)-k.kernel_f(x, y, params - np.array([0., dx])))/dx - + deriv_fd[0] = (k.kernel_f(x, y, params) - + k.kernel_f(x, y, params - np.array([dx, 0.])))/dx + deriv_fd[1] = (k.kernel_f(x, y, params) - + k.kernel_f(x, y, params - np.array([0., dx])))/dx + assert_allclose(k.kernel_deriv(x, y, params), deriv) assert_allclose(k.kernel_deriv(x, y, params), deriv_fd, rtol = 1.e-5) def test_squared_exponential_deriv_failures(): "test scenarios where squared_exponential should raise an exception" - + k = SquaredExponential() - + x = np.array([[1.], [2.]]) y = np.array([[2.], [3.]]) params = np.array([0.]) - + with pytest.raises(AssertionError): k.kernel_deriv(x, y, params) - + params = np.array([[0., 0.], [0., 0.]]) - + with pytest.raises(AssertionError): k.kernel_deriv(x, y, params) - + x = np.array([[1.], [2.]]) y = np.array([[2., 4.], [3., 2.]]) params = np.array([0., 0.]) - + with pytest.raises(AssertionError): k.kernel_deriv(x, y, params) - + x = np.array([[1.], [2.]]) y = np.array([[[2.], [4.]], [[3.], [2.]]]) params = np.array([0., 0.]) - + with 
pytest.raises(AssertionError): k.kernel_deriv(x, y, params) - + x = np.array([[2., 4.], [3., 2.]]) y = np.array([[1.], [2.]]) params = np.array([0., 0.]) - + with pytest.raises(AssertionError): k.kernel_deriv(x, y, params) - + x = np.array([[[2.], [4.]], [[3.], [2.]]]) y = np.array([[1.], [2.]]) params = np.array([0., 0.]) - + with pytest.raises(AssertionError): k.kernel_deriv(x, y, params) def test_squared_exponential_hessian(): "test the function to compute the squared exponential hessian" - + k = SquaredExponential() - + dx = 1.e-6 - + x = np.array([[1.], [2.]]) y = np.array([[2.], [3.]]) params = np.array([0., 0.]) - + hess = np.zeros((2, 2, 2, 2)) r2 = np.array([[1., 4.], [0., 1.]]) hess[0, 0] = (-0.5*r2+0.25*r2**2)*np.exp(-0.5*r2) @@ -546,20 +710,24 @@ def test_squared_exponential_hessian(): hess[1, 0] = -0.5*np.exp(-0.5*r2)*r2 hess[1, 1] = np.exp(-0.5*r2) hess_fd = np.zeros((2, 2, 2, 2)) - hess_fd[0, 0] = (k.kernel_deriv(x, y, params)[0]-k.kernel_deriv(x, y, params-np.array([dx, 0.]))[0])/dx - hess_fd[1, 0] = (k.kernel_deriv(x, y, params)[1]-k.kernel_deriv(x, y, params-np.array([dx, 0.]))[1])/dx - hess_fd[0, 1] = (k.kernel_deriv(x, y, params)[0]-k.kernel_deriv(x, y, params-np.array([0., dx]))[0])/dx - hess_fd[1, 1] = (k.kernel_deriv(x, y, params)[1]-k.kernel_deriv(x, y, params-np.array([0., dx]))[1])/dx - + hess_fd[0, 0] = (k.kernel_deriv(x, y, params)[0] - + k.kernel_deriv(x, y, params-np.array([dx, 0.]))[0])/dx + hess_fd[1, 0] = (k.kernel_deriv(x, y, params)[1] - + k.kernel_deriv(x, y, params-np.array([dx, 0.]))[1])/dx + hess_fd[0, 1] = (k.kernel_deriv(x, y, params)[0] - + k.kernel_deriv(x, y, params-np.array([0., dx]))[0])/dx + hess_fd[1, 1] = (k.kernel_deriv(x, y, params)[1] - + k.kernel_deriv(x, y, params-np.array([0., dx]))[1])/dx + assert_allclose(k.kernel_hessian(x, y, params), hess) assert_allclose(k.kernel_hessian(x, y, params), hess_fd, atol = 1.e-5) - + x = np.array([[1., 2.], [2., 3.]]) y = np.array([[2., 4.], [3., 1.]]) params = np.array([0., 0., 0.]) - + hess = np.zeros((3, 3, 2, 2)) - + r2 = np.array([[np.sqrt(5.), np.sqrt(5.)], [1., np.sqrt(5.)]])**2 x12 = np.array([[1., 4.],[0., 1.]]) x22 = np.array([[4., 1.],[1., 4.]]) @@ -573,26 +741,36 @@ def test_squared_exponential_hessian(): hess[2, 1] = -0.5*np.exp(-0.5*r2)*x22 hess[2, 2] = np.exp(-0.5*r2) hess_fd = np.zeros((3, 3, 2, 2)) - hess_fd[0, 0] = (k.kernel_deriv(x, y, params)[0]-k.kernel_deriv(x, y, params-np.array([dx, 0., 0.]))[0])/dx - hess_fd[1, 0] = (k.kernel_deriv(x, y, params)[1]-k.kernel_deriv(x, y, params-np.array([dx, 0., 0.]))[1])/dx - hess_fd[0, 1] = (k.kernel_deriv(x, y, params)[0]-k.kernel_deriv(x, y, params-np.array([0., dx, 0.]))[0])/dx - hess_fd[1, 1] = (k.kernel_deriv(x, y, params)[1]-k.kernel_deriv(x, y, params-np.array([0., dx, 0.]))[1])/dx - hess_fd[0, 2] = (k.kernel_deriv(x, y, params)[0]-k.kernel_deriv(x, y, params-np.array([0., 0., dx]))[0])/dx - hess_fd[2, 0] = (k.kernel_deriv(x, y, params)[2]-k.kernel_deriv(x, y, params-np.array([dx, 0., 0.]))[2])/dx - hess_fd[2, 1] = (k.kernel_deriv(x, y, params)[2]-k.kernel_deriv(x, y, params-np.array([0., dx, 0.]))[2])/dx - hess_fd[1, 2] = (k.kernel_deriv(x, y, params)[1]-k.kernel_deriv(x, y, params-np.array([0., 0., dx]))[1])/dx - hess_fd[2, 2] = (k.kernel_deriv(x, y, params)[2]-k.kernel_deriv(x, y, params-np.array([0., 0., dx]))[2])/dx + hess_fd[0, 0] = (k.kernel_deriv(x, y, params)[0] - + k.kernel_deriv(x, y, params-np.array([dx, 0., 0.]))[0])/dx + hess_fd[1, 0] = (k.kernel_deriv(x, y, params)[1] - + k.kernel_deriv(x, y, params-np.array([dx, 0., 
0.]))[1])/dx + hess_fd[0, 1] = (k.kernel_deriv(x, y, params)[0] - + k.kernel_deriv(x, y, params-np.array([0., dx, 0.]))[0])/dx + hess_fd[1, 1] = (k.kernel_deriv(x, y, params)[1] - + k.kernel_deriv(x, y, params-np.array([0., dx, 0.]))[1])/dx + hess_fd[0, 2] = (k.kernel_deriv(x, y, params)[0] - + k.kernel_deriv(x, y, params-np.array([0., 0., dx]))[0])/dx + hess_fd[2, 0] = (k.kernel_deriv(x, y, params)[2] - + k.kernel_deriv(x, y, params-np.array([dx, 0., 0.]))[2])/dx + hess_fd[2, 1] = (k.kernel_deriv(x, y, params)[2] - + k.kernel_deriv(x, y, params-np.array([0., dx, 0.]))[2])/dx + hess_fd[1, 2] = (k.kernel_deriv(x, y, params)[1] - + k.kernel_deriv(x, y, params-np.array([0., 0., dx]))[1])/dx + hess_fd[2, 2] = (k.kernel_deriv(x, y, params)[2] - + k.kernel_deriv(x, y, params-np.array([0., 0., dx]))[2])/dx assert_allclose(k.kernel_hessian(x, y, params), hess) assert_allclose(k.kernel_hessian(x, y, params), hess_fd, atol = 1.e-5) - + x = np.array([[1., 2.], [2., 3.]]) y = np.array([[2., 4.], [3., 1.]]) params = np.array([np.log(2.), np.log(4.), np.log(2.)]) - + hess = np.zeros((3, 3, 2, 2)) - - r2 = np.array([[np.sqrt(1.*2.+4.*4.), np.sqrt(4.*2.+1.*4.)], [np.sqrt(1.*4.), np.sqrt(1.*2.+4.*4.)]])**2 + + r2 = np.array([[np.sqrt(1.*2.+4.*4.), np.sqrt(4.*2.+1.*4.)], + [np.sqrt(1.*4.), np.sqrt(1.*2.+4.*4.)]])**2 x12 = np.array([[1., 4.],[0., 1.]]) x22 = np.array([[4., 1.],[1., 4.]]) hess[0, 0] = (-0.5*x12+0.25*x12**2)*2.*2.*2.*np.exp(-0.5*r2) @@ -605,23 +783,32 @@ def test_squared_exponential_hessian(): hess[2, 1] = -0.5*2.*4.*np.exp(-0.5*r2)*x22 hess[2, 2] = 2.*np.exp(-0.5*r2) hess_fd = np.zeros((3, 3, 2, 2)) - hess_fd[0, 0] = (k.kernel_deriv(x, y, params)[0]-k.kernel_deriv(x, y, params-np.array([dx, 0., 0.]))[0])/dx - hess_fd[1, 0] = (k.kernel_deriv(x, y, params)[1]-k.kernel_deriv(x, y, params-np.array([dx, 0., 0.]))[1])/dx - hess_fd[0, 1] = (k.kernel_deriv(x, y, params)[0]-k.kernel_deriv(x, y, params-np.array([0., dx, 0.]))[0])/dx - hess_fd[1, 1] = (k.kernel_deriv(x, y, params)[1]-k.kernel_deriv(x, y, params-np.array([0., dx, 0.]))[1])/dx - hess_fd[0, 2] = (k.kernel_deriv(x, y, params)[0]-k.kernel_deriv(x, y, params-np.array([0., 0., dx]))[0])/dx - hess_fd[2, 0] = (k.kernel_deriv(x, y, params)[2]-k.kernel_deriv(x, y, params-np.array([dx, 0., 0.]))[2])/dx - hess_fd[2, 1] = (k.kernel_deriv(x, y, params)[2]-k.kernel_deriv(x, y, params-np.array([0., dx, 0.]))[2])/dx - hess_fd[1, 2] = (k.kernel_deriv(x, y, params)[1]-k.kernel_deriv(x, y, params-np.array([0., 0., dx]))[1])/dx - hess_fd[2, 2] = (k.kernel_deriv(x, y, params)[2]-k.kernel_deriv(x, y, params-np.array([0., 0., dx]))[2])/dx + hess_fd[0, 0] = (k.kernel_deriv(x, y, params)[0] - + k.kernel_deriv(x, y, params-np.array([dx, 0., 0.]))[0])/dx + hess_fd[1, 0] = (k.kernel_deriv(x, y, params)[1] - + k.kernel_deriv(x, y, params-np.array([dx, 0., 0.]))[1])/dx + hess_fd[0, 1] = (k.kernel_deriv(x, y, params)[0] - + k.kernel_deriv(x, y, params-np.array([0., dx, 0.]))[0])/dx + hess_fd[1, 1] = (k.kernel_deriv(x, y, params)[1] - + k.kernel_deriv(x, y, params-np.array([0., dx, 0.]))[1])/dx + hess_fd[0, 2] = (k.kernel_deriv(x, y, params)[0] - + k.kernel_deriv(x, y, params-np.array([0., 0., dx]))[0])/dx + hess_fd[2, 0] = (k.kernel_deriv(x, y, params)[2] - + k.kernel_deriv(x, y, params-np.array([dx, 0., 0.]))[2])/dx + hess_fd[2, 1] = (k.kernel_deriv(x, y, params)[2] - + k.kernel_deriv(x, y, params-np.array([0., dx, 0.]))[2])/dx + hess_fd[1, 2] = (k.kernel_deriv(x, y, params)[1] - + k.kernel_deriv(x, y, params-np.array([0., 0., dx]))[1])/dx + hess_fd[2, 2] = 
(k.kernel_deriv(x, y, params)[2] - + k.kernel_deriv(x, y, params-np.array([0., 0., dx]))[2])/dx assert_allclose(k.kernel_hessian(x, y, params)[1,2], hess[1,2]) assert_allclose(k.kernel_hessian(x, y, params), hess_fd, atol = 1.e-5) - + x = np.array([1., 2.]) y = np.array([2., 3.]) params = np.array([0., 0.]) - + hess = np.zeros((2, 2, 2, 2)) r2 = np.array([[1., 4.], [0., 1.]]) hess[0, 0] = (-0.5*r2+0.25*r2**2)*np.exp(-0.5*r2) @@ -629,80 +816,208 @@ def test_squared_exponential_hessian(): hess[1, 0] = -0.5*np.exp(-0.5*r2)*r2 hess[1, 1] = np.exp(-0.5*r2) hess_fd = np.zeros((2, 2, 2, 2)) - hess_fd[0, 0] = (k.kernel_deriv(x, y, params)[0]-k.kernel_deriv(x, y, params-np.array([dx, 0.]))[0])/dx - hess_fd[1, 0] = (k.kernel_deriv(x, y, params)[1]-k.kernel_deriv(x, y, params-np.array([dx, 0.]))[1])/dx - hess_fd[0, 1] = (k.kernel_deriv(x, y, params)[0]-k.kernel_deriv(x, y, params-np.array([0., dx]))[0])/dx - hess_fd[1, 1] = (k.kernel_deriv(x, y, params)[1]-k.kernel_deriv(x, y, params-np.array([0., dx]))[1])/dx - + hess_fd[0, 0] = (k.kernel_deriv(x, y, params)[0] - + k.kernel_deriv(x, y, params-np.array([dx, 0.]))[0])/dx + hess_fd[1, 0] = (k.kernel_deriv(x, y, params)[1] - + k.kernel_deriv(x, y, params-np.array([dx, 0.]))[1])/dx + hess_fd[0, 1] = (k.kernel_deriv(x, y, params)[0] - + k.kernel_deriv(x, y, params-np.array([0., dx]))[0])/dx + hess_fd[1, 1] = (k.kernel_deriv(x, y, params)[1] - + k.kernel_deriv(x, y, params-np.array([0., dx]))[1])/dx + assert_allclose(k.kernel_hessian(x, y, params), hess) assert_allclose(k.kernel_hessian(x, y, params), hess_fd, atol = 1.e-5) def test_squared_exponential_hessian_failures(): "test situaitons where squared_exponential_hessian should fail" - + k = SquaredExponential() - + x = np.array([[1.], [2.]]) y = np.array([[2.], [3.]]) params = np.array([0.]) - + with pytest.raises(AssertionError): k.kernel_hessian(x, y, params) - + params = np.array([[0., 0.], [0., 0.]]) - + with pytest.raises(AssertionError): k.kernel_hessian(x, y, params) - + x = np.array([[1.], [2.]]) y = np.array([[2., 4.], [3., 2.]]) params = np.array([0., 0.]) - + with pytest.raises(AssertionError): k.kernel_hessian(x, y, params) - + x = np.array([[1.], [2.]]) y = np.array([[[2.], [4.]], [[3.], [2.]]]) params = np.array([0., 0.]) - + with pytest.raises(AssertionError): k.kernel_hessian(x, y, params) - + x = np.array([[2., 4.], [3., 2.]]) y = np.array([[1.], [2.]]) params = np.array([0., 0.]) - + with pytest.raises(AssertionError): k.kernel_hessian(x, y, params) - + x = np.array([[[2.], [4.]], [[3.], [2.]]]) y = np.array([[1.], [2.]]) params = np.array([0., 0.]) - + with pytest.raises(AssertionError): k.kernel_hessian(x, y, params) -def test_matern_5_2_K(): - "test matern 5/2 K(r) function" - - k = Matern52() - - assert_allclose(k.calc_K(1.), (1.+np.sqrt(5.)+5./3.)*np.exp(-np.sqrt(5.))) +def test_squared_exponential_inputderiv(): + "test the input derivative method of squared exponential" - r = np.array([[1., 2.], [3., 4.]]) + k = SquaredExponential() + + dx = 1.e-6 + + x = np.array([[1.], [2.]]) + y = np.array([[2.], [3.]]) + params = np.array([0., 0.]) + + deriv = np.zeros((1, 2, 2)) + + r = np.array([[1., 2.], [0., 1.]]) + + deriv[0] = -r*np.exp(-0.5*r**2)*np.array([[-1., -1.], [0., -1.]]) + deriv_fd = np.zeros((1, 2, 2)) + deriv_fd[0] = (k.kernel_f(x + dx, y, params) - + k.kernel_f(x - dx, y, params))/dx/2. 
+ + assert_allclose(k.kernel_inputderiv(x, y, params), deriv) + assert_allclose(k.kernel_inputderiv(x, y, params), deriv_fd, rtol = 1.e-5) + + x = np.array([[1., 2.], [2., 3.]]) + y = np.array([[2., 4.], [3., 1.]]) + params = np.array([0., 0., 0.]) + + deriv = np.zeros((2, 2, 2)) + + r = np.array([[np.sqrt(5.), np.sqrt(5.)], [1., np.sqrt(5.)]]) + + deriv[0] = -np.exp(-0.5*r**2)*np.array([[-1., -2.], [ 0., -1.]]) + deriv[1] = -np.exp(-0.5*r**2)*np.array([[-2., 1.], [-1., 2.]]) + deriv_fd = np.zeros((2, 2, 2)) + deriv_fd[0] = (k.kernel_f(x + np.array([[dx, 0.], [dx, 0.]]), y, params) - + k.kernel_f(x - np.array([[dx, 0.], [dx, 0.]]), y, params))/dx/2. + deriv_fd[1] = (k.kernel_f(x + np.array([[0., dx], [0., dx]]), y, params) - + k.kernel_f(x - np.array([[0., dx], [0., dx]]), y, params))/dx/2. + + assert_allclose(k.kernel_inputderiv(x, y, params), deriv) + assert_allclose(k.kernel_inputderiv(x, y, params), deriv_fd, rtol = 1.e-5) + + x = np.array([[1., 2.], [2., 3.]]) + y = np.array([[2., 4.], [3., 1.]]) + params = np.array([np.log(2.), np.log(4.), np.log(2.)]) + + deriv = np.zeros((2, 2, 2)) + + r = np.array([[np.sqrt(1.*2.+4.*4.), np.sqrt(4.*2.+1.*4.)], + [np.sqrt(1.*4.), np.sqrt(1.*2.+4.*4.)]]) + + deriv[0] = (-2.*2.*np.exp(-0.5*r**2)* + np.array([[-1., -2.], [0., -1.]])) + deriv[1] = (-2.*4.*np.exp(-0.5*r**2)* + np.array([[-2., 1.], [-1., 2.]])) + deriv_fd = np.zeros((2, 2, 2)) + deriv_fd[0] = (k.kernel_f(x + np.array([[dx, 0.], [dx, 0.]]), y, params) - + k.kernel_f(x - np.array([[dx, 0.], [dx, 0.]]), y, params))/dx/2. + deriv_fd[1] = (k.kernel_f(x + np.array([[0., dx], [0., dx]]), y, params) - + k.kernel_f(x - np.array([[0., dx], [0., dx]]), y, params))/dx/2. + + assert_allclose(k.kernel_inputderiv(x, y, params), deriv) + assert_allclose(k.kernel_inputderiv(x, y, params), deriv_fd, rtol = 1.e-5) + + x = np.array([1., 2.]) + y = np.array([2., 3.]) + params = np.array([0., 0.]) + + deriv = np.zeros((1, 2, 2)) + + r = np.array([[1., 2.], [0., 1.]]) + + deriv[0] = -r*np.exp(-0.5*r**2)*np.array([[-1., -1.], [0., -1.]]) + deriv_fd = np.zeros((1, 2, 2)) + deriv_fd[0] = (k.kernel_f(x + dx, y, params)-k.kernel_f(x - dx, y, params))/dx/2. 
+ + assert_allclose(k.kernel_inputderiv(x, y, params), deriv) + assert_allclose(k.kernel_inputderiv(x, y, params), deriv_fd, rtol = 1.e-5) + +def test_squared_exponential_inputderiv_failures(): + "test situations where input derivative method should fail" + + k = SquaredExponential() + + x = np.array([[1.], [2.]]) + y = np.array([[2.], [3.]]) + params = np.array([0.]) + + with pytest.raises(AssertionError): + k.kernel_inputderiv(x, y, params) + + params = np.array([[0., 0.], [0., 0.]]) + + with pytest.raises(AssertionError): + k.kernel_inputderiv(x, y, params) + + x = np.array([[1.], [2.]]) + y = np.array([[2., 4.], [3., 2.]]) + params = np.array([0., 0.]) + + with pytest.raises(AssertionError): + k.kernel_inputderiv(x, y, params) + + x = np.array([[1.], [2.]]) + y = np.array([[[2.], [4.]], [[3.], [2.]]]) + params = np.array([0., 0.]) + + with pytest.raises(AssertionError): + k.kernel_inputderiv(x, y, params) + + x = np.array([[2., 4.], [3., 2.]]) + y = np.array([[1.], [2.]]) + params = np.array([0., 0.]) + + with pytest.raises(AssertionError): + k.kernel_inputderiv(x, y, params) + + x = np.array([[[2.], [4.]], [[3.], [2.]]]) + y = np.array([[1.], [2.]]) + params = np.array([0., 0.]) + + with pytest.raises(AssertionError): + k.kernel_inputderiv(x, y, params) + +def test_matern_5_2_K(): + "test matern 5/2 K(r) function" + + k = Matern52() + + assert_allclose(k.calc_K(1.), (1.+np.sqrt(5.)+5./3.)*np.exp(-np.sqrt(5.))) + + r = np.array([[1., 2.], [3., 4.]]) assert_allclose(k.calc_K(r), (1.+np.sqrt(5.)*r+5./3.*r**2)*np.exp(-np.sqrt(5.)*r)) with pytest.raises(AssertionError): k.calc_K(-1.) - + def test_matern_5_2_dKdr(): "test matern 5/2 dK/dr function" - + k = Matern52() - + dx = 1.e-6 - + assert_allclose(k.calc_dKdr(1.), -5./3.*(1.+np.sqrt(5.))*np.exp(-np.sqrt(5.))) assert_allclose(k.calc_dKdr(1.), (k.calc_K(1.)-k.calc_K(1.-dx))/dx, rtol = 1.e-5) @@ -713,14 +1028,14 @@ def test_matern_5_2_dKdr(): with pytest.raises(AssertionError): k.calc_dKdr(-1.) - + def test_matern_5_2_d2Kdr2(): "test squared exponential d2K/dr2 function" - + k = Matern52() - + dx = 1.e-6 - + assert_allclose(k.calc_d2Kdr2(1.), 5./3.*(5.-np.sqrt(5.)-1.)*np.exp(-np.sqrt(5.))) assert_allclose(k.calc_d2Kdr2(1.), (k.calc_dKdr(1.)-k.calc_dKdr(1.-dx))/dx, rtol = 1.e-5) @@ -728,235 +1043,254 @@ def test_matern_5_2_d2Kdr2(): assert_allclose(k.calc_d2Kdr2(r), 5./3.*(5.*r**2-np.sqrt(5.)*r-1.)*np.exp(-np.sqrt(5.)*r)) assert_allclose(k.calc_d2Kdr2(r), (k.calc_dKdr(r)-k.calc_dKdr(r - dx))/dx, rtol = 1.e-5) - + with pytest.raises(AssertionError): k.calc_d2Kdr2(-1.) 
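The Matern 5/2 tests below repeat the same pattern for the second kernel class. The input derivative itself is assembled by the base ``Kernel`` class via the chain rule, dK/dx = exp(params[-1]) * dK/dr * dr/dx, with only ``calc_K``, ``calc_dKdr``, and ``calc_d2Kdr2`` supplied by the subclass. A short sketch of that identity (illustrative only, not part of the patch; import path assumed as above):

import numpy as np
from mogp_emulator.Kernel import Matern52

k = Matern52()
x = np.array([[1., 2.], [2., 3.]])
y = np.array([[2., 4.], [3., 1.]])
params = np.array([np.log(2.), np.log(4.), np.log(2.)])

# chain rule used by kernel_inputderiv: covariance scale times dK/dr times dr/dx
r = k.calc_r(x, y, params)
manual = np.exp(params[-1])*k.calc_dKdr(r)*k.calc_drdx(x, y, params)

assert np.allclose(k.kernel_inputderiv(x, y, params), manual)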
def test_matern_5_2(): "test matern 5/2 covariance kernel" - + k = Matern52() - + x = np.array([[1.], [2.]]) y = np.array([[2.], [3.]]) params = np.array([0., 0.]) - + D = np.array([[1., 2.], [0., 1.]]) - - assert_allclose(k.kernel_f(x, y, params), (1.+np.sqrt(5.)*D+5./3.*D**2)*np.exp(-np.sqrt(5.)*D)) - + + assert_allclose(k.kernel_f(x, y, params), + (1.+np.sqrt(5.)*D+5./3.*D**2)*np.exp(-np.sqrt(5.)*D)) + x = np.array([[1., 2.], [2., 3.]]) y = np.array([[2., 4.], [3., 1.]]) params = np.array([0., 0., 0.]) - + D = np.array([[np.sqrt(5.), np.sqrt(5.)], [1., np.sqrt(5.)]]) - - assert_allclose(k.kernel_f(x, y, params), (1.+np.sqrt(5.)*D+5./3.*D**2)*np.exp(-np.sqrt(5.)*D)) - + + assert_allclose(k.kernel_f(x, y, params), + (1.+np.sqrt(5.)*D+5./3.*D**2)*np.exp(-np.sqrt(5.)*D)) + x = np.array([[1., 2.], [2., 3.]]) y = np.array([[2., 4.], [3., 1.]]) params = np.array([np.log(2.), np.log(4.), np.log(2.)]) - - D = np.array([[np.sqrt(1.*2.+4.*4.), np.sqrt(4.*2.+1.*4.)], [np.sqrt(1.*4.), np.sqrt(1.*2.+4.*4.)]]) - + + D = np.array([[np.sqrt(1.*2.+4.*4.), np.sqrt(4.*2.+1.*4.)], + [np.sqrt(1.*4.), np.sqrt(1.*2.+4.*4.)]]) + assert_allclose(k.kernel_f(x, y, params), 2.*(1.+np.sqrt(5.)*D+5./3.*D**2)*np.exp(-np.sqrt(5.)*D)) - + x = np.array([1., 2.]) y = np.array([2., 3.]) params = np.array([0., 0.]) - + D = np.array([[1., 2.], [0., 1.]]) - - assert_allclose(k.kernel_f(x, y, params), (1.+np.sqrt(5.)*D + 5./3.*D**2)*np.exp(-np.sqrt(5.)*D)) - + + assert_allclose(k.kernel_f(x, y, params), + (1.+np.sqrt(5.)*D + 5./3.*D**2)*np.exp(-np.sqrt(5.)*D)) + def test_matern_5_2_failures(): "test scenarios where matern_5_2 should raise an exception" - + k = Matern52() - + x = np.array([[1.], [2.]]) y = np.array([[2.], [3.]]) params = np.array([0.]) - + with pytest.raises(AssertionError): k.kernel_f(x, y, params) - + params = np.array([[0., 0.], [0., 0.]]) - + with pytest.raises(AssertionError): k.kernel_f(x, y, params) - + def test_matern_5_2_deriv(): "test computing the gradient of the matern 5/2 kernel" - + k = Matern52() - + dx = 1.e-6 - + x = np.array([[1.], [2.]]) y = np.array([[2.], [3.]]) params = np.array([0., 0.]) - + deriv = np.zeros((2, 2, 2)) - + D = np.array([[1., 2.], [0., 1.]]) - + deriv[-1] = (1.+np.sqrt(5.)*D+5./3.*D**2)*np.exp(-np.sqrt(5.)*D) deriv[0] = -0.5*D**2*5./3.*(1.+np.sqrt(5.)*D)*np.exp(-np.sqrt(5.)*D) - + deriv_fd = np.zeros((2, 2, 2)) - deriv_fd[0] = (k.kernel_f(x, y, params)-k.kernel_f(x, y, params - np.array([dx, 0.])))/dx - deriv_fd[1] = (k.kernel_f(x, y, params)-k.kernel_f(x, y, params - np.array([0., dx])))/dx - + deriv_fd[0] = (k.kernel_f(x, y, params) - + k.kernel_f(x, y, params - np.array([dx, 0.])))/dx + deriv_fd[1] = (k.kernel_f(x, y, params) - + k.kernel_f(x, y, params - np.array([0., dx])))/dx + assert_allclose(k.kernel_deriv(x, y, params), deriv) assert_allclose(k.kernel_deriv(x, y, params), deriv_fd, rtol = 1.e-5) - + x = np.array([[1., 2.], [2., 3.]]) y = np.array([[2., 4.], [3., 1.]]) params = np.array([0., 0., 0.]) - + D = np.array([[np.sqrt(5.), np.sqrt(5.)], [1., np.sqrt(5.)]]) D1 = np.array([[1., 2.], [0., 1.]]) D2 = np.array([[2., 1.], [1., 2.]]) - + deriv = np.zeros((3, 2, 2)) - + deriv[-1] = (1.+np.sqrt(5.)*D+5./3.*D**2)*np.exp(-np.sqrt(5.)*D) deriv[0] = -0.5*D1**2*5./3.*(1.+np.sqrt(5.)*D)*np.exp(-np.sqrt(5.)*D) deriv[1] = -0.5*D2**2*5./3.*(1.+np.sqrt(5.)*D)*np.exp(-np.sqrt(5.)*D) - + deriv_fd = np.zeros((3, 2, 2)) - deriv_fd[0] = (k.kernel_f(x, y, params)-k.kernel_f(x, y, params - np.array([dx, 0., 0.])))/dx - deriv_fd[1] = (k.kernel_f(x, y, params)-k.kernel_f(x, y, params 
- np.array([0., dx, 0.])))/dx - deriv_fd[2] = (k.kernel_f(x, y, params)-k.kernel_f(x, y, params - np.array([0., 0., dx])))/dx - + deriv_fd[0] = (k.kernel_f(x, y, params) - + k.kernel_f(x, y, params - np.array([dx, 0., 0.])))/dx + deriv_fd[1] = (k.kernel_f(x, y, params) - + k.kernel_f(x, y, params - np.array([0., dx, 0.])))/dx + deriv_fd[2] = (k.kernel_f(x, y, params) - + k.kernel_f(x, y, params - np.array([0., 0., dx])))/dx + assert_allclose(k.kernel_deriv(x, y, params), deriv) assert_allclose(k.kernel_deriv(x, y, params), deriv_fd, rtol = 1.e-5) - + x = np.array([[1., 2.], [2., 3.]]) y = np.array([[2., 4.], [3., 1.]]) params = np.array([np.log(2.), np.log(4.), np.log(2.)]) - - D = np.array([[np.sqrt(1.*2.+4.*4.), np.sqrt(4.*2.+1.*4.)], [np.sqrt(1.*4.), np.sqrt(1.*2.+4.*4.)]]) + + D = np.array([[np.sqrt(1.*2.+4.*4.), np.sqrt(4.*2.+1.*4.)], + [np.sqrt(1.*4.), np.sqrt(1.*2.+4.*4.)]]) D1 = np.array([[1., 2.], [0., 1.]]) D2 = np.array([[2., 1.], [1., 2.]]) - + deriv = np.zeros((3, 2, 2)) - + deriv[-1] = 2.*(1.+np.sqrt(5.)*D+5./3.*D**2)*np.exp(-np.sqrt(5.)*D) deriv[0] = -0.5*2.*2.*D1**2*5./3.*(1.+np.sqrt(5.)*D)*np.exp(-np.sqrt(5.)*D) deriv[1] = -0.5*2.*4.*D2**2*5./3.*(1.+np.sqrt(5.)*D)*np.exp(-np.sqrt(5.)*D) - + deriv_fd = np.zeros((3, 2, 2)) - deriv_fd[0] = (k.kernel_f(x, y, params)-k.kernel_f(x, y, params - np.array([dx, 0., 0.])))/dx - deriv_fd[1] = (k.kernel_f(x, y, params)-k.kernel_f(x, y, params - np.array([0., dx, 0.])))/dx - deriv_fd[2] = (k.kernel_f(x, y, params)-k.kernel_f(x, y, params - np.array([0., 0., dx])))/dx - + deriv_fd[0] = (k.kernel_f(x, y, params) - + k.kernel_f(x, y, params - np.array([dx, 0., 0.])))/dx + deriv_fd[1] = (k.kernel_f(x, y, params) - + k.kernel_f(x, y, params - np.array([0., dx, 0.])))/dx + deriv_fd[2] = (k.kernel_f(x, y, params) - + k.kernel_f(x, y, params - np.array([0., 0., dx])))/dx + assert_allclose(k.kernel_deriv(x, y, params), deriv) assert_allclose(k.kernel_deriv(x, y, params), deriv_fd, rtol = 1.e-5) - + x = np.array([1., 2.]) y = np.array([2., 3.]) params = np.array([0., 0.]) - + deriv = np.zeros((2, 2, 2)) - + D = np.array([[1., 2.], [0., 1.]]) - + deriv[-1] = (1.+np.sqrt(5.)*D+5./3.*D**2)*np.exp(-np.sqrt(5.)*D) deriv[0] = -0.5*D**2*5./3.*(1.+np.sqrt(5.)*D)*np.exp(-np.sqrt(5.)*D) - + deriv_fd = np.zeros((2, 2, 2)) - deriv_fd[0] = (k.kernel_f(x, y, params)-k.kernel_f(x, y, params - np.array([dx, 0.])))/dx - deriv_fd[1] = (k.kernel_f(x, y, params)-k.kernel_f(x, y, params - np.array([0, dx])))/dx - + deriv_fd[0] = (k.kernel_f(x, y, params) - + k.kernel_f(x, y, params - np.array([dx, 0.])))/dx + deriv_fd[1] = (k.kernel_f(x, y, params) - + k.kernel_f(x, y, params - np.array([0, dx])))/dx + assert_allclose(k.kernel_deriv(x, y, params), deriv) assert_allclose(k.kernel_deriv(x, y, params), deriv_fd, rtol = 1.e-5) def test_matern_5_2_deriv_failures(): "test scenarios where matern_5_2_deriv should raise an exception" - + k = Matern52() - + x = np.array([[1.], [2.]]) y = np.array([[2.], [3.]]) params = np.array([0.]) - + with pytest.raises(AssertionError): k.kernel_deriv(x, y, params) - + params = np.array([[0., 0.], [0., 0.]]) - + with pytest.raises(AssertionError): k.kernel_deriv(x, y, params) - + x = np.array([[1.], [2.]]) y = np.array([[2., 4.], [3., 2.]]) params = np.array([0., 0.]) - + with pytest.raises(AssertionError): k.kernel_deriv(x, y, params) - + x = np.array([[1.], [2.]]) y = np.array([[[2.], [4.]], [[3.], [2.]]]) params = np.array([0., 0.]) - + with pytest.raises(AssertionError): k.kernel_deriv(x, y, params) - + x = np.array([[2., 4.], [3., 
2.]]) y = np.array([[1.], [2.]]) params = np.array([0., 0.]) - + with pytest.raises(AssertionError): k.kernel_deriv(x, y, params) - + x = np.array([[[2.], [4.]], [[3.], [2.]]]) y = np.array([[1.], [2.]]) params = np.array([0., 0.]) - + with pytest.raises(AssertionError): k.kernel_deriv(x, y, params) - + def test_matern_5_2_hessian(): "test the function to compute the squared exponential hessian" - + k = Matern52() - + dx = 1.e-6 - + x = np.array([[1.], [2.]]) y = np.array([[2.], [3.]]) params = np.array([0., 0.]) D = np.array([[1., 2.], [0., 1.]]) - + hess = np.zeros((2, 2, 2, 2)) hess[0, 0] = 5./3.*np.exp(-np.sqrt(5.)*D)*(5./4.*D**4-(1.+np.sqrt(5.)*D)*D**2/2.) hess[0, 1] = -0.5*D**2*5./3.*(1.+np.sqrt(5.)*D)*np.exp(-np.sqrt(5.)*D) hess[1, 0] = -0.5*D**2*5./3.*(1.+np.sqrt(5.)*D)*np.exp(-np.sqrt(5.)*D) hess[1, 1] = (1.+np.sqrt(5.)*D+5./3.*D**2)*np.exp(-np.sqrt(5.)*D) - + hess_fd = np.zeros((2, 2, 2, 2)) - hess_fd[0, 0] = (k.kernel_deriv(x, y, params)[0]-k.kernel_deriv(x, y, params-np.array([dx, 0.]))[0])/dx - hess_fd[0, 1] = (k.kernel_deriv(x, y, params)[0]-k.kernel_deriv(x, y, params-np.array([0., dx]))[0])/dx - hess_fd[1, 0] = (k.kernel_deriv(x, y, params)[1]-k.kernel_deriv(x, y, params-np.array([dx, 0.]))[1])/dx - hess_fd[1, 1] = (k.kernel_deriv(x, y, params)[1]-k.kernel_deriv(x, y, params-np.array([0., dx]))[1])/dx - + hess_fd[0, 0] = (k.kernel_deriv(x, y, params)[0] - + k.kernel_deriv(x, y, params-np.array([dx, 0.]))[0])/dx + hess_fd[0, 1] = (k.kernel_deriv(x, y, params)[0] - + k.kernel_deriv(x, y, params-np.array([0., dx]))[0])/dx + hess_fd[1, 0] = (k.kernel_deriv(x, y, params)[1] - + k.kernel_deriv(x, y, params-np.array([dx, 0.]))[1])/dx + hess_fd[1, 1] = (k.kernel_deriv(x, y, params)[1] - + k.kernel_deriv(x, y, params-np.array([0., dx]))[1])/dx + assert_allclose(k.kernel_hessian(x, y, params), hess) assert_allclose(k.kernel_hessian(x, y, params), hess_fd, rtol = 1.e-5) - + x = np.array([[1., 2.], [2., 3.]]) y = np.array([[2., 4.], [3., 1.]]) params = np.array([0., 0., 0.]) - + D = np.array([[np.sqrt(5.), np.sqrt(5.)], [1., np.sqrt(5.)]]) D1 = np.array([[1., 2.], [0., 1.]]) D2 = np.array([[2., 1.], [1., 2.]]) - + hess = np.zeros((3, 3, 2, 2)) - + hess[0, 0] = 5./3.*np.exp(-np.sqrt(5.)*D)*(5./4.*D1**4-(1.+np.sqrt(5.)*D)*D1**2/2.) 
hess[0, 1] = 5./3.*np.exp(-np.sqrt(5.)*D)*(5./4.*D1**2*D2**2) hess[1, 0] = 5./3.*np.exp(-np.sqrt(5.)*D)*(5./4.*D1**2*D2**2) @@ -966,138 +1300,294 @@ def test_matern_5_2_hessian(): hess[1, 2] = -0.5*D2**2*5./3.*(1.+np.sqrt(5.)*D)*np.exp(-np.sqrt(5.)*D) hess[2, 1] = -0.5*D2**2*5./3.*(1.+np.sqrt(5.)*D)*np.exp(-np.sqrt(5.)*D) hess[2, 2] = (1.+np.sqrt(5.)*D+5./3.*D**2)*np.exp(-np.sqrt(5.)*D) - + hess_fd = np.zeros((3, 3, 2, 2)) - hess_fd[0, 0] = (k.kernel_deriv(x, y, params)[0]-k.kernel_deriv(x, y, params-np.array([dx, 0., 0.]))[0])/dx - hess_fd[0, 1] = (k.kernel_deriv(x, y, params)[0]-k.kernel_deriv(x, y, params-np.array([0., dx, 0.]))[0])/dx - hess_fd[1, 0] = (k.kernel_deriv(x, y, params)[1]-k.kernel_deriv(x, y, params-np.array([dx, 0., 0.]))[1])/dx - hess_fd[1, 1] = (k.kernel_deriv(x, y, params)[1]-k.kernel_deriv(x, y, params-np.array([0., dx, 0.]))[1])/dx - hess_fd[0, 2] = (k.kernel_deriv(x, y, params)[0]-k.kernel_deriv(x, y, params-np.array([0., 0., dx]))[0])/dx - hess_fd[2, 0] = (k.kernel_deriv(x, y, params)[2]-k.kernel_deriv(x, y, params-np.array([dx, 0., 0.]))[2])/dx - hess_fd[1, 2] = (k.kernel_deriv(x, y, params)[1]-k.kernel_deriv(x, y, params-np.array([0., 0., dx]))[1])/dx - hess_fd[2, 1] = (k.kernel_deriv(x, y, params)[2]-k.kernel_deriv(x, y, params-np.array([0., dx, 0.]))[2])/dx - hess_fd[2, 2] = (k.kernel_deriv(x, y, params)[2]-k.kernel_deriv(x, y, params-np.array([0., 0., dx]))[2])/dx - + hess_fd[0, 0] = (k.kernel_deriv(x, y, params)[0] - + k.kernel_deriv(x, y, params-np.array([dx, 0., 0.]))[0])/dx + hess_fd[0, 1] = (k.kernel_deriv(x, y, params)[0] - + k.kernel_deriv(x, y, params-np.array([0., dx, 0.]))[0])/dx + hess_fd[1, 0] = (k.kernel_deriv(x, y, params)[1] - + k.kernel_deriv(x, y, params-np.array([dx, 0., 0.]))[1])/dx + hess_fd[1, 1] = (k.kernel_deriv(x, y, params)[1] - + k.kernel_deriv(x, y, params-np.array([0., dx, 0.]))[1])/dx + hess_fd[0, 2] = (k.kernel_deriv(x, y, params)[0] - + k.kernel_deriv(x, y, params-np.array([0., 0., dx]))[0])/dx + hess_fd[2, 0] = (k.kernel_deriv(x, y, params)[2] - + k.kernel_deriv(x, y, params-np.array([dx, 0., 0.]))[2])/dx + hess_fd[1, 2] = (k.kernel_deriv(x, y, params)[1] - + k.kernel_deriv(x, y, params-np.array([0., 0., dx]))[1])/dx + hess_fd[2, 1] = (k.kernel_deriv(x, y, params)[2] - + k.kernel_deriv(x, y, params-np.array([0., dx, 0.]))[2])/dx + hess_fd[2, 2] = (k.kernel_deriv(x, y, params)[2] - + k.kernel_deriv(x, y, params-np.array([0., 0., dx]))[2])/dx + assert_allclose(k.kernel_hessian(x, y, params), hess) assert_allclose(k.kernel_hessian(x, y, params), hess_fd, rtol = 1.e-5) - + x = np.array([[1., 2.], [2., 3.]]) y = np.array([[2., 4.], [3., 1.]]) params = np.array([np.log(2.), np.log(4.), np.log(2.)]) - - D = np.array([[np.sqrt(1.*2.+4.*4.), np.sqrt(4.*2.+1.*4.)], [np.sqrt(1.*4.), np.sqrt(1.*2.+4.*4.)]]) + + D = np.array([[np.sqrt(1.*2.+4.*4.), np.sqrt(4.*2.+1.*4.)], + [np.sqrt(1.*4.), np.sqrt(1.*2.+4.*4.)]]) D1 = np.array([[1., 2.], [0., 1.]]) D2 = np.array([[2., 1.], [1., 2.]]) - + hess = np.zeros((3, 3, 2, 2)) - - hess[0, 0] = 5./3.*2.*np.exp(-np.sqrt(5.)*D)*(5./4.*2.*2.*D1**4-(1.+np.sqrt(5.)*D)*2.*D1**2/2.) + + hess[0, 0] = (5./3.*2.*np.exp(-np.sqrt(5.)*D)* + (5./4.*2.*2.*D1**4 - (1.+np.sqrt(5.)*D)*2.*D1**2/2.)) hess[0, 1] = 5./3.*2.*np.exp(-np.sqrt(5.)*D)*(5./4.*2.*4.*D1**2*D2**2) hess[1, 0] = 5./3.*2.*np.exp(-np.sqrt(5.)*D)*(5./4.*2.*4.*D1**2*D2**2) - hess[1, 1] = 5./3.*2.*np.exp(-np.sqrt(5.)*D)*(5./4.*4.*4.*D2**4-(1.+np.sqrt(5.)*D)*4.*D2**2/2.) 
- hess[0, 2] = -0.5*2.*2.*D1**2*5./3.*(1.+np.sqrt(5.)*D)*np.exp(-np.sqrt(5.)*D) - hess[2, 0] = -0.5*2.*2.*D1**2*5./3.*(1.+np.sqrt(5.)*D)*np.exp(-np.sqrt(5.)*D) - hess[1, 2] = -0.5*2.*4.*D2**2*5./3.*(1.+np.sqrt(5.)*D)*np.exp(-np.sqrt(5.)*D) - hess[2, 1] = -0.5*2.*4.*D2**2*5./3.*(1.+np.sqrt(5.)*D)*np.exp(-np.sqrt(5.)*D) - hess[2, 2] = 2.*(1.+np.sqrt(5.)*D+5./3.*D**2)*np.exp(-np.sqrt(5.)*D) - + hess[1, 1] = (5./3.*2.*np.exp(-np.sqrt(5.)*D)*(5./4.*4.*4.*D2**4 - + (1. + np.sqrt(5.)*D)*4.*D2**2/2.)) + hess[0, 2] = -0.5*2.*2.*D1**2*5./3.*(1. + np.sqrt(5.)*D)*np.exp(-np.sqrt(5.)*D) + hess[2, 0] = -0.5*2.*2.*D1**2*5./3.*(1. + np.sqrt(5.)*D)*np.exp(-np.sqrt(5.)*D) + hess[1, 2] = -0.5*2.*4.*D2**2*5./3.*(1. + np.sqrt(5.)*D)*np.exp(-np.sqrt(5.)*D) + hess[2, 1] = -0.5*2.*4.*D2**2*5./3.*(1. + np.sqrt(5.)*D)*np.exp(-np.sqrt(5.)*D) + hess[2, 2] = 2.*(1. + np.sqrt(5.)*D+5./3.*D**2)*np.exp(-np.sqrt(5.)*D) + hess_fd = np.zeros((3, 3, 2, 2)) - hess_fd[0, 0] = (k.kernel_deriv(x, y, params)[0]-k.kernel_deriv(x, y, params-np.array([dx, 0., 0.]))[0])/dx - hess_fd[0, 1] = (k.kernel_deriv(x, y, params)[0]-k.kernel_deriv(x, y, params-np.array([0., dx, 0.]))[0])/dx - hess_fd[1, 0] = (k.kernel_deriv(x, y, params)[1]-k.kernel_deriv(x, y, params-np.array([dx, 0., 0.]))[1])/dx - hess_fd[1, 1] = (k.kernel_deriv(x, y, params)[1]-k.kernel_deriv(x, y, params-np.array([0., dx, 0.]))[1])/dx - hess_fd[0, 2] = (k.kernel_deriv(x, y, params)[0]-k.kernel_deriv(x, y, params-np.array([0., 0., dx]))[0])/dx - hess_fd[2, 0] = (k.kernel_deriv(x, y, params)[2]-k.kernel_deriv(x, y, params-np.array([dx, 0., 0.]))[2])/dx - hess_fd[1, 2] = (k.kernel_deriv(x, y, params)[1]-k.kernel_deriv(x, y, params-np.array([0., 0., dx]))[1])/dx - hess_fd[2, 1] = (k.kernel_deriv(x, y, params)[2]-k.kernel_deriv(x, y, params-np.array([0., dx, 0.]))[2])/dx - hess_fd[2, 2] = (k.kernel_deriv(x, y, params)[2]-k.kernel_deriv(x, y, params-np.array([0., 0., dx]))[2])/dx - + hess_fd[0, 0] = (k.kernel_deriv(x, y, params)[0] - + k.kernel_deriv(x, y, params-np.array([dx, 0., 0.]))[0])/dx + hess_fd[0, 1] = (k.kernel_deriv(x, y, params)[0] - + k.kernel_deriv(x, y, params-np.array([0., dx, 0.]))[0])/dx + hess_fd[1, 0] = (k.kernel_deriv(x, y, params)[1] - + k.kernel_deriv(x, y, params-np.array([dx, 0., 0.]))[1])/dx + hess_fd[1, 1] = (k.kernel_deriv(x, y, params)[1] - + k.kernel_deriv(x, y, params-np.array([0., dx, 0.]))[1])/dx + hess_fd[0, 2] = (k.kernel_deriv(x, y, params)[0] - + k.kernel_deriv(x, y, params-np.array([0., 0., dx]))[0])/dx + hess_fd[2, 0] = (k.kernel_deriv(x, y, params)[2] - + k.kernel_deriv(x, y, params-np.array([dx, 0., 0.]))[2])/dx + hess_fd[1, 2] = (k.kernel_deriv(x, y, params)[1] - + k.kernel_deriv(x, y, params-np.array([0., 0., dx]))[1])/dx + hess_fd[2, 1] = (k.kernel_deriv(x, y, params)[2] - + k.kernel_deriv(x, y, params-np.array([0., dx, 0.]))[2])/dx + hess_fd[2, 2] = (k.kernel_deriv(x, y, params)[2] - + k.kernel_deriv(x, y, params-np.array([0., 0., dx]))[2])/dx + assert_allclose(k.kernel_hessian(x, y, params), hess) assert_allclose(k.kernel_hessian(x, y, params), hess_fd, rtol = 1.e-5) - + x = np.array([1., 2.]) y = np.array([2., 3.]) params = np.array([0., 0.]) - + D = np.array([[1., 2.], [0., 1.]]) - + hess = np.zeros((2, 2, 2, 2)) hess[0, 0] = 5./3.*np.exp(-np.sqrt(5.)*D)*(5./4.*D**4-(1.+np.sqrt(5.)*D)*D**2/2.) 
hess[0, 1] = -0.5*D**2*5./3.*(1.+np.sqrt(5.)*D)*np.exp(-np.sqrt(5.)*D) hess[1, 0] = -0.5*D**2*5./3.*(1.+np.sqrt(5.)*D)*np.exp(-np.sqrt(5.)*D) hess[1, 1] = (1.+np.sqrt(5.)*D+5./3.*D**2)*np.exp(-np.sqrt(5.)*D) - + hess_fd = np.zeros((2, 2, 2, 2)) - hess_fd[0, 0] = (k.kernel_deriv(x, y, params)[0]-k.kernel_deriv(x, y, params-np.array([dx, 0.]))[0])/dx - hess_fd[0, 1] = (k.kernel_deriv(x, y, params)[0]-k.kernel_deriv(x, y, params-np.array([0., dx]))[0])/dx - hess_fd[1, 0] = (k.kernel_deriv(x, y, params)[1]-k.kernel_deriv(x, y, params-np.array([dx, 0.]))[1])/dx - hess_fd[1, 1] = (k.kernel_deriv(x, y, params)[1]-k.kernel_deriv(x, y, params-np.array([0., dx]))[1])/dx - + hess_fd[0, 0] = (k.kernel_deriv(x, y, params)[0] - + k.kernel_deriv(x, y, params-np.array([dx, 0.]))[0])/dx + hess_fd[0, 1] = (k.kernel_deriv(x, y, params)[0] - + k.kernel_deriv(x, y, params-np.array([0., dx]))[0])/dx + hess_fd[1, 0] = (k.kernel_deriv(x, y, params)[1] - + k.kernel_deriv(x, y, params-np.array([dx, 0.]))[1])/dx + hess_fd[1, 1] = (k.kernel_deriv(x, y, params)[1] - + k.kernel_deriv(x, y, params-np.array([0., dx]))[1])/dx + assert_allclose(k.kernel_hessian(x, y, params), hess) assert_allclose(k.kernel_hessian(x, y, params), hess_fd, rtol = 1.e-5) def test_matern_5_2_hessian_failures(): "test situations where the Matern 5/2 hessian should fail" - + k = Matern52() - + x = np.array([[1.], [2.]]) y = np.array([[2.], [3.]]) params = np.array([0.]) - + with pytest.raises(AssertionError): k.kernel_hessian(x, y, params) - + params = np.array([[0., 0.], [0., 0.]]) - + with pytest.raises(AssertionError): k.kernel_hessian(x, y, params) - + x = np.array([[1.], [2.]]) y = np.array([[2., 4.], [3., 2.]]) params = np.array([0., 0.]) - + with pytest.raises(AssertionError): k.kernel_hessian(x, y, params) - + x = np.array([[1.], [2.]]) y = np.array([[[2.], [4.]], [[3.], [2.]]]) params = np.array([0., 0.]) - + with pytest.raises(AssertionError): k.kernel_hessian(x, y, params) - + x = np.array([[2., 4.], [3., 2.]]) y = np.array([[1.], [2.]]) params = np.array([0., 0.]) - + with pytest.raises(AssertionError): k.kernel_hessian(x, y, params) - + x = np.array([[[2.], [4.]], [[3.], [2.]]]) y = np.array([[1.], [2.]]) params = np.array([0., 0.]) - + with pytest.raises(AssertionError): k.kernel_hessian(x, y, params) - + +def test_matern_5_2_inputderiv(): + "test input derivative method of Matern 5/2 kernel" + + k = Matern52() + + dx = 1.e-6 + + x = np.array([[1.], [2.]]) + y = np.array([[2.], [3.]]) + params = np.array([0., 0.]) + + deriv = np.zeros((1, 2, 2)) + + r = np.array([[1., 2.], [0., 1.]]) + + deriv[0] = (-5./3.*r*(1.+np.sqrt(5.)*r)*np.exp(-np.sqrt(5.)*r) + *np.array([[-1., -1.], [0., -1.]])) + deriv_fd = np.zeros((1, 2, 2)) + deriv_fd[0] = (k.kernel_f(x + dx, y, params) - + k.kernel_f(x - dx, y, params))/dx/2.
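+ # the expected values above use the analytic input derivative of the Matern 5/2
+ # kernel, dK/dx_d = -5/3*sig2*exp(theta_d)*(x_d - y_d)*(1 + sqrt(5)*r)*exp(-sqrt(5)*r),
+ # while deriv_fd estimates the same quantity with a centered finite difference of kernel_f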
+ + assert_allclose(k.kernel_inputderiv(x, y, params), deriv) + assert_allclose(k.kernel_inputderiv(x, y, params), deriv_fd, + rtol = 1.e-5, atol = 1.e-8) + + x = np.array([[1., 2.], [2., 3.]]) + y = np.array([[2., 4.], [3., 1.]]) + params = np.array([0., 0., 0.]) + + deriv = np.zeros((2, 2, 2)) + + r = np.array([[np.sqrt(5.), np.sqrt(5.)], [1., np.sqrt(5.)]]) + + deriv[0] = (-5./3.*(1.+np.sqrt(5.)*r)*np.exp(-np.sqrt(5.)*r)* + np.array([[-1., -2.], [0., -1.]])) + deriv[1] = (-5./3.*(1.+np.sqrt(5.)*r)*np.exp(-np.sqrt(5.)*r)* + np.array([[-2., 1.], [-1., 2.]])) + deriv_fd = np.zeros((2, 2, 2)) + deriv_fd[0] = (k.kernel_f(x + np.array([[dx, 0.], [dx, 0.]]), y, params) - + k.kernel_f(x - np.array([[dx, 0.], [dx, 0.]]), y, params))/dx/2. + deriv_fd[1] = (k.kernel_f(x + np.array([[0., dx], [0., dx]]), y, params) - + k.kernel_f(x - np.array([[0., dx], [0., dx]]), y, params))/dx/2. + + assert_allclose(k.kernel_inputderiv(x, y, params), deriv) + assert_allclose(k.kernel_inputderiv(x, y, params), deriv_fd, rtol = 1.e-5) + + x = np.array([[1., 2.], [2., 3.]]) + y = np.array([[2., 4.], [3., 1.]]) + params = np.array([np.log(2.), np.log(4.), np.log(2.)]) + + deriv = np.zeros((2, 2, 2)) + + r = np.array([[np.sqrt(1.*2.+4.*4.), np.sqrt(4.*2.+1.*4.)], + [np.sqrt(1.*4.), np.sqrt(1.*2.+4.*4.)]]) + + deriv[0] = (-2.*2.*5./3.*(1.+np.sqrt(5.)*r)* + np.exp(-np.sqrt(5.)*r)*np.array([[-1., -2.], [0., -1.]])) + deriv[1] = (-2.*4.*5./3.*(1.+np.sqrt(5.)*r)* + np.exp(-np.sqrt(5.)*r)*np.array([[-2., 1.], [-1., 2.]])) + deriv_fd = np.zeros((2, 2, 2)) + deriv_fd[0] = (k.kernel_f(x + np.array([[dx, 0.], [dx, 0.]]), y, params) - + k.kernel_f(x - np.array([[dx, 0.], [dx, 0.]]), y, params))/dx/2. + deriv_fd[1] = (k.kernel_f(x + np.array([[0., dx], [0., dx]]), y, params) - + k.kernel_f(x - np.array([[0., dx], [0., dx]]), y, params))/dx/2. + + assert_allclose(k.kernel_inputderiv(x, y, params), deriv) + assert_allclose(k.kernel_inputderiv(x, y, params), deriv_fd, rtol = 1.e-5) + + x = np.array([1., 2.]) + y = np.array([2., 3.]) + params = np.array([0., 0.]) + + deriv = np.zeros((1, 2, 2)) + + r = np.array([[1., 2.], [0., 1.]]) + + deriv[0] = (-5./3.*r*(1.+np.sqrt(5.)*r)* + np.exp(-np.sqrt(5.)*r)*np.array([[-1., -1.], [0., -1.]])) + deriv_fd = np.zeros((1, 2, 2)) + deriv_fd[0] = (k.kernel_f(x + dx, y, params) - + k.kernel_f(x - dx, y, params))/dx/2.
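+ # passing 1D arrays treats the data as a single input dimension, so the
+ # expected derivative matches the column-vector case at the start of the test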
+ + assert_allclose(k.kernel_inputderiv(x, y, params), deriv) + assert_allclose(k.kernel_inputderiv(x, y, params), deriv_fd, + rtol = 1.e-5, atol = 1.e-8) + +def test_matern_5_2_inputderiv_failures(): + "test situations where input derivative should fail" + + k = Matern52() + + x = np.array([[1.], [2.]]) + y = np.array([[2.], [3.]]) + params = np.array([0.]) + + with pytest.raises(AssertionError): + k.kernel_inputderiv(x, y, params) + + params = np.array([[0., 0.], [0., 0.]]) + + with pytest.raises(AssertionError): + k.kernel_inputderiv(x, y, params) + + x = np.array([[1.], [2.]]) + y = np.array([[2., 4.], [3., 2.]]) + params = np.array([0., 0.]) + + with pytest.raises(AssertionError): + k.kernel_inputderiv(x, y, params) + + x = np.array([[1.], [2.]]) + y = np.array([[[2.], [4.]], [[3.], [2.]]]) + params = np.array([0., 0.]) + + with pytest.raises(AssertionError): + k.kernel_inputderiv(x, y, params) + + x = np.array([[2., 4.], [3., 2.]]) + y = np.array([[1.], [2.]]) + params = np.array([0., 0.]) + + with pytest.raises(AssertionError): + k.kernel_inputderiv(x, y, params) + + x = np.array([[[2.], [4.]], [[3.], [2.]]]) + y = np.array([[1.], [2.]]) + params = np.array([0., 0.]) + + with pytest.raises(AssertionError): + k.kernel_inputderiv(x, y, params) + def test_Kernel_str(): "test string method of generic Kernel class" - + k = Kernel() - + assert str(k) == "Stationary Kernel" - + def test_SquaredExponential_str(): "test string method of SquaredExponential class" - + k = SquaredExponential() - + assert str(k) == "Squared Exponential Kernel" - + def test_Matern52_str(): "test string method of Matern52 class" - + k = Matern52() - + assert str(k) == "Matern 5/2 Kernel" \ No newline at end of file diff --git a/setup.py b/setup.py index 04e1dff5..20fa4110 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ MAJOR = 0 MINOR = 2 MICRO = 0 -PRERELEASE = 4 +PRERELEASE = 5 ISRELEASED = False version = "{}.{}.{}".format(MAJOR, MINOR, MICRO)