Merge pull request #18 from bjmorgan/test
Fixing correctness bug
arm61 authored Nov 23, 2022
2 parents 03a6edc + 9e81ccf commit 8f70cd1
Showing 14 changed files with 278 additions and 256 deletions.
3 changes: 1 addition & 2 deletions .github/workflows/ci.yml
@@ -7,8 +7,7 @@ jobs:
     runs-on: ubuntu-20.04
     strategy:
       matrix:
-        python-version: ['3.7',
-                         '3.8',
+        python-version: ['3.8',
                          '3.9',
                          '3.10']
     steps:
4 changes: 2 additions & 2 deletions .github/workflows/release.yml
@@ -10,10 +10,10 @@ jobs:
     runs-on: ubuntu-18.04
     steps:
       - uses: actions/checkout@master
-      - name: setup python 3.7
+      - name: setup python 3.9
        uses: actions/setup-python@v2
        with:
-          python-version: 3.7
+          python-version: 3.9
      - name: install pypa/build
        run: >-
          python -m
17 changes: 14 additions & 3 deletions docs/source/arrhenius_t.ipynb
@@ -61,7 +61,7 @@
    "source": [
     "To read these simulations we will use [MDAnalysis](https://userguide.mdanalysis.org/stable/index.html) (however, it is also possible to use data from a [VASP simulation](./vasp_d.html)).\n",
     "The parser, bootstrap, and diffusion parameters are all defined for all simulations; here we only consider the diffusive regime to begin after 5 ps.\n",
-    "Additionally, we include in the `p_params` a `sub_sample_atoms` key, this defines the sampling frequency of atoms to be used in the analysis.\n",
+    "Additionally, we include in the `p_params` a `sub_sample_atoms` key, which defines the sampling frequency of atoms to be used in the analysis, and a `sub_sample_traj` key, which defines the sampling frequency for the trajectory.\n",
     "This facility can be particularly useful for large simulations where `kinisi` might encounter issues related to out-of-memory problems. "
    ]
   },
@@ -77,6 +77,7 @@
     "    'step_skip': 100,\n",
     "    'min_dt': 0.001,\n",
     "    'sub_sample_atoms': 4,\n",
+    "    'sub_sample_traj': 2,\n",
     "    'progress': False}\n",
     "b_params = {'progress': False}\n",
     "d_params = {'dt_skip': 10, \n",
@@ -113,6 +114,16 @@
     "The list of diffusion coefficient objects (which are `uravu.distribution.Distribution` type objects) and array of temperatures can then be passed to the `kinisi.arrhenius.StandardArrhenius` class. "
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a7ea3985-ffcf-45c1-a6ca-b870c31740d6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "len(D), len(temperatures)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -215,7 +226,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "9d87d5b9-24ef-4635-85eb-5d41678b53a9",
+   "id": "3fea99f1-ffcc-4a08-8f48-1e155a921bb6",
    "metadata": {},
    "outputs": [],
    "source": []
@@ -237,7 +248,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.13"
+   "version": "3.9.7"
   }
  },
 "nbformat": 4,
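Taken together, the notebook now sub-samples both atoms and trajectory frames before fitting, and adds a cell checking that one diffusion coefficient was collected per temperature. A minimal sketch of how these pieces fit together; the file names, temperature values, and the exact `DiffusionAnalyzer.from_universe` and `StandardArrhenius` signatures are assumptions for illustration, not the notebook's verbatim content:

# Hedged sketch: file names, temperature values, and exact call signatures
# are assumptions; consult the kinisi documentation for the current API.
import MDAnalysis as mda
import numpy as np
from kinisi.analyze import DiffusionAnalyzer
from kinisi.arrhenius import StandardArrhenius

p_params = {'specie': 'Li',          # assumed mobile species
            'time_step': 0.005,      # assumed simulation timestep, in ps
            'step_skip': 100,
            'min_dt': 0.001,
            'sub_sample_atoms': 4,   # analyse every 4th atom of the specie
            'sub_sample_traj': 2,    # analyse every 2nd stored frame
            'progress': False}
b_params = {'progress': False}
d_params = {'dt_skip': 10, 'progress': False}

temperatures = np.array([500, 600, 700, 800])  # K, placeholder values
D = []
for T in temperatures:
    u = mda.Universe(f'run_{T}K.data', f'run_{T}K.dcd')  # placeholder files
    analyzer = DiffusionAnalyzer.from_universe(u, parser_params=p_params,
                                               bootstrap_params=b_params)
    analyzer.diffusion(**d_params)
    D.append(analyzer.D)  # a uravu.distribution.Distribution

len(D), len(temperatures)  # the sanity check added in this commit
arrhenius = StandardArrhenius(temperatures, D)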
13 changes: 2 additions & 11 deletions docs/source/vasp_d.ipynb
@@ -55,16 +55,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "b_params = {'dimension': 'xy'}"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from kinisi.parser import PymatgenParser"
+    "b_params = {'dimension': 'xyz'}"
    ]
   },
   {
@@ -355,7 +346,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.13"
+   "version": "3.9.7"
   }
  },
 "nbformat": 4,
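The substantive change above is `'dimension': 'xy'` becoming `'xyz'`, restoring the full three-dimensional analysis (an unused `PymatgenParser` import cell is also dropped). A short sketch of what the `dimension` key does conceptually; the mapping below is an assumed stand-in for the `DIMENSIONALITY` lookup in `kinisi/diffusion.py`, not its exact contents:

# Assumed illustration of dimension-restricted mean-squared displacement.
import numpy as np

DIMENSIONALITY = {'x': [0], 'y': [1], 'z': [2],
                  'xy': [0, 1], 'xz': [0, 2], 'yz': [1, 2],
                  'xyz': [0, 1, 2]}

disp = np.random.randn(128, 50, 3)         # (atoms, observations, xyz)
axes = DIMENSIONALITY['xyz']
disp_slice = disp[:, :, axes]              # keep only the chosen axes
d_squared = np.sum(disp_slice**2, axis=2)  # squared displacement per observation
msd = d_squared.mean()
# The dimensionality also enters the Einstein normalisation, D ~ msd / (2 * len(axes) * t),
# so restricting to 'xy' changes both the displacement sum and the denominator.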
2 changes: 1 addition & 1 deletion kinisi/__init__.py
@@ -1,4 +1,4 @@
 MAJOR = 0
-MINOR = 4
+MINOR = 5
 MICRO = 0
 __version__ = f'{MAJOR:d}.{MINOR:d}.{MICRO:d}'
2 changes: 1 addition & 1 deletion kinisi/analyzer.py
@@ -240,7 +240,7 @@ def _stack_trajectories(u: Union[MDAnalysisParser, PymatgenParser]) -> List[np.ndarray]:
                 disp[u[0].disp_3d[i].shape[0] * j:u[0].disp_3d[i].shape[0] * (j + 1)] = u[j].disp_3d[i]
             joint_disp_3d.append(disp)
         return joint_disp_3d

     @property
     def distribution(self) -> np.ndarray:
         """
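For orientation: `_stack_trajectories` merges the `disp_3d` arrays of several independent simulations into one joint dataset, timestep by timestep, with the slice assignment above filling equal-sized blocks along the atom axis. A simplified sketch of the same stacking, assuming every parser contributes identically shaped arrays per timestep:

# Simplified sketch of the block-filling performed by _stack_trajectories;
# assumes each parser holds arrays of the same shape at every timestep.
import numpy as np
from typing import List

def stack_trajectories(parsers) -> List[np.ndarray]:
    joint_disp_3d = []
    for i in range(len(parsers[0].disp_3d)):
        # Equivalent to preallocating and filling blocks along the atom axis.
        joint_disp_3d.append(np.concatenate([p.disp_3d[i] for p in parsers], axis=0))
    return joint_disp_3d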
96 changes: 50 additions & 46 deletions kinisi/diffusion.py
@@ -8,9 +8,10 @@
 # author: Andrew R. McCluskey (arm61)

 import warnings
-from typing import List, Union
+from typing import List, Tuple, Union
 import numpy as np
-from scipy.stats import multivariate_normal, normaltest, linregress
+from scipy.stats import normaltest, linregress
+from scipy.linalg import pinvh
 from scipy.optimize import minimize, curve_fit
 import scipy.constants as const
 import tqdm
@@ -49,11 +50,11 @@ def __init__(self, delta_t: np.ndarray, disp_3d: List[np.ndarray], sub_sample_dt
         self._max_obs = self._displacements[0].shape[1]
         self._distributions = []
         self._dt = np.array([])
-        self._iterator = self.iterator(progress, range(len(self._displacements)))
         self._n = np.array([])
         self._s = np.array([])
         self._v = np.array([])
-        self._n_i = np.array([], dtype=int)
+        self._n_o = np.array([], dtype=int)
         self._ngp = np.array([])
         self._euclidian_displacements = []
         self._diffusion_coefficient = None
@@ -265,7 +266,7 @@ def bootstrap_GLS(self,
                       n_samples: int = 1000,
                       n_walkers: int = 32,
                       n_burn: int = 500,
-                      thin: int = 1,
+                      thin: int = 10,
                       progress: bool = True,
                       random_state: np.random.mtrand.RandomState = None):
         """
@@ -283,17 +284,11 @@
         :param n_walkers: Number of MCMC walkers to use. Optional, default is :py:attr:`32`.
         :param n_burn: Number of burn in samples (these allow the sampling to settle). Optional, default
             is :py:attr:`500`.
-        :param rtol: The relative threshold term for the covariance matrix inversion. If you obtain a very unusual
-            value for the diffusion coefficient, it is recommended to increase this value (ideally iteratively).
-            Optional, default is :code:`N * eps`, where :code:`eps` is the machine precision value of the covariance
-            matrix content.
         :param thin: Use only every :py:attr:`thin` samples for the MCMC sampler. Optional, default is :py:attr:`10`.
         :param progress: Show tqdm progress for sampling. Optional, default is :py:attr:`True`.
         :param random_state: A :py:attr:`RandomState` object to be used to ensure reproducibility. Optional,
             default is :py:attr:`None`.
         """
-        if random_state is not None:
-            np.random.seed(random_state.get_state()[1][1])
         max_ngp = np.argwhere(self._dt > dt_skip)[0][0]
         if use_ngp:
             max_ngp = np.argmax(self._ngp)
@@ -307,15 +302,17 @@ def model_variance(dt: np.ndarray, a: float) -> np.ndarray:
             :param a: Quadratic coefficient
             :return: Model variances
             """
-            return a / self._n_i[max_ngp:] * dt**2
-
-        popt, _ = curve_fit(model_variance, self.dt[max_ngp:], self._v[max_ngp:])
-        model_v = model_variance(self.dt[max_ngp:], *popt)
-        self._covariance_matrix = self.populate_covariance_matrix(model_v, self._n_i[max_ngp:])
+            return a / self._n_o[max_ngp:] * dt**2
+
+        self._popt, _ = curve_fit(model_variance, self.dt[max_ngp:], self._v[max_ngp:])
+        self._model_v = model_variance(self.dt[max_ngp:], *self._popt)
+        self._covariance_matrix = _populate_covariance_matrix(self._model_v, self._n_o[max_ngp:])
         self._npd_covariance_matrix = self._covariance_matrix
         self._covariance_matrix = find_nearest_positive_definite(self._covariance_matrix)

-        mv = multivariate_normal(self._n[max_ngp:], self._covariance_matrix, allow_singular=True)
+        _, logdet = np.linalg.slogdet(self._covariance_matrix)
+        logdet += np.log(2 * np.pi) * self._n[max_ngp:].size
+        inv = pinvh(self._covariance_matrix)
def log_likelihood(theta: np.ndarray) -> float:
"""
Expand All @@ -326,7 +323,9 @@ def log_likelihood(theta: np.ndarray) -> float:
if theta[0] < 0:
return -np.inf
model = _straight_line(self._dt[max_ngp:], *theta)
return mv.logpdf(model)
diff = (model - self._n[max_ngp:])
logl = -0.5 * (logdet + np.matmul(diff.T, np.matmul(inv, diff)))
return logl

ols = linregress(self._dt[max_ngp:], self._n[max_ngp:])
slope = ols.slope
@@ -353,31 +352,11 @@ def nll(*args) -> float:
         # sampler._random = random_state
         sampler.run_mcmc(pos, n_samples + n_burn, progress=progress, progress_kwargs={'desc': "Likelihood Sampling"})
         self.flatchain = sampler.get_chain(flat=True, thin=thin, discard=n_burn)

         self.gradient = Distribution(self.flatchain[:, 0])
         self._intercept = None
         if fit_intercept:
             self._intercept = Distribution(self.flatchain[:, 1])

-    @staticmethod
-    def populate_covariance_matrix(variances: np.ndarray, n_samples: np.ndarray) -> np.ndarray:
-        """
-        Populate the covariance matrix for the generalised least squares methodology.
-        :param variances: The variances for each timestep
-        :param n_samples: Number of independent trajectories for each timestep
-        :return: An estimated covariance matrix for the system
-        """
-        covariance_matrix = np.zeros((variances.size, variances.size))
-        for i in range(0, variances.size):
-            for j in range(i, variances.size):
-                ratio = n_samples[i] / n_samples[j]
-                value = ratio * variances[i]
-                covariance_matrix[i, j] = value
-                covariance_matrix[j, i] = np.copy(covariance_matrix[i, j])
-        return covariance_matrix

     def diffusion(self, **kwargs):
         """
         Use the bootstrap-GLS method to determine the diffusivity for the system. Keyword arguments will be
@@ -401,8 +380,8 @@ def jump_diffusion(self, **kwargs):
         will be passed to the :py:func:`bootstrap_GLS` method.
         """
         self.bootstrap_GLS(**kwargs)
-        self._jump_diffusion_coefficient = Distribution(self.gradient.samples /
-                                                        (2e4 * self.dims * self._displacements[0].shape[0]))
+        self._jump_diffusion_coefficient = Distribution(
+            self.gradient.samples / (2e4 * self.dims * self._displacements[0].shape[0]))

     @property
     def D_J(self) -> Union[Distribution, None]:
@@ -470,15 +449,17 @@ def __init__(self,
                  random_state: np.random.mtrand.RandomState = None,
                  progress: bool = True):
         super().__init__(delta_t, disp_3d, sub_sample_dt, progress)
+        self._iterator = self.iterator(progress, range(len(self._displacements)))
         slice = DIMENSIONALITY[dimension.lower()]
         self.dims = len(dimension.lower())
+        timesteps = (self._delta_t / np.diff(self._delta_t)[0]).astype(int)
         for i in self._iterator:
             disp_slice = self._displacements[i][:, :, slice].reshape(self._displacements[i].shape[0],
                                                                      self._displacements[i].shape[1], self.dims)
             d_squared = np.sum(disp_slice**2, axis=2)
             if d_squared.size <= 1:
                 continue
-            self._n_i = np.append(self._n_i, d_squared.size)
+            self._n_o = np.append(self._n_o, d_squared.size)
             self._euclidian_displacements.append(Distribution(np.sqrt(d_squared.flatten())))
             distro = self.sample_until_normal(d_squared, d_squared.size, n_resamples, max_resamples, alpha,
                                               random_state)
@@ -527,21 +508,23 @@ def __init__(self,
                  random_state: np.random.mtrand.RandomState = None,
                  progress: bool = True):
         super().__init__(delta_t, disp_3d, sub_sample_dt, progress)
+        self._iterator = self.iterator(progress, range(int(len(self._displacements) / 2)))
         slice = DIMENSIONALITY[dimension.lower()]
         self.dims = len(dimension.lower())
+        timesteps = (self._delta_t / np.diff(self._delta_t)[0]).astype(int)
         for i in self._iterator:
             disp_slice = self._displacements[i][:, :, slice].reshape(self._displacements[i].shape[0],
                                                                      self._displacements[i].shape[1], self.dims)
             d_squared = np.sum(disp_slice**2, axis=2)
             coll_motion = np.sum(np.sum(disp_slice, axis=0)**2, axis=-1)
             if coll_motion.size <= 1:
                 continue
-            self._n_i = np.append(self._n_i, coll_motion.size)
+            self._n_o = np.append(self._n_o, coll_motion.size)
             self._euclidian_displacements.append(Distribution(np.sqrt(d_squared.flatten())))
             distro = self.sample_until_normal(coll_motion, coll_motion.size, n_resamples, max_resamples, alpha,
                                               random_state)
             self._distributions.append(distro)
-            self._n = np.append(self._n, coll_motion.mean())
+            self._n = np.append(self._n, distro.n)
             self._s = np.append(self._s, np.std(distro.samples, ddof=1))
             self._v = np.append(self._v, np.var(distro.samples, ddof=1))
             self._ngp = np.append(self._ngp, self.ngp_calculation(d_squared.flatten()))
Expand Down Expand Up @@ -587,25 +570,27 @@ def __init__(self,
random_state: np.random.mtrand.RandomState = None,
progress: bool = True):
super().__init__(delta_t, disp_3d, sub_sample_dt, progress)
self._iterator = self.iterator(progress, range(int(len(self._displacements) / 2)))
try:
_ = len(ionic_charge)
except TypeError:
ionic_charge = np.ones(self._displacements[0].shape[0]) * ionic_charge
slice = DIMENSIONALITY[dimension.lower()]
self.dims = len(dimension.lower())
timesteps = (self._delta_t / np.diff(self._delta_t)[0]).astype(int)
for i in self._iterator:
disp_slice = self._displacements[i][:, :, slice].reshape(self._displacements[i].shape[0],
self._displacements[i].shape[1], self.dims)
d_squared = np.sum(disp_slice**2, axis=2)
sq_chg_motion = np.sum(np.sum((ionic_charge * self._displacements[i].T).T, axis=0)**2, axis=-1)
if sq_chg_motion.size <= 1:
continue
self._n_i = np.append(self._n_i, sq_chg_motion.size)
self._n_o = np.append(self._n_o, sq_chg_motion.size)
self._euclidian_displacements.append(Distribution(np.sqrt(d_squared.flatten())))
distro = self.sample_until_normal(sq_chg_motion, sq_chg_motion.size, n_resamples, max_resamples, alpha,
random_state)
self._distributions.append(distro)
self._n = np.append(self._n, sq_chg_motion.mean())
self._n = np.append(self._n, distro.n)
self._s = np.append(self._s, np.std(distro.samples, ddof=1))
self._v = np.append(self._v, np.var(distro.samples, ddof=1))
self._ngp = np.append(self._ngp, self.ngp_calculation(d_squared.flatten()))
@@ -626,10 +611,29 @@ def _bootstrap(array: np.ndarray, n_samples: int, n_resamples: int, random_state
     :return: Resampled values from the array
     """
     return [
-        np.mean(resample(array.flatten(), n_samples=n_samples, random_state=random_state)) for j in range(n_resamples)
+        np.mean(resample(array.flatten(), n_samples=n_samples, random_state=random_state).flatten()) for j in range(n_resamples)
     ]


+def _populate_covariance_matrix(variances: np.ndarray, n_samples: np.ndarray) -> np.ndarray:
+    """
+    Populate the covariance matrix for the generalised least squares methodology.
+    :param variances: The variances for each timestep
+    :param n_samples: Number of independent trajectories for each timestep
+    :return: An estimated covariance matrix for the system
+    """
+    covariance_matrix = np.zeros((variances.size, variances.size))
+    for i in range(0, variances.size):
+        for j in range(i, variances.size):
+            ratio = n_samples[i] / n_samples[j]
+            value = ratio * variances[i]
+            covariance_matrix[i, j] = value
+            covariance_matrix[j, i] = np.copy(covariance_matrix[i, j])
+    return covariance_matrix
+
+
 def _straight_line(abscissa: np.ndarray, gradient: float, intercept: float = 0.0) -> np.ndarray:
     """
     A one dimensional straight line function.
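Two changes above deserve unpacking: `populate_covariance_matrix` becomes the module-level helper `_populate_covariance_matrix`, and the `scipy.stats.multivariate_normal.logpdf` call is replaced by an explicit Gaussian log-likelihood assembled from `scipy.linalg.pinvh` (a pseudo-inverse that tolerates the near-singular covariance matrices this model produces) and `np.linalg.slogdet`. A self-contained check, on synthetic well-conditioned data, that the explicit form agrees with scipy's reference implementation:

# Synthetic check of the explicit Gaussian log-likelihood used in bootstrap_GLS;
# the covariance here is made deliberately well-conditioned so both routes agree.
import numpy as np
from scipy.linalg import pinvh
from scipy.stats import multivariate_normal

rng = np.random.default_rng(1)
n = 5
A = rng.normal(size=(n, n))
cov = A @ A.T + n * np.eye(n)          # symmetric positive definite by construction
mean = rng.normal(size=n)
x = mean + rng.normal(scale=0.1, size=n)

_, logdet = np.linalg.slogdet(cov)     # log-determinant term
logdet += np.log(2 * np.pi) * n        # the N * log(2 * pi) constant accumulated above
diff = x - mean
logl = -0.5 * (logdet + diff @ pinvh(cov) @ diff)

assert np.isclose(logl, multivariate_normal(mean, cov).logpdf(x))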
16 changes: 9 additions & 7 deletions kinisi/parser.py
@@ -60,9 +60,9 @@ def __init__(self,
         if n_steps is None:
             nsteps = drift_corrected.shape[1]

-        timesteps = self.get_timesteps(nsteps)
+        self.timesteps = self.get_timesteps(nsteps)

-        self.delta_t, self.disp_3d = self.get_disps(timesteps, drift_corrected, progress)
+        self.delta_t, self.disp_3d = self.get_disps(self.timesteps, drift_corrected, progress)

     @property
     def volume(self) -> float:
@@ -124,7 +124,7 @@ def get_timesteps(self, nsteps: int) -> np.ndarray:
             min_dt = 1
         if min_dt >= nsteps:
             raise ValueError('min_dt is greater than or equal to the maximum simulation length.')
-        timesteps = np.arange(min_dt, nsteps, 1)
+        timesteps = np.arange(min_dt, nsteps + 1, 1, dtype=int)
         return timesteps
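The `get_timesteps` change is a one-line off-by-one fix: the longest window, spanning the whole simulation, was previously excluded from the `dt` grid. A tiny illustration (the interpretation of the endpoint is my reading of the diff):

# Illustration of the np.arange endpoint fix in get_timesteps.
import numpy as np

nsteps, min_dt = 5, 1
old = np.arange(min_dt, nsteps, 1)                 # [1 2 3 4]   - longest dt missing
new = np.arange(min_dt, nsteps + 1, 1, dtype=int)  # [1 2 3 4 5] - full-length window included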
@@ -148,17 +148,19 @@ def get_disps(self,
         iterator = timesteps
         disp_mem = 0
         for i, timestep in enumerate(iterator):
-            disp_mem += np.product(drift_corrected[self.indices, i + 1::i + 1].shape) * 8
+            disp_mem += np.product(drift_corrected[self.indices, timestep::timestep].shape) * 8
         disp_mem *= 1e-9
         if disp_mem > self.memory_limit:
             raise MemoryError(f"The memory limit of this job is {self.memory_limit:.1e} GB but the "
                               f"displacement values will use {disp_mem:.1e} GB. Please either increase "
                               "the memory_limit parameter or decrease the sampling rate (see "
                               "https://kinisi.readthedocs.io/en/latest/memory_limit.html).")
         for i, timestep in enumerate(iterator):
-            disp = np.subtract(drift_corrected[self.indices, timestep::timestep],
-                               drift_corrected[self.indices, :-timestep:timestep])
-            disp_3d.append(disp)
+            disp = np.concatenate([drift_corrected[self.indices, np.newaxis, timestep - 1],
+                                   np.subtract(drift_corrected[self.indices, timestep:],
+                                               drift_corrected[self.indices, :-timestep])],
+                                  axis=1)
+            disp_3d.append(disp[:, ::timestep])
         return delta_t, disp_3d
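The final hunk is the correctness bug of the commit title. The old code built non-overlapping displacements as `x[t::t] - x[:-t:t]`, which silently drops the first window (the displacement accumulated up to frame `t - 1`), and the memory estimate indexed with `i + 1` instead of `timestep`. The new code prepends that first window before differencing and then takes every `timestep`-th observation. A 1D toy comparison; the convention that `x[k]` holds the drift-corrected displacement after `k + 1` stored frames is my assumption from the diff:

# Toy 1D illustration of the get_disps indexing fix (assumed conventions).
import numpy as np

x = np.arange(1.0, 13.0)   # 12 frames; the particle moves +1 per frame
timestep = 3

# Old behaviour: non-overlapping windows, but the first window is lost.
old = x[timestep::timestep] - x[:-timestep:timestep]
print(old)                 # [3. 3. 3.]    -> 3 observations

# New behaviour: prepend the first window, difference, then subsample.
disp = np.concatenate([x[timestep - 1:timestep],   # the previously missing window
                       x[timestep:] - x[:-timestep]])
new = disp[::timestep]
print(new)                 # [3. 3. 3. 3.] -> the first window is recovered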