Merge pull request #135 from lbluque/master

More flexible indices from cutoffs in cluster subspace (+ minor fixes)
CederGroupHub · Sep 2, 2021 · 0fdcbad · 0fdcbad
2 parents cea2946 + f930fb7
commit 0fdcbad
Show file tree

Hide file tree

Showing 139 changed files with 3,801 additions and 22,271 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -18,7 +18,7 @@ Use this section to keep track of changes in the works.
   ([lbluque](https://github.com/lbluque))
 * `UniformlyRandomKernel` for high temperature/random limit sampling.
 `ThermalKernel` ABC class for all temperature based MC Kernels.
-  [\#133](https://github.com/CederGroupHub/smol/pull/134)
+  [\#134](https://github.com/CederGroupHub/smol/pull/134)
   ([lbluque](https://github.com/lbluque))
 
 ### Fixed

diff --git a/docs/build/.buildinfo b/docs/build/.buildinfo
@@ -1,4 +1,4 @@
 # Sphinx build info version 1
 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: 16a90369e52b5ddb1fd305bb17ac2d09
+config: 69cc6bb721c10078b22cbedaabcaf519
 tags: 645f666f9bcd5a90fca523b33c5a78b7
diff --git a/docs/build/.doctrees/api.doctree b/docs/build/.doctrees/api.doctree
diff --git a/docs/build/.doctrees/environment.pickle b/docs/build/.doctrees/environment.pickle
diff --git a/docs/build/.doctrees/examples.doctree b/docs/build/.doctrees/examples.doctree
diff --git a/docs/build/.doctrees/notebooks/1-1-creating-a-ce-w-electrostatics.doctree b/docs/build/.doctrees/notebooks/1-1-creating-a-ce-w-electrostatics.doctree
diff --git a/docs/build/.doctrees/notebooks/1-creating-a-ce.doctree b/docs/build/.doctrees/notebooks/1-creating-a-ce.doctree
diff --git a/docs/build/.doctrees/notebooks/2-1-running-semigrand-mc.doctree b/docs/build/.doctrees/notebooks/2-1-running-semigrand-mc.doctree
diff --git a/docs/build/.doctrees/notebooks/2-running-canonical-mc.doctree b/docs/build/.doctrees/notebooks/2-running-canonical-mc.doctree
diff --git a/docs/build/.doctrees/notebooks/3-training-data-preparation.doctree b/docs/build/.doctrees/notebooks/3-training-data-preparation.doctree
diff --git a/docs/build/.doctrees/notebooks/4-adding-structures-in-parallel.doctree b/docs/build/.doctrees/notebooks/4-adding-structures-in-parallel.doctree
diff --git a/docs/build/.doctrees/notebooks/advanced-ce-functionality.doctree b/docs/build/.doctrees/notebooks/advanced-ce-functionality.doctree
diff --git a/docs/build/.doctrees/notebooks/advanced-cluster-expansion-fitting/create_LMTO_CE.doctree b/docs/build/.doctrees/notebooks/advanced-cluster-expansion-fitting/create_LMTO_CE.doctree
diff --git a/...ld/.doctrees/notebooks/advanced-cluster-expansion-fitting/optimization_LMTO_Bayes.doctree b/...ld/.doctrees/notebooks/advanced-cluster-expansion-fitting/optimization_LMTO_Bayes.doctree
diff --git a/...ild/.doctrees/notebooks/advanced-cluster-expansion-fitting/optimization_LMTO_L0L1.doctree b/...ild/.doctrees/notebooks/advanced-cluster-expansion-fitting/optimization_LMTO_L0L1.doctree
diff --git a/docs/build/.doctrees/notebooks/basis-orthogonalization.doctree b/docs/build/.doctrees/notebooks/basis-orthogonalization.doctree
diff --git a/docs/build/.doctrees/notebooks/translating-pyabinitio-work.doctree b/docs/build/.doctrees/notebooks/translating-pyabinitio-work.doctree
diff --git a/docs/build/.doctrees/smol.cofe.doctree b/docs/build/.doctrees/smol.cofe.doctree
diff --git a/docs/build/.doctrees/smol.cofe.space.doctree b/docs/build/.doctrees/smol.cofe.space.doctree
diff --git a/docs/build/_sources/examples.rst.txt b/docs/build/_sources/examples.rst.txt
@@ -20,7 +20,7 @@ Basic Examples
 
 .. _Running Semi-Grand Canonical Monte Carlo: notebooks/2-1-running-semigrand-mc.ipynb
 
-.. _Preparing cluster expansion training data: notebooks/3-training-data-preparations.ipynb
+.. _Preparing cluster expansion training data: notebooks/3-training-data-preparation.ipynb
 
 .. _Adding structures to a StructureWrangler in parallel: notebooks/4-adding-structures-in-parallel.ipynb
 

diff --git a/docs/build/_sources/notebooks/1-1-creating-a-ce-w-electrostatics.ipynb.txt b/docs/build/_sources/notebooks/1-1-creating-a-ce-w-electrostatics.ipynb.txt
@@ -317,13 +317,6 @@
     "# and the expansion have it, there is no need to do so.\n",
     "save_work(file_path, wrangler, expansion)"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {

diff --git a/docs/build/_sources/notebooks/1-creating-a-ce.ipynb.txt b/docs/build/_sources/notebooks/1-creating-a-ce.ipynb.txt
@@ -314,14 +314,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "The predicted energy for a structure with composition Li+5 Ni4+1 Ni3+5 O2-12 is -36.46654400197699 eV/prim.\n",
+      "The predicted energy for a structure with composition Li+2 Ni4+4 Ni3+2 O2-12 is -33.42762309783674 eV/prim.\n",
       "\n",
       "The fitted coefficients are:\n",
       "[-3.44424307e+01  1.52944807e+00  1.52944807e+00 -7.11937730e-02\n",
@@ -376,7 +376,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/tmp/ipykernel_70367/663351370.py:10: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "/tmp/ipykernel_302279/663351370.py:10: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
       "  structure = np.random.choice(wrangler.structures)\n"
      ]
     }
@@ -413,7 +413,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -434,7 +434,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
@@ -455,13 +455,6 @@
     "for name, obj in work.items():\n",
     "    print(f'{name}: {type(obj)}\\n')"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {

diff --git a/docs/build/_sources/notebooks/2-1-running-semigrand-mc.ipynb.txt b/docs/build/_sources/notebooks/2-1-running-semigrand-mc.ipynb.txt
diff --git a/docs/build/_sources/notebooks/2-running-canonical-mc.ipynb.txt b/docs/build/_sources/notebooks/2-running-canonical-mc.ipynb.txt
@@ -112,7 +112,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Sampling information: {'name': 'CanonicalEnsemble', 'kernel': 'Metropolis', 'step': 'swap', 'seed': 16199914525253226741}\n"
+      "Sampling information: {'name': 'CanonicalEnsemble', 'kernel': 'Metropolis', 'step': 'swap', 'seed': 6449828449790598858}\n"
      ]
     }
    ],
@@ -211,7 +211,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Sampling 1 chain(s) at 1500.00 K from a cell with 64 sites: 100%|██████████| 1000000/1000000 [01:50<00:00, 9069.45it/s]\n"
+      "Sampling 1 chain(s) from a cell with 64 sites...: 100%|████████████| 1000000/1000000 [01:43<00:00, 9691.60it/s]\n"
      ]
     }
    ],
@@ -233,9 +233,9 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Fraction of succesfull steps (efficiency) 0.390277\n",
-      "The last step energy is -551.4509466160619 eV\n",
-      "The minimum energy in trajectory is -552.6314360816017 eV\n"
+      "Fraction of succesfull steps (efficiency) 0.388293\n",
+      "The last step energy is -552.04643792069 eV\n",
+      "The minimum energy in trajectory is -552.6314360816021 eV\n"
      ]
     }
    ],
@@ -303,8 +303,8 @@
      "text": [
       "A total of 10000 samples taken.\n",
       "A total of 9000 samples used for production.\n",
-      "The average energy is -551.7860869325897 eV\n",
-      "The energy variance is 0.04600543434696906 eV^2\n"
+      "The average energy is -551.7892375412837 eV\n",
+      "The energy variance is 0.04837940701855058 eV^2\n"
      ]
     }
    ],
@@ -363,7 +363,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.5"
+   "version": "3.9.5"
   }
  },
  "nbformat": 4,

diff --git a/docs/build/_sources/notebooks/3-training-data-preparation.ipynb.txt b/docs/build/_sources/notebooks/3-training-data-preparation.ipynb.txt
@@ -11,24 +11,14 @@
    "cell_type": "code",
    "execution_count": 1,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/home/lbluque/Develop/pymatgen/pymatgen/ext/matproj.py:454: DeprecationWarning: __init__ is deprecated\n",
-      "MaterialsProjectCompatibility will be updated with new correction classes as well as new values of corrections and uncertainties in 2020\n",
-      "  def get_pourbaix_entries(self, chemsys, solid_compat=MaterialsProjectCompatibility()):\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "import numpy as np\n",
     "import json\n",
     "from monty.serialization import loadfn\n",
     "from pymatgen.core.structure import Structure\n",
     "from smol.cofe import ClusterSubspace, StructureWrangler\n",
-    "from smol.cofe.configspace import get_specie"
+    "from smol.cofe.space import get_species"
    ]
   },
   {
@@ -46,6 +36,24 @@
    "execution_count": 2,
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/lbluque/Develop/smol/smol/cofe/wrangling/wrangler.py:631: UserWarning: Unable to match Ni4+6 O2-12 with properties {'total_energy': -188.28833} to supercell_structure. Throwing out.\n",
+      " Error Message: Supercell could not be found from structure\n",
+      "  warnings.warn(\n",
+      "/home/lbluque/Develop/smol/smol/cofe/wrangling/wrangler.py:631: UserWarning: Unable to match Li+2 Ni4+4 Ni3+2 O2-12 with properties {'total_energy': -200.13866} to supercell_structure. Throwing out.\n",
+      " Error Message: Mapping could not be found from structure.\n",
+      "  warnings.warn(\n",
+      "/home/lbluque/Develop/smol/smol/cofe/wrangling/wrangler.py:631: UserWarning: Unable to match Li+2 Ni3+2 Ni4+4 O2-12 with properties {'total_energy': -200.42049} to supercell_structure. Throwing out.\n",
+      " Error Message: Mapping could not be found from structure.\n",
+      "  warnings.warn(\n",
+      "/home/lbluque/Develop/smol/smol/cofe/wrangling/wrangler.py:631: UserWarning: Unable to match Li+3 Ni4+4 Ni2+1 Ni3+1 O2-12 with properties {'total_energy': -206.70884} to supercell_structure. Throwing out.\n",
+      " Error Message: Supercell could not be found from structure\n",
+      "  warnings.warn(\n"
+     ]
+    },
     {
      "name": "stdout",
      "output_type": "stream",
@@ -151,8 +159,8 @@
     "concentration = []\n",
     "for energy, occu in zip(wrangler.get_property_vector('total_energy'),\n",
     "                        wrangler.occupancy_strings):\n",
-    "    n_Li = sum(sp == get_specie('Li+') for sp in occu)\n",
-    "    n_vac = sum(sp == get_specie('Vacancy') for sp in occu)\n",
+    "    n_Li = sum(sp == get_species('Li+') for sp in occu)\n",
+    "    n_vac = sum(sp == get_species('Vacancy') for sp in occu)\n",
     "    c_Li = n_Li/(n_Li + n_vac)\n",
     "    mix_en = energy - c_Li*e_LiNiO2 - (1 - c_Li)*e_Ni2O3\n",
     "    concentration.append(c_Li)\n",
@@ -178,11 +186,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
-    "from smol.cofe import weights_energy_above_hull, weights_energy_above_composition\n",
+    "from smol.cofe.wrangling import weights_energy_above_hull, weights_energy_above_composition\n",
     "\n",
     "above_compostion = weights_energy_above_composition(wrangler.structures,\n",
     "                                                    wrangler.get_property_vector('total_energy', normalize=False),\n",
@@ -211,55 +219,65 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### 4) Filtering structures\n",
-    "The `StructureWrangler` class can also be used to filter structures to use for a fit based on some criteria.\n",
+    "### 4) Structure Selection\n",
+    "The `StructureWrangler` class can also be used to 'filter' structures to use for a fit based on some criteria. To do so, we obtain the indices of all structures that satisfy some filtering criteria.\n",
     "\n",
-    "Currently only a filter by maximum ewald energy is implemented as part of the class.\n",
-    "\n",
-    "Going forward more filtering options can be implemented as people write and use different methods of filtering functions."
+    "For example, here we will obtain all the structures with electrostatic energy below a given cutoff."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "<ipython-input-6-aea799c1d123>:6: DeprecationWarning: the filter_by_ewald method is going to be deprecated.\n",
-      "The functionality will still be available but with a different interface\n",
-      "  wrangler.filter_by_ewald(max_ewald=2)\n"
-     ]
-    },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Kept 26/27 structures with Ewald energies < 2 eV/prim.\n",
-      "The filters are saved as [{'Ewald': {'max_ewald': 2, 'nstructs_removed': 1, 'nstructs_total': 27}}]\n"
+      "Included 26/27 structures with Ewald energies < 2 eV/prim.\n",
+      "Saved indices are ['max_ewald_2']\n"
      ]
     }
    ],
    "source": [
     "# filter by maximum ewald energy\n",
     "# all structures with ewald energy above the cutoff\n",
     "# will be removed\n",
+    "from smol.cofe.wrangling import max_ewald_energy_indices\n",
     "\n",
-    "n_structs_before = wrangler.num_structures\n",
-    "wrangler.filter_by_ewald(max_ewald=2)\n",
+    "# get the structure indices\n",
+    "indices = max_ewald_energy_indices(wrangler,\n",
+    "                                   max_relative_energy=2)\n",
+    "# save them in the structure wrangler\n",
+    "wrangler.add_data_indices('max_ewald_2', indices)\n",
     "\n",
-    "print(f'Kept {wrangler.num_structures}/{n_structs_before} structures with Ewald energies < 2 eV/prim.')\n",
-    "print(f\"The filters are saved as {wrangler.metadata['applied_filters']}\")"
+    "print(f'Included {len(indices)}/{wrangler.num_structures} structures with Ewald energies < 2 eV/prim.')\n",
+    "print(f'Saved indices are {wrangler.available_indices}')"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 18,
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Feature matrix shape: (26, 11)\n",
+      "Property vector shape (26,)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# you can use the indices for selected structures to\n",
+    "# obtain only the corresponding values for those structures\n",
+    "feature_matrix = wrangler.feature_matrix[indices]\n",
+    "prop_vector = wrangler.get_property_vector('total_energy')[indices]\n",
+    "\n",
+    "print(f'Feature matrix shape: {feature_matrix.shape}')\n",
+    "print(f'Property vector shape {prop_vector.shape}')"
+   ]
   }
  ],
  "metadata": {
@@ -278,7 +296,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.5"
+   "version": "3.9.5"
   }
  },
  "nbformat": 4,