From 20080b5d9621f05ad558e24b92d923b994bdf3e1 Mon Sep 17 00:00:00 2001
From: Janosh Riebesell
Date: Mon, 24 Oct 2022 16:44:41 -0700
Subject: [PATCH] fix WBM index mismatch between CSEs and summary

---
 data/wbm/fetch_process_wbm_dataset.py | 263 ++++++++++++++------------
 tests/test_init.py                    |   6 +-
 2 files changed, 142 insertions(+), 127 deletions(-)

diff --git a/data/wbm/fetch_process_wbm_dataset.py b/data/wbm/fetch_process_wbm_dataset.py
index 08319899..8d6232f5 100644
--- a/data/wbm/fetch_process_wbm_dataset.py
+++ b/data/wbm/fetch_process_wbm_dataset.py
@@ -40,6 +40,7 @@
 module_dir = os.path.dirname(__file__)
 today = f"{datetime.now():%Y-%m-%d}"
 
+warnings.filterwarnings("ignore", category=UserWarning, module="pymatgen")
 
 
 # %% links to google drive files received via email from 1st author Hai-Chen Wang
@@ -58,7 +59,7 @@
     file_path = f"{module_dir}/raw/wbm-structures-step-{step}.json.bz2"
 
     if os.path.exists(file_path):
-        print(f"{step=}: file exists, skipping {file_path=}")
+        print(f"{file_path} already exists, skipping")
         continue
     print(f"{step=}")
@@ -66,16 +67,21 @@
 
 
 # %%
-summary_id_file = "1639IFUG7poaDE2uB6aISUOi65ooBwCIg"
 summary_path = f"{module_dir}/raw/wbm-summary.txt"
-gdown.download(f"https://drive.google.com/u/0/uc?id={summary_id_file}", summary_path)
+
+if not os.path.exists(summary_path):
+    summary_id_file = "1639IFUG7poaDE2uB6aISUOi65ooBwCIg"
+    summary_url = f"https://drive.google.com/u/0/uc?id={summary_id_file}"
+    gdown.download(summary_url, summary_path)
 
 
 # %%
 json_paths = sorted(glob(f"{module_dir}/raw/wbm-structures-step-*.json.bz2"))
 step_lens = (61848, 52800, 79205, 40328, 23308)
 # step 3 has 79,211 structures but only 79,205 ComputedStructureEntries
-# the 6 extra structures have missing energy, volume, etc. in the summary file
+# i.e. 6 extra structures with missing energy, volume, etc. in the summary file
+bad_struct_ids = (70802, 70803, 70825, 70826, 70828, 70829)
+
 assert len(json_paths) == len(step_lens), "Mismatch in WBM steps and JSON files"
 
 wbm_struct_json_checksums = (
@@ -105,24 +111,38 @@
     assert checksum == wbm_struct_json_checksums[step - 1], "bad JSON file checksum"
 
     if step == 3:
-        # step 3 has 6 extra entries, see comment above
-        additional_ids = (70802, 70803, 70825, 70826, 70828, 70829)
-        df = df.drop(index=[f"step_3_{id}" for id in additional_ids])
+        df = df.drop(index=[f"step_3_{id}" for id in bad_struct_ids])
+        # re-index after dropping bad structures to get same indices as summary file
+        # where IDs are consecutive, i.e. step_3_70801 is followed by step_3_70802,
+        # not step_3_70804, etc.
+        df.index = [f"step_3_{idx + 1}" for idx in range(len(df))]
 
     step_len = step_lens[step - 1]
     assert len(df) == step_len, f"bad len for {step=}: {len(df)} != {step_len}"
     dfs_wbm_structs[step] = df
 
+# NOTE step 5 is missing 2 initial structures
+assert dict(dfs_wbm_structs[5].isna().sum()) == {"opt": 0, "org": 2}
+assert list(dfs_wbm_structs[5].query("org.isna()").index) == [
+    "step_5_23165",
+    "step_5_23293",
+]
+
 
 # %%
-df_all_steps = pd.concat(dfs_wbm_structs.values())
+df_wbm = pd.concat(dfs_wbm_structs.values())
 
-assert len(df_all_steps) == sum(step_lens)
+assert len(df_wbm) == sum(step_lens)
 
 
 def increment_wbm_material_id(wbm_id: str) -> str:
     """Maps step_1_0, step_1_1, ... onto wbm-step-1-1, wbm-step-1-2, ..."""
-    prefix, step_num, material_num = wbm_id.split("_")
+    try:
+        prefix, step_num, material_num = wbm_id.split("_")
+    except ValueError:
+        print(f"bad {wbm_id=}")
+        return wbm_id
 
     assert prefix == "step"
     assert step_num.isdigit() and material_num.isdigit()
@@ -130,18 +150,14 @@ def increment_wbm_material_id(wbm_id: str) -> str:
     return f"wbm-step-{step_num}-{int(material_num) + 1}"
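 
+# illustrative spot-check of the mapping (a minimal sanity cell; the two example
+# IDs are assumed from the docstring plus the first/last index asserts below)
+assert increment_wbm_material_id("step_1_0") == "wbm-step-1-1"
+assert increment_wbm_material_id("step_5_23307") == "wbm-step-5-23308"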
 
 
-df_all_steps.index = df_all_steps.index.map(increment_wbm_material_id)
-df_all_steps.index.name = "material_id"
-assert df_all_steps.index[0] == "wbm-step-1-1"
-assert df_all_steps.index[-1] == "wbm-step-5-23308"
+df_wbm.index = df_wbm.index.map(increment_wbm_material_id)
+df_wbm.index.name = "material_id"
+assert df_wbm.index[0] == "wbm-step-1-1"
+assert df_wbm.index[-1] == "wbm-step-5-23308"
 
-df_all_steps["initial_structure"] = df_all_steps.pop("org")
-df_all_steps["final_structure"] = df_all_steps.pop("opt")
-assert df_all_steps.columns == ["initial_structure", "final_structure"]
-
-# df_all_steps = pd.read_json(
-#     f"{module_dir}/2022-10-18-wbm-init+final-structures.json.bz2"
-# )
+df_wbm["initial_structure"] = df_wbm.pop("org")
+df_wbm["final_structure"] = df_wbm.pop("opt")
+assert list(df_wbm.columns) == ["initial_structure", "final_structure"]
 
 
 # %% download WBM ComputedStructureEntries from
@@ -156,6 +172,7 @@ def increment_wbm_material_id(wbm_id: str) -> str:
 ):
     file_path = f"{module_dir}/raw/wbm-cse-{filename.lower().replace('_', '-')}"
     if os.path.exists(file_path):
+        print(f"{file_path} already exists, skipping")
         continue
 
     urllib.request.urlretrieve(f"{mat_cloud_url}&{filename=}", file_path)
@@ -177,8 +194,10 @@ def increment_wbm_material_id(wbm_id: str) -> str:
 """
 
 dfs_wbm_cses = {}
-for json_path in tqdm(cse_step_paths):
+for json_path in cse_step_paths:
     step = int(json_path.split(".json.bz2")[0][-1])
+    print(f"{step=}")
+    assert step in range(1, 6)
 
     if step in dfs_wbm_cses:
         print(f"{json_path=} already loaded.")
@@ -192,57 +211,49 @@ def increment_wbm_material_id(wbm_id: str) -> str:
 
 
 # %%
-df_all_steps["computed_structure_entry"] = pd.concat(dfs_wbm_cses.values()).to_numpy()
+df_wbm["computed_structure_entry"] = pd.concat(dfs_wbm_cses.values()).to_numpy()
 
-for cse in tqdm(df_all_steps.computed_structure_entry):
+for mat_id, cse in df_wbm.computed_structure_entry.items():
     # needed to ensure MaterialsProjectCompatibility can process the entries
     cse["parameters"]["run_type"] = (
         "GGA+U" if cse["parameters"]["is_hubbard"] else "GGA"
     )
-
+    cse["entry_id"] = mat_id
+    assert cse["entry_id"].startswith("wbm-step-")
 
 assert pd.Series(
-    cse["parameters"]["run_type"] for cse in tqdm(df_all_steps.computed_structure_entry)
+    cse["parameters"]["run_type"] for cse in tqdm(df_wbm.computed_structure_entry)
 ).value_counts().to_dict() == {"GGA": 248481, "GGA+U": 9008}
 
 
-# %%
-# get composition from CSEs
-
-df_all_steps["composition_from_cse"] = [
+# %% get composition from CSEs
+df_wbm["composition_from_cse"] = [
     ComputedStructureEntry.from_dict(cse).composition
-    for cse in tqdm(df_all_steps.computed_structure_entry)
+    for cse in tqdm(df_wbm.computed_structure_entry)
 ]
 
-df_all_steps["composition_from_relaxed_struct"] = [
-    Structure.from_dict(struct).composition
-    for struct in tqdm(df_all_steps.final_structure)
+df_wbm["composition_from_final_struct"] = [
+    Structure.from_dict(struct).composition for struct in tqdm(df_wbm.final_structure)
 ]
 
-# all but 1 composition matches between CSE and relaxed structure
-# mismatching ID: wbm-step-1-37977 which becomes equal on reduction:
-# CSE Comp: Ag4 Bi4 O12
-# relaxed Comp: Ag16 Bi16 O48
-
-
-df_unmatched = df_all_steps.query(
-    "composition_from_cse != composition_from_relaxed_struct"
-)
-assert len(df_unmatched) == 1
-assert df_unmatched.index[0] == "wbm-step-1-37977"
+# all but 1 composition matches between CSE and final structure; the only
+# mismatching ID is wbm-step-1-37977, whose compositions become equal on reduction:
+# CSE Comp: Ag4 Bi4 O12
+# final structure Comp: Ag16 Bi16 O48
+df_mismatch = df_wbm.query("composition_from_cse != composition_from_final_struct")
+assert len(df_mismatch) == 1
+assert df_mismatch.index[0] == "wbm-step-1-37977"
 assert (
-    df_unmatched.iloc[0].composition_from_cse.reduced_composition
-    == df_unmatched.iloc[0].composition_from_relaxed_struct.reduced_composition
+    df_mismatch.iloc[0].composition_from_cse.reduced_composition
+    == df_mismatch.iloc[0].composition_from_final_struct.reduced_composition
 )
 
-df_all_steps.pop("composition_from_relaxed_struct")  # not needed anymore
+df_wbm.pop("composition_from_final_struct")  # not needed anymore
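 
+# minimal illustration of why reduction reconciles the one mismatch above:
+# Ag16 Bi16 O48 is 4x Ag4 Bi4 O12, so both reduce to AgBiO3 (demo cell using
+# pymatgen's Composition, not part of the processing pipeline)
+from pymatgen.core import Composition
+
+assert Composition("Ag4 Bi4 O12") != Composition("Ag16 Bi16 O48")
+assert (
+    Composition("Ag4 Bi4 O12").reduced_composition
+    == Composition("Ag16 Bi16 O48").reduced_composition
+)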
 
 
-# %%
-# randomly sample structures and ensure they match between CSE and relaxed structure
+# %% randomly sample structures and ensure they match between CSE and final structure
 n_samples = 1000
-for row in tqdm(df_all_steps.sample(n_samples).itertuples(), total=n_samples):
+for row in tqdm(df_wbm.sample(n_samples).itertuples(), total=n_samples):
     struct_final = Structure.from_dict(row.final_structure)
     struct_from_cse = Structure.from_dict(row.computed_structure_entry["structure"])
     assert struct_final.matches(struct_from_cse), f"structure mismatch for {row.Index=}"
@@ -255,22 +266,11 @@ def increment_wbm_material_id(wbm_id: str) -> str:
 
 
 # %%
-for mat_id, cse in df_all_steps.computed_structure_entry.items():
-    cse["entry_id"] = mat_id
-
-assert not any(
-    entry["entry_id"] is None for entry in df_all_steps.computed_structure_entry
-)
-
-df_all_steps["formula_from_cse"] = [
-    x.formula for x in df_all_steps.pop("composition_from_cse")
-]
-df_all_steps.drop(columns=["final_structure", "cse"]).to_json(
+df_wbm["formula_from_cse"] = [x.formula for x in df_wbm.pop("composition_from_cse")]
+df_wbm[["initial_structure", "computed_structure_entry", "formula_from_cse"]].to_json(
     f"{module_dir}/{today}-wbm-cses+init-structs.json.bz2"
 )
 
-# df_all_steps = pd.read_json(f"{module_dir}/2022-10-19-wbm-cses+init-structs.json.bz2")
-
 
 # %%
 col_map = {
@@ -301,80 +301,88 @@ def increment_wbm_material_id(wbm_id: str) -> str:
     df_summary_bz2.reset_index(drop=True).query(query_str),
 )
 
-# fix bad energy in which is 0 in df_summary, -63.68 in CSE
-df_summary.at["wbm-step-2-18689", "energy"] = df_all_steps.loc[
-    "wbm-step-2-18689"
-].computed_structure_entry["energy"]
-
 # make sure dropping materials with 0 volume removes exactly 6 materials, the same ones
-# listed in additional_ids above
-assert len(df_summary.query("volume > 0")) == len(df_all_steps)
+# listed in bad_struct_ids above
+assert len(df_summary.query("volume > 0")) == len(df_wbm)
 assert all(
     df_summary.reset_index().query("volume == 0").index.values - sum(step_lens[:2])
-    == additional_ids
+    == bad_struct_ids
 )
 df_summary = df_summary.query("volume > 0")
-df_summary.index = df_all_steps.index
+df_summary.index = df_summary.index.map(increment_wbm_material_id)
+assert sum(df_summary.index != df_wbm.index) == 0
+
+# fix bad energy which is 0 in df_summary but a more realistic -63.68 in CSE
+df_summary.at["wbm-step-2-18689", "uncorrected_energy"] = df_wbm.loc[
+    "wbm-step-2-18689"
+].computed_structure_entry["energy"]
 
 
 # %% scatter plot summary energies vs CSE energies
 df_summary["uncorrected_energy_from_cse"] = [
-    cse["energy"] for cse in tqdm(df_all_steps.computed_structure_entry)
+    cse["energy"] for cse in tqdm(df_wbm.computed_structure_entry)
 ]
 
 # check CSE and summary energies are consistent, only exceeding 0.1 eV difference twice
-e_diff_summary_vs_cse = (
+diff_e_summary_cse = (
     df_summary.uncorrected_energy - df_summary.uncorrected_energy_from_cse
 )
 assert (
-    e_diff_summary_vs_cse.max() < 0.15 and sum(e_diff_summary_vs_cse > 0.1) == 2
-), df_summary.query("energy - uncorrected_energy_from_cse > 0.1")
+    diff_e_summary_cse.max() < 0.15 and sum(diff_e_summary_cse > 0.1) == 2
+), df_summary.query("uncorrected_energy - uncorrected_energy_from_cse > 0.1")
 
 density_scatter(df_summary.uncorrected_energy, df_summary.uncorrected_energy_from_cse)
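 
+# print the two outliers explicitly (quick diagnostic sketch; uses only the two
+# energy columns compared above)
+print(
+    df_summary.query("uncorrected_energy - uncorrected_energy_from_cse > 0.1")[
+        ["uncorrected_energy", "uncorrected_energy_from_cse"]
+    ]
+)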
for cse in tqdm(df_all_steps.computed_structure_entry) + cse["energy"] for cse in tqdm(df_wbm.computed_structure_entry) ] # check CSE and summary energies are consistent, only exceeding 0.1 eV difference twice -e_diff_summary_vs_cse = ( +diff_e_cse_e_summary = ( df_summary.uncorrected_energy - df_summary.uncorrected_energy_from_cse ) assert ( - e_diff_summary_vs_cse.max() < 0.15 and sum(e_diff_summary_vs_cse > 0.1) == 2 + diff_e_cse_e_summary.max() < 0.15 and sum(diff_e_cse_e_summary > 0.1) == 2 ), df_summary.query("energy - uncorrected_energy_from_cse > 0.1") density_scatter(df_summary.uncorrected_energy, df_summary.uncorrected_energy_from_cse) # %% -df_all_steps["cse"] = [ - ComputedStructureEntry.from_dict(x) - for x in tqdm(df_all_steps.computed_structure_entry) -] - - # raw WBM ComputedStructureEntries have no energy corrections applied: -assert all(cse.uncorrected_energy == cse.energy for cse in df_all_steps.cse) +assert all(cse.uncorrected_energy == cse.energy for cse in df_wbm.cse) # summary and CSE n_sites match -assert all(df_summary.n_sites == len(cse.structure) for cse in df_all_steps.cse) +assert all(df_summary.n_sites == [len(cse.structure) for cse in df_wbm.cse]) -mp_compat = MP2020Compat() -compat_out = mp_compat.process_entries(df_all_steps.cse, clean=True, verbose=True) -assert len(compat_out) == len(df_all_steps) == len(df_summary) +mp_compat = MP2020Compat() if False else MPLegacyCompat() +compat_out = mp_compat.process_entries(df_wbm.cse, clean=True, verbose=True) -n_corrected = sum(cse.uncorrected_energy != cse.energy for cse in df_all_steps.cse) +mp_compat.process_entry(cse) +assert len(compat_out) == len(df_wbm) == len(df_summary) + +n_corrected = sum(cse.uncorrected_energy != cse.energy for cse in df_wbm.cse) if isinstance(mp_compat, MPLegacyCompat): assert n_corrected == 39595, f"{n_corrected=}" if isinstance(mp_compat, MP2020Compat): assert n_corrected == 100931, f"{n_corrected=}" corr_label = "mp2020" if isinstance(mp_compat, MP2020Compat) else "legacy" -df_summary[f"e_corrections_{corr_label}"] = [ - cse.energy - cse.uncorrected_energy for cse in df_all_steps.cse +df_summary[f"e_correction_{corr_label}"] = [ + cse.energy - cse.uncorrected_energy for cse in df_wbm.cse ] - -assert df_summary.e_corrections_mp2020.mean().round(4) == -0.9979 -assert df_summary.e_corrections_legacy.mean().round(4) == -0.0643 +assert df_summary.e_correction_mp2020.mean().round(4) == -0.9979 +assert df_summary.e_correction_legacy.mean().round(4) == -0.0643 assert (df_summary.filter(like="corrections").abs() > 1e-4).sum().to_dict() == { - "e_corrections_mp2020": 100931, - "e_corrections_legacy": 39595, + "e_correction_mp2020": 100931, + "e_correction_legacy": 39595, } +# mp_compat.process_entry(cse) for CSE with id wbm-step-1-24459 causes Jupyter kernel to +# crash reason unknown, still occurs even after updating deps like pymatgen, numpy, +# ipykernel, notebook and after re-downloading all data from scratch + +# 9%|▉ | 23601/257489 [00:02<00:20, 11661.38it/s] +# The Kernel crashed while executing code in the the current cell or a previous cell. +# Please review the code in the cell(s) to identify a possible cause of the failure. +# Click here for more info. View Jupyter log for further details. 
 
 
 # %%
-with gzip.open(
-    f"{ROOT}/mb_discovery/energy/2022-10-13-rhys/ppd-mp.pkl.gz", "rb"
-) as zip_file:
+with gzip.open(f"{module_dir}/2022-10-13-rhys/ppd-mp.pkl.gz", "rb") as zip_file:
     ppd_rhys: PatchedPhaseDiagram = pickle.load(zip_file)
 
 
@@ -383,10 +391,9 @@ def increment_wbm_material_id(wbm_id: str) -> str:
 
 
 # %%
-warnings.filterwarnings("ignore", category=UserWarning, module="pymatgen")
-
+# this loop needs the warnings filter above so pymatgen logs don't flood and
+# crash the Jupyter kernel
 # takes ~20 min at 200 it/s for 250k entries in WBM
-for entry in tqdm(df_all_steps.cse):
+for entry in tqdm(df_wbm.cse):
     assert entry.entry_id.startswith("wbm-step-")
     corr_label = "mp2020_" if isinstance(mp_compat, MP2020Compat) else "legacy_"
     # corr_label = "un"
@@ -401,40 +408,52 @@ def increment_wbm_material_id(wbm_id: str) -> str:
 
     df_summary.at[at_idx] = e_above_hull
 
+# %% compute formation energies
+# first make sure source and target dfs have matching indices
+assert sum(df_wbm.index != df_summary.index) == 0
+
+e_form_key = "e_form_per_atom_uncorrected_ppd_mp_rhys"
+for mat_id, cse in tqdm(df_wbm.cse.items(), total=len(df_wbm)):
+    assert mat_id == cse.entry_id, f"{mat_id=} {cse.entry_id=}"
+    assert mat_id in df_summary.index, f"{mat_id=} not in df_summary"
+    df_summary.at[cse.entry_id, e_form_key] = ppd_rhys.get_form_energy_per_atom(cse)
+
+assert len(df_summary) == sum(step_lens)
+
+df_summary["e_form_per_atom_legacy_corrected_ppd_mp_rhys"] = (
+    df_summary[e_form_key] + df_summary.e_correction_legacy
+)
+
 
 # %% calculate formation energies from CSEs wrt MP elemental reference energies
 df_summary["e_form_per_atom_uncorrected"] = [
     get_e_form_per_atom(dict(composition=row.formula, energy=row.uncorrected_energy))
     for row in tqdm(df_summary.itertuples(), total=len(df_summary))
 ]
 
-density_scatter(
-    df=df_summary,
-    x="e_form_per_atom_uncorrected",
-    y="e_form_per_atom_mp2020_corrected",
+
+# %% MP2020 corrections are much larger than legacy corrections
+ax = density_scatter(
+    df_summary.e_correction_legacy / df_summary.n_sites,
+    df_summary.e_correction_mp2020 / df_summary.n_sites,
+    xlabel="legacy corrections (eV / atom)",
+    ylabel="MP2020 corrections (eV / atom)",
 )
+ax.axis("equal")
+# ax.figure.savefig(f"{ROOT}/tmp/{today}-legacy-vs-mp2020-corrections.png")
 
 
 # %%
-df_hull = pd.read_csv(
-    f"{ROOT}/data/2022-06-11-from-rhys/wbm-e-above-mp-hull.csv"
-).set_index("material_id")
-
-
-df_hull[df_all_steps.filter(like="e_above_hull").columns] = df_all_steps.filter(
-    like="e_above_hull"
-)
-
-density_scatter(
-    df=df_hull.query("e_above_hull_legacy_corrected < 3"),
-    x="e_above_hull_mp",
-    y="e_above_hull_legacy_corrected",
-    xlabel=r"E$_\mathrm{above hull}$ from Rhys (legacy corrected)",
-    ylabel=r"E$_\mathrm{above hull}$ from legacy corrected",
-)
-
-df_hull.query("abs(e_above_hull_mp2020_corrected - e_above_hull_mp) > 0.1")
-
-
-# %%
 df_summary.round(6).to_csv(f"{module_dir}/{today}-wbm-summary.csv")
 
+df_summary = pd.read_csv(f"{module_dir}/2022-10-19-wbm-summary.csv").set_index(
+    "material_id"
+)
+
+
+# %% read WBM dataset from disk
+df_wbm = pd.read_json(f"{module_dir}/2022-10-19-wbm-cses+init-structs.json.bz2")
+
+df_wbm["cse"] = [
+    ComputedStructureEntry.from_dict(x) for x in tqdm(df_wbm.computed_structure_entry)
+]
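 
+# usage sketch: with the patched phase diagram loaded above, the formation energy
+# of any single WBM entry is one call (wbm-step-1-1 is the first ID asserted
+# earlier; the printed value is not checked here)
+print(ppd_rhys.get_form_energy_per_atom(df_wbm.cse["wbm-step-1-1"]))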
diff --git a/tests/test_init.py b/tests/test_init.py
index c8a3c3cc..2ab136bb 100644
--- a/tests/test_init.py
+++ b/tests/test_init.py
@@ -1,19 +1,15 @@
 import os
 from typing import Any
 
-from mb_discovery import as_dict_handler, chunks
+from mb_discovery import PKG_DIR, ROOT, as_dict_handler, chunks
 
 
 def test_has_root_pkg_dir() -> None:
-
-    from mb_discovery import PKG_DIR, ROOT
-
     assert os.path.isdir(ROOT)
     assert os.path.isdir(PKG_DIR)
 
 
 def test_chunks() -> None:
-
     assert list(chunks([], 1)) == []
     assert list(chunks([1], 1)) == [[1]]
     assert list(chunks([1, 2], 1)) == [[1], [2]]
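
--
For reference, a minimal chunks() implementation consistent with the assertions
in test_chunks above (illustrative sketch only; the actual mb_discovery helper
may differ):

    from collections.abc import Iterator, Sequence

    def chunks(seq: Sequence, n: int) -> Iterator[list]:
        """Yield successive n-sized slices of seq; the last chunk may be shorter."""
        for start in range(0, len(seq), n):
            yield list(seq[start : start + n])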