From 20080b5d9621f05ad558e24b92d923b994bdf3e1 Mon Sep 17 00:00:00 2001
From: Janosh Riebesell
Date: Mon, 24 Oct 2022 16:44:41 -0700
Subject: [PATCH] fix WBM index mismatch between CSEs and summary

---
 data/wbm/fetch_process_wbm_dataset.py | 263 ++++++++++++++------------
 tests/test_init.py                    |   6 +-
 2 files changed, 142 insertions(+), 127 deletions(-)

diff --git a/data/wbm/fetch_process_wbm_dataset.py b/data/wbm/fetch_process_wbm_dataset.py
index 08319899..8d6232f5 100644
--- a/data/wbm/fetch_process_wbm_dataset.py
+++ b/data/wbm/fetch_process_wbm_dataset.py
@@ -40,6 +40,7 @@
 module_dir = os.path.dirname(__file__)
 today = f"{datetime.now():%Y-%m-%d}"
 
+warnings.filterwarnings("ignore", category=UserWarning, module="pymatgen")
 
 
 # %% links to google drive files received via email from 1st author Hai-Chen Wang
@@ -58,7 +59,7 @@
     file_path = f"{module_dir}/raw/wbm-structures-step-{step}.json.bz2"
 
     if os.path.exists(file_path):
-        print(f"{step=}: file exists, skipping {file_path=}")
+        print(f"{file_path} already exists, skipping")
         continue
     print(f"{step=}")
@@ -66,16 +67,21 @@
 
 
 # %%
-summary_id_file = "1639IFUG7poaDE2uB6aISUOi65ooBwCIg"
 summary_path = f"{module_dir}/raw/wbm-summary.txt"
-gdown.download(f"https://drive.google.com/u/0/uc?id={summary_id_file}", summary_path)
+
+if not os.path.exists(summary_path):
+    summary_id_file = "1639IFUG7poaDE2uB6aISUOi65ooBwCIg"
+    summary_url = f"https://drive.google.com/u/0/uc?id={summary_id_file}"
+    gdown.download(summary_url, summary_path)
 
 
 # %%
 json_paths = sorted(glob(f"{module_dir}/raw/wbm-structures-step-*.json.bz2"))
 step_lens = (61848, 52800, 79205, 40328, 23308)
 # step 3 has 79,211 structures but only 79,205 ComputedStructureEntries
-# the 6 extra structures have missing energy, volume, etc. in the summary file
+# i.e. 6 extra structures with missing energy, volume, etc. in the summary file
+bad_struct_ids = (70802, 70803, 70825, 70826, 70828, 70829)
+
 assert len(json_paths) == len(step_lens), "Mismatch in WBM steps and JSON files"
 
 wbm_struct_json_checksums = (
@@ -105,24 +111,38 @@
     assert checksum == wbm_struct_json_checksums[step - 1], "bad JSON file checksum"
 
     if step == 3:
-        # step 3 has 6 extra entries, see comment above
-        additional_ids = (70802, 70803, 70825, 70826, 70828, 70829)
-        df = df.drop(index=[f"step_3_{id}" for id in additional_ids])
+        df = df.drop(index=[f"step_3_{id}" for id in bad_struct_ids])
+        # re-index after dropping bad structures to get same indices as summary file
+        # where IDs are consecutive, i.e. step_3_70801 is followed by step_3_70802,
+        # not step_3_70804, etc.
+        df.index = [f"step_3_{idx + 1}" for idx in range(len(df))]
 
     step_len = step_lens[step - 1]
     assert len(df) == step_len, f"bad len for {step=}: {len(df)} != {step_len}"
     dfs_wbm_structs[step] = df
 
+# NOTE step 5 is missing 2 initial structures
+assert dict(dfs_wbm_structs[5].isna().sum()) == {"opt": 0, "org": 2}
+assert list(dfs_wbm_structs[5].query("org.isna()").index) == [
+    "step_5_23165",
+    "step_5_23293",
+]
+
 
 # %%
-df_all_steps = pd.concat(dfs_wbm_structs.values())
+df_wbm = pd.concat(dfs_wbm_structs.values())
 
-assert len(df_all_steps) == sum(step_lens)
+assert len(df_wbm) == sum(step_lens)
 
 
 def increment_wbm_material_id(wbm_id: str) -> str:
     """Maps step_1_0, step_1_1, ... onto wbm-step-1-1, wbm-step-1-2, ..."""
-    prefix, step_num, material_num = wbm_id.split("_")
+    try:
+        prefix, step_num, material_num = wbm_id.split("_")
+    except ValueError:
+        print(f"bad {wbm_id=}")
+        return wbm_id
 
     assert prefix == "step"
     assert step_num.isdigit() and material_num.isdigit()
@@ -130,18 +150,14 @@ def increment_wbm_material_id(wbm_id: str) -> str:
     return f"wbm-step-{step_num}-{int(material_num) + 1}"
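 
+# illustrative spot-check of the mapping (a minimal sanity cell; the two example
+# IDs are assumed from the docstring plus the first/last index asserts below)
+assert increment_wbm_material_id("step_1_0") == "wbm-step-1-1"
+assert increment_wbm_material_id("step_5_23307") == "wbm-step-5-23308"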
 
 
-df_all_steps.index = df_all_steps.index.map(increment_wbm_material_id)
-df_all_steps.index.name = "material_id"
-assert df_all_steps.index[0] == "wbm-step-1-1"
-assert df_all_steps.index[-1] == "wbm-step-5-23308"
+df_wbm.index = df_wbm.index.map(increment_wbm_material_id)
+df_wbm.index.name = "material_id"
+assert df_wbm.index[0] == "wbm-step-1-1"
+assert df_wbm.index[-1] == "wbm-step-5-23308"
 
-df_all_steps["initial_structure"] = df_all_steps.pop("org")
-df_all_steps["final_structure"] = df_all_steps.pop("opt")
-assert df_all_steps.columns == ["initial_structure", "final_structure"]
-
-# df_all_steps = pd.read_json(
-#     f"{module_dir}/2022-10-18-wbm-init+final-structures.json.bz2"
-# )
+df_wbm["initial_structure"] = df_wbm.pop("org")
+df_wbm["final_structure"] = df_wbm.pop("opt")
+assert list(df_wbm.columns) == ["initial_structure", "final_structure"]
 
 
 # %% download WBM ComputedStructureEntries from
@@ -156,6 +172,7 @@ def increment_wbm_material_id(wbm_id: str) -> str:
 ):
     file_path = f"{module_dir}/raw/wbm-cse-{filename.lower().replace('_', '-')}"
     if os.path.exists(file_path):
+        print(f"{file_path} already exists, skipping")
         continue
 
     urllib.request.urlretrieve(f"{mat_cloud_url}&{filename=}", file_path)
@@ -177,8 +194,10 @@ def increment_wbm_material_id(wbm_id: str) -> str:
 """
 
 dfs_wbm_cses = {}
-for json_path in tqdm(cse_step_paths):
+for json_path in cse_step_paths:
     step = int(json_path.split(".json.bz2")[0][-1])
+    print(f"{step=}")
+    assert step in range(1, 6)
 
     if step in dfs_wbm_cses:
         print(f"{json_path=} already loaded.")
@@ -192,57 +211,49 @@ def increment_wbm_material_id(wbm_id: str) -> str:
 
 
 # %%
-df_all_steps["computed_structure_entry"] = pd.concat(dfs_wbm_cses.values()).to_numpy()
+df_wbm["computed_structure_entry"] = pd.concat(dfs_wbm_cses.values()).to_numpy()
 
-for cse in tqdm(df_all_steps.computed_structure_entry):
+for mat_id, cse in df_wbm.computed_structure_entry.items():
     # needed to ensure MaterialsProjectCompatibility can process the entries
     cse["parameters"]["run_type"] = (
         "GGA+U" if cse["parameters"]["is_hubbard"] else "GGA"
     )
-
+    cse["entry_id"] = mat_id
+    assert cse["entry_id"].startswith("wbm-step-")
 
 assert pd.Series(
-    cse["parameters"]["run_type"] for cse in tqdm(df_all_steps.computed_structure_entry)
+    cse["parameters"]["run_type"] for cse in tqdm(df_wbm.computed_structure_entry)
 ).value_counts().to_dict() == {"GGA": 248481, "GGA+U": 9008}
 
 
-# %%
-# get composition from CSEs
-
-df_all_steps["composition_from_cse"] = [
+# %% get composition from CSEs
+df_wbm["composition_from_cse"] = [
     ComputedStructureEntry.from_dict(cse).composition
-    for cse in tqdm(df_all_steps.computed_structure_entry)
+    for cse in tqdm(df_wbm.computed_structure_entry)
 ]
 
-df_all_steps["composition_from_relaxed_struct"] = [
-    Structure.from_dict(struct).composition
-    for struct in tqdm(df_all_steps.final_structure)
+df_wbm["composition_from_final_struct"] = [
+    Structure.from_dict(struct).composition for struct in tqdm(df_wbm.final_structure)
 ]
 
-# all but 1 composition matches between CSE and relaxed structure
-# mismatching ID: wbm-step-1-37977 which becomes equal on reduction:
-# CSE Comp: Ag4 Bi4 O12
-# relaxed Comp: Ag16 Bi16 O48
-
-
-df_unmatched = df_all_steps.query(
-    "composition_from_cse != composition_from_relaxed_struct"
-)
-assert len(df_unmatched) == 1
-assert df_unmatched.index[0] == "wbm-step-1-37977"
+# all but 1 composition matches between CSE and final structure; the only
+# mismatching ID is wbm-step-1-37977, whose compositions become equal on reduction:
+# CSE Comp: Ag4 Bi4 O12
+# final structure Comp: Ag16 Bi16 O48
+df_mismatch = df_wbm.query("composition_from_cse != composition_from_final_struct")
+assert len(df_mismatch) == 1
+assert df_mismatch.index[0] == "wbm-step-1-37977"
 assert (
-    df_unmatched.iloc[0].composition_from_cse.reduced_composition
-    == df_unmatched.iloc[0].composition_from_relaxed_struct.reduced_composition
+    df_mismatch.iloc[0].composition_from_cse.reduced_composition
+    == df_mismatch.iloc[0].composition_from_final_struct.reduced_composition
 )
 
-df_all_steps.pop("composition_from_relaxed_struct")  # not needed anymore
+df_wbm.pop("composition_from_final_struct")  # not needed anymore
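 
+# minimal illustration of why reduction reconciles the one mismatch above:
+# Ag16 Bi16 O48 is 4x Ag4 Bi4 O12, so both reduce to AgBiO3 (demo cell using
+# pymatgen's Composition, not part of the processing pipeline)
+from pymatgen.core import Composition
+
+assert Composition("Ag4 Bi4 O12") != Composition("Ag16 Bi16 O48")
+assert (
+    Composition("Ag4 Bi4 O12").reduced_composition
+    == Composition("Ag16 Bi16 O48").reduced_composition
+)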
 
 
-# %%
-# randomly sample structures and ensure they match between CSE and relaxed structure
+# %% randomly sample structures and ensure they match between CSE and final structure
 n_samples = 1000
-for row in tqdm(df_all_steps.sample(n_samples).itertuples(), total=n_samples):
+for row in tqdm(df_wbm.sample(n_samples).itertuples(), total=n_samples):
     struct_final = Structure.from_dict(row.final_structure)
     struct_from_cse = Structure.from_dict(row.computed_structure_entry["structure"])
     assert struct_final.matches(struct_from_cse), f"structure mismatch for {row.Index=}"
@@ -255,22 +266,11 @@ def increment_wbm_material_id(wbm_id: str) -> str:
 
 
 # %%
-for mat_id, cse in df_all_steps.computed_structure_entry.items():
-    cse["entry_id"] = mat_id
-
-assert not any(
-    entry["entry_id"] is None for entry in df_all_steps.computed_structure_entry
-)
-
-df_all_steps["formula_from_cse"] = [
-    x.formula for x in df_all_steps.pop("composition_from_cse")
-]
-df_all_steps.drop(columns=["final_structure", "cse"]).to_json(
+df_wbm["formula_from_cse"] = [x.formula for x in df_wbm.pop("composition_from_cse")]
+df_wbm[["initial_structure", "computed_structure_entry", "formula_from_cse"]].to_json(
     f"{module_dir}/{today}-wbm-cses+init-structs.json.bz2"
 )
 
-# df_all_steps = pd.read_json(f"{module_dir}/2022-10-19-wbm-cses+init-structs.json.bz2")
-
 
 # %%
 col_map = {
@@ -301,80 +301,88 @@ def increment_wbm_material_id(wbm_id: str) -> str:
     df_summary_bz2.reset_index(drop=True).query(query_str),
 )
 
-# fix bad energy in which is 0 in df_summary, -63.68 in CSE
-df_summary.at["wbm-step-2-18689", "energy"] = df_all_steps.loc[
-    "wbm-step-2-18689"
-].computed_structure_entry["energy"]
-
 # make sure dropping materials with 0 volume removes exactly 6 materials, the same ones
-# listed in additional_ids above
-assert len(df_summary.query("volume > 0")) == len(df_all_steps)
+# listed in bad_struct_ids above
+assert len(df_summary.query("volume > 0")) == len(df_wbm)
 assert all(
     df_summary.reset_index().query("volume == 0").index.values - sum(step_lens[:2])
-    == additional_ids
+    == bad_struct_ids
 )
 df_summary = df_summary.query("volume > 0")
-df_summary.index = df_all_steps.index
+df_summary.index = df_summary.index.map(increment_wbm_material_id)
+assert sum(df_summary.index != df_wbm.index) == 0
+
+# fix bad energy which is 0 in df_summary but a more realistic -63.68 in CSE
+df_summary.at["wbm-step-2-18689", "uncorrected_energy"] = df_wbm.loc[
+    "wbm-step-2-18689"
+].computed_structure_entry["energy"]
 
 
 # %% scatter plot summary energies vs CSE energies
 df_summary["uncorrected_energy_from_cse"] = [
-    cse["energy"] for cse in tqdm(df_all_steps.computed_structure_entry)
+    cse["energy"] for cse in tqdm(df_wbm.computed_structure_entry)
 ]
 
 # check CSE and summary energies are consistent, only exceeding 0.1 eV difference twice
-e_diff_summary_vs_cse = (
+diff_e_summary_cse = (
     df_summary.uncorrected_energy - df_summary.uncorrected_energy_from_cse
 )
 assert (
-    e_diff_summary_vs_cse.max() < 0.15 and sum(e_diff_summary_vs_cse > 0.1) == 2
-), df_summary.query("energy - uncorrected_energy_from_cse > 0.1")
+    diff_e_summary_cse.max() < 0.15 and sum(diff_e_summary_cse > 0.1) == 2
+), df_summary.query("uncorrected_energy - uncorrected_energy_from_cse > 0.1")
 
 density_scatter(df_summary.uncorrected_energy, df_summary.uncorrected_energy_from_cse)
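 
+# print the two outliers explicitly (quick diagnostic sketch; uses only the two
+# energy columns compared above)
+print(
+    df_summary.query("uncorrected_energy - uncorrected_energy_from_cse > 0.1")[
+        ["uncorrected_energy", "uncorrected_energy_from_cse"]
+    ]
+)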
for cse in tqdm(df_all_steps.computed_structure_entry) + cse["energy"] for cse in tqdm(df_wbm.computed_structure_entry) ] # check CSE and summary energies are consistent, only exceeding 0.1 eV difference twice -e_diff_summary_vs_cse = ( +diff_e_cse_e_summary = ( df_summary.uncorrected_energy - df_summary.uncorrected_energy_from_cse ) assert ( - e_diff_summary_vs_cse.max() < 0.15 and sum(e_diff_summary_vs_cse > 0.1) == 2 + diff_e_cse_e_summary.max() < 0.15 and sum(diff_e_cse_e_summary > 0.1) == 2 ), df_summary.query("energy - uncorrected_energy_from_cse > 0.1") density_scatter(df_summary.uncorrected_energy, df_summary.uncorrected_energy_from_cse) # %% -df_all_steps["cse"] = [ - ComputedStructureEntry.from_dict(x) - for x in tqdm(df_all_steps.computed_structure_entry) -] - - # raw WBM ComputedStructureEntries have no energy corrections applied: -assert all(cse.uncorrected_energy == cse.energy for cse in df_all_steps.cse) +assert all(cse.uncorrected_energy == cse.energy for cse in df_wbm.cse) # summary and CSE n_sites match -assert all(df_summary.n_sites == len(cse.structure) for cse in df_all_steps.cse) +assert all(df_summary.n_sites == [len(cse.structure) for cse in df_wbm.cse]) -mp_compat = MP2020Compat() -compat_out = mp_compat.process_entries(df_all_steps.cse, clean=True, verbose=True) -assert len(compat_out) == len(df_all_steps) == len(df_summary) +mp_compat = MP2020Compat() if False else MPLegacyCompat() +compat_out = mp_compat.process_entries(df_wbm.cse, clean=True, verbose=True) -n_corrected = sum(cse.uncorrected_energy != cse.energy for cse in df_all_steps.cse) +mp_compat.process_entry(cse) +assert len(compat_out) == len(df_wbm) == len(df_summary) + +n_corrected = sum(cse.uncorrected_energy != cse.energy for cse in df_wbm.cse) if isinstance(mp_compat, MPLegacyCompat): assert n_corrected == 39595, f"{n_corrected=}" if isinstance(mp_compat, MP2020Compat): assert n_corrected == 100931, f"{n_corrected=}" corr_label = "mp2020" if isinstance(mp_compat, MP2020Compat) else "legacy" -df_summary[f"e_corrections_{corr_label}"] = [ - cse.energy - cse.uncorrected_energy for cse in df_all_steps.cse +df_summary[f"e_correction_{corr_label}"] = [ + cse.energy - cse.uncorrected_energy for cse in df_wbm.cse ] - -assert df_summary.e_corrections_mp2020.mean().round(4) == -0.9979 -assert df_summary.e_corrections_legacy.mean().round(4) == -0.0643 +assert df_summary.e_correction_mp2020.mean().round(4) == -0.9979 +assert df_summary.e_correction_legacy.mean().round(4) == -0.0643 assert (df_summary.filter(like="corrections").abs() > 1e-4).sum().to_dict() == { - "e_corrections_mp2020": 100931, - "e_corrections_legacy": 39595, + "e_correction_mp2020": 100931, + "e_correction_legacy": 39595, } +# mp_compat.process_entry(cse) for CSE with id wbm-step-1-24459 causes Jupyter kernel to +# crash reason unknown, still occurs even after updating deps like pymatgen, numpy, +# ipykernel, notebook and after re-downloading all data from scratch + +# 9%|▉ | 23601/257489 [00:02<00:20, 11661.38it/s] +# The Kernel crashed while executing code in the the current cell or a previous cell. +# Please review the code in the cell(s) to identify a possible cause of the failure. +# Click here for more info. View Jupyter log for further details. 
 
 
 # %%
-with gzip.open(
-    f"{ROOT}/mb_discovery/energy/2022-10-13-rhys/ppd-mp.pkl.gz", "rb"
-) as zip_file:
+with gzip.open(f"{module_dir}/2022-10-13-rhys/ppd-mp.pkl.gz", "rb") as zip_file:
     ppd_rhys: PatchedPhaseDiagram = pickle.load(zip_file)
 
 
@@ -383,10 +391,9 @@ def increment_wbm_material_id(wbm_id: str) -> str:
 
 
 # %%
-warnings.filterwarnings("ignore", category=UserWarning, module="pymatgen")
-
+# this loop needs the warnings filter above so pymatgen logs don't flood and
+# crash the Jupyter kernel
 # takes ~20 min at 200 it/s for 250k entries in WBM
-for entry in tqdm(df_all_steps.cse):
+for entry in tqdm(df_wbm.cse):
     assert entry.entry_id.startswith("wbm-step-")
     corr_label = "mp2020_" if isinstance(mp_compat, MP2020Compat) else "legacy_"
     # corr_label = "un"
@@ -401,40 +408,52 @@ def increment_wbm_material_id(wbm_id: str) -> str:
 
     df_summary.at[at_idx] = e_above_hull
 
+# %% compute formation energies
+# first make sure source and target dfs have matching indices
+assert sum(df_wbm.index != df_summary.index) == 0
+
+e_form_key = "e_form_per_atom_uncorrected_ppd_mp_rhys"
+for mat_id, cse in tqdm(df_wbm.cse.items(), total=len(df_wbm)):
+    assert mat_id == cse.entry_id, f"{mat_id=} {cse.entry_id=}"
+    assert mat_id in df_summary.index, f"{mat_id=} not in df_summary"
+    df_summary.at[cse.entry_id, e_form_key] = ppd_rhys.get_form_energy_per_atom(cse)
+
+assert len(df_summary) == sum(step_lens)
+
+df_summary["e_form_per_atom_legacy_corrected_ppd_mp_rhys"] = (
+    df_summary[e_form_key] + df_summary.e_correction_legacy
+)
+
 
 # %% calculate formation energies from CSEs wrt MP elemental reference energies
 df_summary["e_form_per_atom_uncorrected"] = [
     get_e_form_per_atom(dict(composition=row.formula, energy=row.uncorrected_energy))
     for row in tqdm(df_summary.itertuples(), total=len(df_summary))
 ]
 
-density_scatter(
-    df=df_summary,
-    x="e_form_per_atom_uncorrected",
-    y="e_form_per_atom_mp2020_corrected",
+
+# %% MP2020 corrections are much larger than legacy corrections
+ax = density_scatter(
+    df_summary.e_correction_legacy / df_summary.n_sites,
+    df_summary.e_correction_mp2020 / df_summary.n_sites,
+    xlabel="legacy corrections (eV / atom)",
+    ylabel="MP2020 corrections (eV / atom)",
 )
+ax.axis("equal")
+# ax.figure.savefig(f"{ROOT}/tmp/{today}-legacy-vs-mp2020-corrections.png")
 
 
 # %%
-df_hull = pd.read_csv(
-    f"{ROOT}/data/2022-06-11-from-rhys/wbm-e-above-mp-hull.csv"
-).set_index("material_id")
-
-
-df_hull[df_all_steps.filter(like="e_above_hull").columns] = df_all_steps.filter(
-    like="e_above_hull"
-)
-
-density_scatter(
-    df=df_hull.query("e_above_hull_legacy_corrected < 3"),
-    x="e_above_hull_mp",
-    y="e_above_hull_legacy_corrected",
-    xlabel=r"E$_\mathrm{above hull}$ from Rhys (legacy corrected)",
-    ylabel=r"E$_\mathrm{above hull}$ from legacy corrected",
-)
-
-df_hull.query("abs(e_above_hull_mp2020_corrected - e_above_hull_mp) > 0.1")
-
-
-# %%
 df_summary.round(6).to_csv(f"{module_dir}/{today}-wbm-summary.csv")
 
+df_summary = pd.read_csv(f"{module_dir}/2022-10-19-wbm-summary.csv").set_index(
+    "material_id"
+)
+
+
+# %% read WBM dataset from disk
+df_wbm = pd.read_json(f"{module_dir}/2022-10-19-wbm-cses+init-structs.json.bz2")
+
+df_wbm["cse"] = [
+    ComputedStructureEntry.from_dict(x) for x in tqdm(df_wbm.computed_structure_entry)
+]
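 
+# usage sketch: with the patched phase diagram loaded above, the formation energy
+# of any single WBM entry is one call (wbm-step-1-1 is the first ID asserted
+# earlier; the printed value is not checked here)
+print(ppd_rhys.get_form_energy_per_atom(df_wbm.cse["wbm-step-1-1"]))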
diff --git a/tests/test_init.py b/tests/test_init.py
index c8a3c3cc..2ab136bb 100644
--- a/tests/test_init.py
+++ b/tests/test_init.py
@@ -1,19 +1,15 @@
 import os
 from typing import Any
 
-from mb_discovery import as_dict_handler, chunks
+from mb_discovery import PKG_DIR, ROOT, as_dict_handler, chunks
 
 
 def test_has_root_pkg_dir() -> None:
-
-    from mb_discovery import PKG_DIR, ROOT
-
     assert os.path.isdir(ROOT)
     assert os.path.isdir(PKG_DIR)
 
 
 def test_chunks() -> None:
-
     assert list(chunks([], 1)) == []
     assert list(chunks([1], 1)) == [[1]]
     assert list(chunks([1, 2], 1)) == [[1], [2]]
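
--
For reference, a minimal chunks() implementation consistent with the assertions
in test_chunks above (illustrative sketch only; the actual mb_discovery helper
may differ):

    from collections.abc import Iterator, Sequence

    def chunks(seq: Sequence, n: int) -> Iterator[list]:
        """Yield successive n-sized slices of seq; the last chunk may be shorter."""
        for start in range(0, len(seq), n):
            yield list(seq[start : start + n])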