Skip to content

Commit

Permalink
add models/chgnet/{test_chgnet,join_chgnet_results}.py
Browse files Browse the repository at this point in the history
convert remaining hard-coded data filepaths to use DATA_FILES

wrap ML relaxations in try/except to skip failing structures
not crashing jobs reduces missing preds rate
  • Loading branch information
janosh committed Jun 20, 2023
1 parent 59010fa commit da39074
Show file tree
Hide file tree
Showing 12 changed files with 405 additions and 88 deletions.
6 changes: 3 additions & 3 deletions models/bowsr/join_bowsr_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
import pymatviz
from tqdm import tqdm

from matbench_discovery import ROOT, today
from matbench_discovery import today
from matbench_discovery.data import DATA_FILES

__author__ = "Janosh Riebesell"
__date__ = "2022-09-22"
Expand Down Expand Up @@ -40,8 +41,7 @@


# %% compare against WBM formation energy targets to make sure we got sensible results
data_path = f"{ROOT}/data/wbm/2022-10-19-wbm-summary.csv"
df_wbm = pd.read_csv(data_path).set_index("material_id")
df_wbm = pd.read_csv(DATA_FILES.wbm_summary).set_index("material_id")


print(
Expand Down
40 changes: 19 additions & 21 deletions models/bowsr/test_bowsr.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,12 @@
import wandb
from maml.apps.bowsr.model.megnet import MEGNet
from maml.apps.bowsr.optimizer import BayesianOptimizer
from pymatgen.core import Structure
from pymatgen.entries.computed_entries import ComputedStructureEntry
from tqdm import tqdm

from matbench_discovery import DEBUG, ROOT, timestamp, today
from matbench_discovery.data import as_dict_handler
from matbench_discovery import DEBUG, timestamp, today
from matbench_discovery.data import DATA_FILES, as_dict_handler
from matbench_discovery.slurm import slurm_submit

__author__ = "Janosh Riebesell"
Expand All @@ -37,7 +39,7 @@
job_name = f"bowsr-{energy_model}-wbm-{task_type}{'-debug' if DEBUG else ''}"
out_dir = os.environ.get("SBATCH_OUTPUT", f"{module_dir}/{today}-{job_name}")

data_path = f"{ROOT}/data/wbm/2022-10-19-wbm-init-structs.json.bz2"
data_path = DATA_FILES.wbm_initial_structures

slurm_vars = slurm_submit(
job_name=job_name,
Expand Down Expand Up @@ -106,40 +108,36 @@
relax_results: dict[str, dict[str, Any]] = {}

if task_type == "IS2RE":
from pymatgen.core import Structure

structures = df_this_job.initial_structure.map(Structure.from_dict)
structures = df_this_job.initial_structure.map(Structure.from_dict).to_dict()
elif task_type == "RS2RE":
from pymatgen.entries.computed_entries import ComputedStructureEntry

structures = df_this_job.cse.map(
lambda x: ComputedStructureEntry.from_dict(x).structure
)
).to_dict()
else:
raise ValueError(f"Unknown {task_type = }")


for material_id, structure in tqdm(
structures.items(), desc="Main loop", total=len(structures), disable=None
):
for material_id in tqdm(structures, desc="Main loop", disable=None):
structure = structures[material_id]
if material_id in relax_results:
continue
try:
optimizer = BayesianOptimizer(
model=model, structure=structure, **bayes_optim_kwargs
)
optimizer.set_bounds()
# reason for devnull: https://github.com/materialsvirtuallab/maml/issues/469
# reason for /dev/null: https://github.com/materialsvirtuallab/maml/issues/469
with open(os.devnull, "w") as devnull, contextlib.redirect_stdout(devnull):
optimizer.optimize(**optimize_kwargs)

structure_bowsr, energy_bowsr = optimizer.get_optimized_structure_and_energy()
try:
struct_bowsr, energy_bowsr = optimizer.get_optimized_structure_and_energy()
except Exception as error:
print(f"Failed to relax {material_id}: {error}")

results = {
f"e_form_per_atom_bowsr_{energy_model}": model.predict_energy(
structure_bowsr
),
"structure_bowsr": structure_bowsr,
f"e_form_per_atom_bowsr_{energy_model}": model.predict_energy(struct_bowsr),
"structure_bowsr": struct_bowsr,
f"energy_bowsr_{energy_model}": energy_bowsr,
}

Expand All @@ -149,9 +147,9 @@


# %%
df_output = pd.DataFrame(relax_results).T
df_output.index.name = "material_id"
df_out = pd.DataFrame(relax_results).T
df_out.index.name = "material_id"

df_output.reset_index().to_json(out_path, default_handler=as_dict_handler)
df_out.reset_index().to_json(out_path, default_handler=as_dict_handler)

wandb.log_artifact(out_path, type=job_name)
8 changes: 4 additions & 4 deletions models/cgcnn/test_cgcnn.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
from torch.utils.data import DataLoader
from tqdm import tqdm

from matbench_discovery import CHECKPOINT_DIR, DEBUG, ROOT, WANDB_PATH, today
from matbench_discovery.data import df_wbm
from matbench_discovery import CHECKPOINT_DIR, DEBUG, WANDB_PATH, today
from matbench_discovery.data import DATA_FILES, df_wbm
from matbench_discovery.plots import wandb_scatter
from matbench_discovery.slurm import slurm_submit

Expand Down Expand Up @@ -46,15 +46,15 @@

# %%
if task_type == "IS2RE":
data_path = f"{ROOT}/data/wbm/2022-10-19-wbm-init-structs.json.bz2"
data_path = DATA_FILES.wbm_initial_structures
# or for debug
# data_path = f"{ROOT}/data/wbm/2022-10-19-wbm-init-structs.json-1k-samples.bz2"
# created with:
# df = df.sample(1000)
# df.reset_index().to_json(data_path.replace(".json", "-1k-samples.json"))
input_col = "initial_structure"
elif task_type == "RS2RE":
data_path = f"{ROOT}/data/wbm/2022-10-19-wbm-computed-structure-entries.json.bz2"
data_path = DATA_FILES.wbm_computed_structure_entries
input_col = "relaxed_structure"
else:
raise ValueError(f"Unexpected {task_type=}")
Expand Down
144 changes: 144 additions & 0 deletions models/chgnet/join_chgnet_results.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
"""Concatenate chgnet results from multiple data files generated by slurm job array
into single file.
"""


# %%
from __future__ import annotations

import os
import warnings
from glob import glob

import pandas as pd
from megnet.utils.models import load_model
from pymatgen.core import Structure
from pymatgen.entries.compatibility import MaterialsProject2020Compatibility
from pymatgen.entries.computed_entries import ComputedStructureEntry
from pymatviz import density_scatter
from tqdm import tqdm

from matbench_discovery import today
from matbench_discovery.data import DATA_FILES, as_dict_handler
from matbench_discovery.energy import get_e_form_per_atom
from matbench_discovery.preds import df_wbm as df_summary
from matbench_discovery.preds import e_form_col

__author__ = "Janosh Riebesell"
__date__ = "2023-03-01"

# ignore UserWarnings raised from pymatgen modules
warnings.filterwarnings(action="ignore", category=UserWarning, module="pymatgen")


# %% locate the result files to join: one directory per job matching the date,
# model and task type, each containing gzipped JSON files
module_dir = os.path.dirname(__file__)
task_type = "IS2RE"
date = "2023-03-02"
glob_pattern = f"{date}-chgnet-wbm-{task_type}*/*.json.gz"
file_paths = sorted(glob(f"{module_dir}/{glob_pattern}"))
print(f"Found {len(file_paths):,} files for {glob_pattern = }")
if not file_paths:
    # fail here with the offending pattern instead of letting pd.concat raise an
    # opaque "No objects to concatenate" further down
    raise FileNotFoundError(f"No files in {module_dir} match {glob_pattern = }")

# maps file path -> dataframe loaded from it; filled by the loading loop, which
# skips paths already present so it can be re-run without re-reading files
dfs: dict[str, pd.DataFrame] = {}


# %% read each result file once; paths already in dfs are skipped on re-runs
for json_path in tqdm(file_paths):
    if json_path not in dfs:
        df_job = pd.read_json(json_path).set_index("material_id")
        # drop trajectory to save memory
        dfs[json_path] = df_job.drop(columns="chgnet_trajectory")


# %% stack the per-file frames into a single frame, rounded to 4 decimals
stacked = pd.concat(dfs.values())
df_chgnet = stacked.round(4)


# %% load the WBM computed structure entries, indexed by material ID
df_wbm = pd.read_json(DATA_FILES.wbm_computed_structure_entries)
df_wbm = df_wbm.set_index("material_id")

# deserialize each dict in the computed_structure_entry column into a
# ComputedStructureEntry object stored under "cse"
cse_dicts = tqdm(df_wbm.computed_structure_entry)
df_wbm["cse"] = list(map(ComputedStructureEntry.from_dict, cse_dicts))


# %% transfer chgnet energies and relaxed structures into the WBM CSEs
cse: ComputedStructureEntry
rows = df_chgnet.itertuples()
# positional unpacking: itertuples yields the index (material ID) first, followed by
# the columns in order, so this relies on the structure dict and the energy being
# the first two columns of df_chgnet
for mat_id, struct_dict, chgnet_energy, *_ in tqdm(rows, total=len(df_chgnet)):
    relaxed_struct = Structure.from_dict(struct_dict)
    # the entry is modified in place, so df_wbm's "cse" column sees the change too
    cse = df_wbm.loc[mat_id, "cse"]
    cse._structure = relaxed_struct
    cse._energy = chgnet_energy  # cse._energy is the uncorrected energy
    df_chgnet.loc[mat_id, "cse"] = cse


# %% formation energies from the CSEs as they are now, i.e. before the correction
# step in the next cell runs -- the order of these three cells matters
df_chgnet["e_form_per_atom_chgnet_uncorrected"] = [
    get_e_form_per_atom(cse) for cse in tqdm(df_chgnet.cse)
]


# %% apply energy corrections
# NOTE(review): presumably process_entries adjusts the entries in place, since the
# corrected energies below are read from df_chgnet.cse rather than from `out` --
# confirm against the installed pymatgen version
out = MaterialsProject2020Compatibility().process_entries(
    df_chgnet.cse, verbose=True, clean=True
)
# sanity check: the number of returned entries must match the number of rows
assert len(out) == len(df_chgnet)


# %% compute corrected formation energies
# same call as for the uncorrected column, now evaluated after the correction step
df_chgnet["e_form_per_atom_chgnet"] = [
    get_e_form_per_atom(cse) for cse in tqdm(df_chgnet.cse)
]

# copy the e_form_col column from the WBM summary frame (aligned on the index) so
# the plots below can compare against it
df_chgnet[e_form_col] = df_summary[e_form_col]


# %% density scatter of the corrected chgnet formation energy against (1) its
# uncorrected counterpart and (2) the e_form_col column from the WBM summary
for y_col in ("e_form_per_atom_chgnet_uncorrected", e_form_col):
    ax = density_scatter(df=df_chgnet, x="e_form_per_atom_chgnet", y=y_col)


# %% load 2019 MEGNet formation energy model
megnet_mp_e_form = load_model("Eform_MP_2019")
# material ID -> MEGNet formation energy prediction; kept at module level so the
# loop below can be re-run and resume where it stopped
megnet_e_form_preds: dict[str, float] = {}


# %% predict formation energies on chgnet relaxed structure with MEGNet
# Iterate df_chgnet.cse rather than df_wbm.cse: only entries with a CHGNet result
# carry a CHGNet-relaxed structure, and predictions for any other WBM entry would be
# discarded by the index-aligned assignment below anyway.
for material_id, cse in tqdm(df_chgnet.cse.items(), total=len(df_chgnet)):
    if material_id in megnet_e_form_preds:
        continue
    try:
        struct = cse.structure
        # single-element unpacking: fails (and is reported below) if the model does
        # not return exactly one value
        [e_form_per_atom] = megnet_mp_e_form.predict_structure(struct)
        megnet_e_form_preds[material_id] = e_form_per_atom
    except Exception as exc:
        # deliberately best-effort: report the failing structure and keep going
        print(f"Failed to predict {material_id=}: {exc}")

# index-aligned assignment: materials without a prediction end up as NaN
df_chgnet["e_form_per_atom_chgnet_megnet"] = pd.Series(megnet_e_form_preds)

# guard against an unexpectedly large number of failed predictions
assert (
    n_isna := df_chgnet.e_form_per_atom_chgnet_megnet.isna().sum()
) < 10, f"{n_isna=}, expected 7 or similar"


# %% density scatter of corrected chgnet formation energies vs MEGNet predictions
ax = density_scatter(
    df=df_chgnet, x="e_form_per_atom_chgnet", y="e_form_per_atom_chgnet_megnet"
)


# %% write the joined results next to this script
df_chgnet = df_chgnet.round(4)
out_path = f"{module_dir}/{today}-chgnet-wbm-{task_type}.json.gz"
csv_path = out_path.replace(".json.gz", ".csv")

# full frame as JSON; as_dict_handler is passed as the fallback serializer
df_chgnet.reset_index().to_json(out_path, default_handler=as_dict_handler)
# numeric columns only as CSV
df_chgnet.select_dtypes("number").to_csv(csv_path)

# snippet for loading previously written results back in:
# in_path = f"{module_dir}/2022-10-31-chgnet-wbm-IS2RE.json.gz"
# df_chgnet_csv = pd.read_csv(in_path.replace(".json.gz", ".csv"))
# df_chgnet = pd.read_json(in_path).set_index("material_id")
39 changes: 39 additions & 0 deletions models/chgnet/metadata.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
model_name: CHGNet
model_version: 0.0.1
matbench_discovery_version: 1.0
date_added: "2023-03-03"
date_published: "2023-03-01"
authors:
- name: Bowen Deng
affiliation: UC Berkeley
- name: Peichen Zhong
affiliation: UC Berkeley
orcid: https://orcid.org/0000-0003-1921-1628
email: zhongpc@berkeley.edu
- name: KyuJung Jun
affiliation: UC Berkeley
orcid: https://orcid.org/0000-0003-1974-028X
- name: Kevin Han
affiliation: UC Berkeley
orcid: https://orcid.org/0000-0002-4028-2108
- name: Christopher J. Bartel
affiliation: University of Minnesota
orcid: https://orcid.org/0000-0002-5198-5036
- name: Gerbrand Ceder
affiliation: UC Berkeley
orcid: https://orcid.org/0000-0001-9275-3605
email: gceder@berkeley.edu
repo: https://github.com/CederGroupHub/chgnet
doi: https://doi.org/10.48550/arXiv.2302.14231
preprint: https://arxiv.org/abs/2302.14231
requirements:
torch: 1.11.0
ase: 3.22.0
pymatgen: 2022.10.22
numpy: 1.24.0
pandas: 1.5.1
trained_on_benchmark: false

notes:
description: The Crystal Hamiltonian Graph Neural Network (CHGNet) is a universal GNN-based interatomic potential trained on energies, forces, stresses and magnetic moments from the MP trajectory dataset containing ∼1.5 million inorganic structures.
training: Using pre-trained model released with preprint. Training set unreleased until after review.
Loading

0 comments on commit da39074

Please sign in to comment.