diff --git a/matbench_discovery/plots.py b/matbench_discovery/plots.py
index 0dd47d9e..cc94fe76 100644
--- a/matbench_discovery/plots.py
+++ b/matbench_discovery/plots.py
@@ -63,6 +63,7 @@ def unit(text: str) -> str:
     formula="Formula",
 )
 model_labels = dict(
+    alignn="ALIGNN",
     bowsr_megnet="BOWSR + MEGNet",
     chgnet="CHGNet",
     chgnet_megnet="CHGNet + MEGNet",
diff --git a/matbench_discovery/preds.py b/matbench_discovery/preds.py
index eb33d0f2..2dc80f1f 100644
--- a/matbench_discovery/preds.py
+++ b/matbench_discovery/preds.py
@@ -60,6 +60,9 @@ class PredFiles(Files):
     # wrenformer 10-member ensemble
     wrenformer = "wrenformer/2022-11-15-wrenformer-IS2RE-preds.csv"
 
+    alignn = "alignn/2023-06-02-alignn-wbm-IS2RE.csv.gz"
+    alignn_pretrained = "alignn/2023-06-03-mp-e-form-alignn-wbm-IS2RE.csv.gz"
+
 
 # model_labels remaps model keys to pretty plot labels (see Files)
 PRED_FILES = PredFiles(root=f"{ROOT}/models", key_map=model_labels)
diff --git a/matbench_discovery/structure.py b/matbench_discovery/structure.py
index 89fe9f4f..4c267235 100644
--- a/matbench_discovery/structure.py
+++ b/matbench_discovery/structure.py
@@ -13,6 +13,9 @@ def perturb_structure(struct: Structure, gamma: float = 1.5) -> Structure:
     """Perturb the atomic coordinates of a pymatgen structure. Used for CGCNN+P
     training set augmentation.
 
+    Not identical but very similar to the perturbation method used in
+    https://nature.com/articles/s41524-022-00891-8#Fig5.
+
     Args:
         struct (Structure): pymatgen structure to be perturbed
        gamma (float, optional): Weibull distribution parameter. Defaults to 1.5.
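
To make the augmentation concrete: below is a minimal sketch of such a Weibull-based perturbation. It is close to, but not guaranteed identical to, the repo's actual implementation; `gamma` is the Weibull shape parameter and the near-zero-norm direction vector edge case is ignored.

```python
import numpy as np
from pymatgen.core import Lattice, Structure


def perturb_structure(struct: Structure, gamma: float = 1.5) -> Structure:
    """Displace each site by a Weibull-distributed distance in a random direction."""
    perturbed = struct.copy()
    for site in perturbed:
        magnitude = np.random.weibull(gamma)  # displacement length in Angstrom
        vec = np.random.randn(3)
        vec *= magnitude / np.linalg.norm(vec)  # random unit vector scaled to length
        site.coords += vec
        site.to_unit_cell(in_place=True)
    return perturbed


# toy usage
struct = Structure(Lattice.cubic(3.5), ["Na", "Cl"], [[0, 0, 0], [0.5, 0.5, 0.5]])
print(perturb_structure(struct))
```
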
print("n_val:", len(val_loader.dataset)) +- print("n_test:", len(test_loader.dataset)) ++ if val_loader.dataset is not None: ++ print("n_val:", len(val_loader.dataset)) ++ if test_loader.dataset is not None: ++ print("n_test:", len(test_loader.dataset)) ++ + return ( + train_loader, + val_loader, +diff --git a/alignn/train.py b/alignn/train.py +index 4fa072e..0b6330a 100644 +--- a/alignn/train.py ++++ b/alignn/train.py +@@ -69,7 +69,7 @@ torch.set_default_dtype(torch.float32) + + device = "cpu" + if torch.cuda.is_available(): +- device = torch.device("cuda") ++ device = torch.device("cuda:0") + + + def activated_output_transform(output): +@@ -817,6 +817,8 @@ def train_dgl( + train_eos = EpochOutputStore() + train_eos.attach(train_evaluator) + ++ best_mae = np.inf ++ + # collect evaluation performance + @trainer.on(Events.EPOCH_COMPLETED) + def log_results(engine): +@@ -839,6 +841,20 @@ def train_dgl( + history["train"][metric].append(tm) + history["validation"][metric].append(vm) + ++## Adapted so that the best model is saved ++## ---------------------------------------------------------------------------- ++ nonlocal best_mae ++ ++ if metric == 'mae' and vm < best_mae: ++ ++ best_mae = vm ++ ++ out_path = os.path.join(config.output_dir, 'best-model.pth') ++ print(f'Saving model with MAE={vm} to file "{out_path}"') ++ torch.save(net.state_dict(), out_path) ++ ++## ---------------------------------------------------------------------------- ++ + # for metric in metrics.keys(): + # history["train"][metric].append(tmetrics[metric]) + # history["validation"][metric].append(vmetrics[metric]) +@@ -978,39 +994,40 @@ def train_dgl( + and not classification + and config.model.output_features == 1 + ): +- net.eval() +- f = open( +- os.path.join(config.output_dir, "prediction_results_test_set.csv"), +- "w", +- ) +- f.write("id,target,prediction\n") +- targets = [] +- predictions = [] +- with torch.no_grad(): +- ids = test_loader.dataset.ids # [test_loader.dataset.indices] +- for dat, id in zip(test_loader, ids): +- g, lg, target = dat +- out_data = net([g.to(device), lg.to(device)]) +- out_data = out_data.cpu().numpy().tolist() +- if config.standard_scalar_and_pca: +- sc = pk.load( +- open(os.path.join(tmp_output_dir, "sc.pkl"), "rb") +- ) +- out_data = sc.transform(np.array(out_data).reshape(-1, 1))[ +- 0 +- ][0] +- target = target.cpu().numpy().flatten().tolist() +- if len(target) == 1: +- target = target[0] +- f.write("%s, %6f, %6f\n" % (id, target, out_data)) +- targets.append(target) +- predictions.append(out_data) +- f.close() ++ if test_loader is not None: ++ net.eval() ++ f = open( ++ os.path.join(config.output_dir, "prediction_results_test_set.csv"), ++ "w", ++ ) ++ f.write("id,target,prediction\n") ++ targets = [] ++ predictions = [] ++ with torch.no_grad(): ++ ids = test_loader.dataset.ids # [test_loader.dataset.indices] ++ for dat, id in zip(test_loader, ids): ++ g, lg, target = dat ++ out_data = net([g.to(device), lg.to(device)]) ++ out_data = out_data.cpu().numpy().tolist() ++ if config.standard_scalar_and_pca: ++ sc = pk.load( ++ open(os.path.join(tmp_output_dir, "sc.pkl"), "rb") ++ ) ++ out_data = sc.transform(np.array(out_data).reshape(-1, 1))[ ++ 0 ++ ][0] ++ target = target.cpu().numpy().flatten().tolist() ++ if len(target) == 1: ++ target = target[0] ++ f.write("%s, %6f, %6f\n" % (id, target, out_data)) ++ targets.append(target) ++ predictions.append(out_data) ++ f.close() + +- print( +- "Test MAE:", +- mean_absolute_error(np.array(targets), np.array(predictions)), +- ) ++ print( 
++ "Test MAE:", ++ mean_absolute_error(np.array(targets), np.array(predictions)), ++ ) + if config.store_outputs and not classification: + x = [] + y = [] +diff --git a/alignn/train_folder.py b/alignn/train_folder.py +index b532d4e..c9eb4a5 100644 +--- a/alignn/train_folder.py ++++ b/alignn/train_folder.py +@@ -180,7 +180,7 @@ def train_for_folder( + train_val_test_loaders=[ + train_loader, + val_loader, +- test_loader, ++ None, + prepare_batch, + ], + ) diff --git a/models/alignn/alignn-config.json b/models/alignn/alignn-config.json new file mode 100644 index 00000000..2ae7b43f --- /dev/null +++ b/models/alignn/alignn-config.json @@ -0,0 +1,54 @@ +{ + "version": "112bbedebdaecf59fb18e11c929080fb2f358246", + "dataset": "user_data", + "target": "target", + "atom_features": "cgcnn", + "neighbor_strategy": "k-nearest", + "id_tag": "jid", + "random_seed": 123, + "classification_threshold": null, + "n_val": null, + "n_test": null, + "n_train": null, + "train_ratio": 0.9, + "val_ratio": 0.1, + "test_ratio": 0.0, + "target_multiplication_factor": null, + "epochs": 1000, + "n_early_stopping": 100, + "batch_size": 128, + "weight_decay": 1e-5, + "learning_rate": 0.001, + "filename": "sample", + "warmup_steps": 2000, + "criterion": "l1", + "optimizer": "adamw", + "scheduler": "onecycle", + "pin_memory": true, + "save_dataloader": false, + "write_checkpoint": true, + "write_predictions": true, + "store_outputs": true, + "progress": true, + "log_tensorboard": false, + "standard_scalar_and_pca": false, + "use_canonize": true, + "num_workers": 4, + "cutoff": 8.0, + "max_neighbors": 12, + "keep_data_order": false, + "model": { + "name": "alignn", + "alignn_layers": 4, + "gcn_layers": 4, + "atom_input_features": 92, + "edge_input_features": 80, + "triplet_input_features": 40, + "embedding_features": 64, + "hidden_features": 256, + "output_features": 1, + "link": "identity", + "zero_inflated": false, + "classification": false + } +} diff --git a/models/alignn/make_train_data.py b/models/alignn/make_train_data.py new file mode 100644 index 00000000..b52507e1 --- /dev/null +++ b/models/alignn/make_train_data.py @@ -0,0 +1,60 @@ +# %% Imports +import os + +import pandas as pd +from pymatgen.core import Structure +from sklearn.model_selection import train_test_split +from tqdm import tqdm, trange + +from matbench_discovery.data import DATA_FILES +from matbench_discovery.structure import perturb_structure + +__author__ = "Philipp Benner" +__date__ = "2023-06-02" + + +# %% +target_col = "formation_energy_per_atom" +input_col = "structure" +id_col = "material_id" +n_perturb = 0 + + +# %% load structures +df_cse = pd.read_json(DATA_FILES.mp_computed_structure_entries).set_index(id_col) +df_cse[input_col] = [ + Structure.from_dict(cse[input_col]) for cse in tqdm(df_cse.entry, disable=None) +] + +# load energies +df = pd.read_csv(DATA_FILES.mp_energies).set_index(id_col) +df[input_col] = df_cse[input_col] +assert target_col in df + + +# %% augment with randomly perturbed structures +df_aug = df.copy() +structs = df_aug.pop(input_col) +for idx in trange(n_perturb, desc="Generating perturbed structures"): + df_aug[input_col] = [perturb_structure(x) for x in structs] + df = pd.concat([df, df_aug.set_index(f"{x}-aug={idx+1}" for x in df_aug.index)]) + +del df_aug + + +# %% export data +X_train, X_test, y_train, y_test = train_test_split( + df[input_col], df[target_col], test_size=0.05, random_state=42 +) + +for samples, targets, label in ((X_train, y_train, "train"), (X_test, y_test, "test")): + out_dir = f"{label}-data" 
diff --git a/models/alignn/metadata.yml b/models/alignn/metadata.yml
new file mode 100644
index 00000000..f7ed783a
--- /dev/null
+++ b/models/alignn/metadata.yml
@@ -0,0 +1,30 @@
+model_name: ALIGNN
+model_version: 2023.01.10
+matbench_discovery_version: 1.0
+date_added: "2023-06-02"
+date_published: "2021-02-22"
+authors:
+  - name: Kamal Choudhary
+    affiliation: National Institute of Standards and Technology
+    email: kamal.choudhary@nist.gov
+    orcid: https://orcid.org/0000-0001-9737-8074
+  - name: Brian DeCost
+    affiliation: National Institute of Standards and Technology
+    orcid: https://orcid.org/0000-0002-3459-5888
+  - name: Philipp Benner
+    affiliation: Bundesanstalt für Materialforschung und -prüfung BAM
+    orcid: https://orcid.org/0000-0002-0912-8137
+    github: https://github.com/pbenner
+repo: https://github.com/usnistgov/alignn
+url: https://jarvis.nist.gov/jalignn
+doi: https://nature.com/articles/s41524-021-00650-1
+preprint: https://arxiv.org/abs/2209.05554
+requirements:
+  ase: 3.22.0
+  dgl-cu111: 0.6.1
+  numpy: 1.24.3
+  pandas: 2.0.1
+  scikit-learn: 1.2.2
+  torch: 1.9.0+cu111
+trained_for_benchmark: true
+# hyperparams: see alignn-config.json
diff --git a/models/alignn/readme.md b/models/alignn/readme.md
new file mode 100644
index 00000000..dd809767
--- /dev/null
+++ b/models/alignn/readme.md
@@ -0,0 +1,21 @@
+## ALIGNN formation energy predictions on WBM test set
+
+ALIGNN is trained for 1000 epochs using an L1 loss. The model that performs best on the validation set is saved and used for predictions (this requires a minor adaptation of the ALIGNN source). The modifications to the ALIGNN source code are provided as the patch `alignn-2023.01.10.patch`, which was applied to ALIGNN version `2023.01.10`. In addition, all Python requirements are given in `requirements.txt`.
+
+To reproduce the `alignn` package state used for this submission, run
+
+```bash
+pip install alignn==2023.01.10
+alignn_dir=$(python -c "import alignn; print(alignn.__path__[0])")
+cd $alignn_dir
+git apply /path/to/alignn-2023.01.10.patch
+```
+
+Replace `/path/to/` with the actual path to the patch file.
+
+This directory contains the following files, which must be executed in the given order to reproduce the results:
+
+1. `make_train_data.py`: Export Matbench Discovery training data in ALIGNN-compatible format. This script outputs training data to the directory `train-data`. In addition, a small test set is split off and stored in the directory `test-data`.
+1. `train_alignn.py`: Train an ALIGNN model on the previously exported data. The resulting model is stored in the directory `data-train-result`.
+1. `test_data.py`: Export WBM test data in ALIGNN-compatible format. The data is stored in the directory `data-test-wbm`.
+1. `test_alignn.py`: Test a trained ALIGNN model on the WBM data. Predictions are written to a date-stamped `.csv.gz` file in this directory.
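
For reference, the on-disk format these scripts exchange is deliberately simple: a `targets.csv` keyed by material ID plus one POSCAR file per structure. A sketch of reading it back, assuming the default `train-data` output directory and the column names used in `make_train_data.py`:

```python
import glob
import os

import pandas as pd
from pymatgen.core import Structure

# formation-energy targets, indexed by material ID
targets = pd.read_csv("train-data/targets.csv", index_col="material_id")

# structures, one POSCAR file per material
structures = {
    os.path.basename(path).removesuffix(".poscar"): Structure.from_file(path)
    for path in glob.glob("train-data/*.poscar")
}
print(f"{len(targets)=:,} {len(structures)=:,}")
```
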
diff --git a/models/alignn/test_alignn.py b/models/alignn/test_alignn.py
new file mode 100644
index 00000000..44f50770
--- /dev/null
+++ b/models/alignn/test_alignn.py
@@ -0,0 +1,137 @@
+# %%
+from __future__ import annotations
+
+import json
+import os
+from importlib.metadata import version
+
+import pandas as pd
+import torch
+import wandb
+from alignn.config import TrainingConfig
+from alignn.models.alignn import ALIGNN
+from alignn.pretrained import all_models, get_figshare_model
+from jarvis.core.graphs import Graph
+from pymatgen.core import Structure
+from pymatgen.io.jarvis import JarvisAtomsAdaptor
+from sklearn.metrics import r2_score
+from tqdm import tqdm
+
+from matbench_discovery import DEBUG, today
+from matbench_discovery.data import DATA_FILES, df_wbm
+from matbench_discovery.plots import wandb_scatter
+from matbench_discovery.slurm import slurm_submit
+
+__author__ = "Janosh Riebesell, Philipp Benner"
+__date__ = "2023-06-03"
+
+module_dir = os.path.dirname(__file__)
+
+
+# %%
+model_name = "mp_e_form_alignn"  # pre-trained by NIST
+# model_name = f"{module_dir}/data-train-result/best-model.pth"
+task_type = "IS2RE"
+target_col = "e_form_per_atom_mp2020_corrected"
+input_col = "initial_structure"
+id_col = "material_id"
+device = "cuda" if torch.cuda.is_available() else "cpu"
+job_name = f"{model_name}-wbm-{task_type}{'-debug' if DEBUG else ''}"
+out_dir = os.getenv("SBATCH_OUTPUT", f"{module_dir}/{today}-{job_name}")
+
+
+if model_name in all_models:  # load pre-trained model
+    model = get_figshare_model(model_name)
+    pred_col = "e_form_per_atom_alignn_pretrained"
+elif os.path.isfile(model_name):
+    pred_col = "e_form_per_atom_alignn"
+    with open(f"{module_dir}/alignn-config.json") as file:
+        config = TrainingConfig(**json.load(file))
+
+    model = ALIGNN(config.model)
+    # load trained ALIGNN model
+    state_dict = torch.load(model_name, map_location=device)
+    model.load_state_dict(state_dict)
+    model = model.to(device)
+else:
+    raise ValueError(
+        f"{model_name=} not found, train a model or use pre-trained {list(all_models)}"
+    )
+
+slurm_vars = slurm_submit(
+    job_name=job_name,
+    partition="ampere",
+    account="LEE-SL3-GPU",
+    time="12:0:0",
+    out_dir=out_dir,
+    slurm_flags="--nodes 1 --gpus-per-node 1",
+    pre_cmd=". /etc/profile.d/modules.sh; module load rhel8/default-amp;"
+    "module load cuda/11.8",
+)
+
+
+# %% Load data
+data_path = {
+    "IS2RE": DATA_FILES.wbm_initial_structures,
+    "RS2RE": DATA_FILES.wbm_computed_structure_entries,
+}[task_type]
+input_col = {"IS2RE": "initial_structure", "RS2RE": "relaxed_structure"}[task_type]
+
+df_in = pd.read_json(data_path).set_index(id_col)
+
+df_in[target_col] = df_wbm[target_col]
+if task_type == "RS2RE":
+    df_in[input_col] = [x["structure"] for x in df_in.computed_structure_entry]
+assert input_col in df_in, f"{input_col=} not in {list(df_in)}"
+
+df_in[input_col] = [
+    JarvisAtomsAdaptor.get_atoms(Structure.from_dict(x))
+    for x in tqdm(df_in[input_col], leave=False, desc="Converting to JARVIS atoms")
+]
+
+
+# %%
+run_params = dict(
+    data_path=data_path,
+    **{f"{dep}_version": version(dep) for dep in ("alignn", "numpy")},
+    model_name=model_name,
+    task_type=task_type,
+    target_col=target_col,
+    df=dict(shape=str(df_in.shape), columns=", ".join(df_in)),
+    slurm_vars=slurm_vars,
+)
+
+wandb.init(project="matbench-discovery", name=job_name, config=run_params)
+
+
+# %% Predict
+model.eval()
+e_form_preds: dict[str, float] = {}
+with torch.no_grad():  # get predictions
+    for material_id, atoms in tqdm(
+        df_in[input_col].items(),
+        total=len(df_in),
+        desc=f"Predicting {target_col=} {task_type}",
+    ):
+        atom_graph, line_graph = Graph.atom_dgl_multigraph(atoms)
+        e_form = model([atom_graph.to(device), line_graph.to(device)]).item()
+
+        e_form_preds[material_id] = e_form
+
+df_wbm[pred_col] = e_form_preds
+
+df_wbm[pred_col] -= df_wbm.e_correction_per_atom_mp_legacy
+df_wbm[pred_col] += df_wbm.e_correction_per_atom_mp2020
+
+df_wbm[pred_col].round(4).to_csv(f"{module_dir}/{today}-{model_name}-wbm-IS2RE.csv.gz")
+
+
+# %%
+table = wandb.Table(dataframe=df_wbm[[target_col, pred_col]].reset_index())
+
+MAE = (df_wbm[target_col] - df_wbm[pred_col]).abs().mean()
+R2 = r2_score(df_wbm[target_col], df_wbm[pred_col])
+title = f"{model_name} {task_type} {MAE=:.4} {R2=:.4}"
+print(title)
+
+wandb_scatter(table, fields=dict(x=target_col, y=pred_col), title=title)
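
One step in `test_alignn.py` worth flagging: the pretrained model was fitted to formation energies referenced against MP's legacy energy corrections, while the benchmark targets use MP2020 corrections, so the script removes the former and applies the latter per material. Schematically, with made-up numbers for illustration (not real WBM values):

```python
# all values in eV/atom; numbers are illustrative only
e_form_pred = -1.25  # raw model prediction (legacy MP correction baked in)
legacy_correction = -0.10  # MP legacy energy correction per atom
mp2020_correction = -0.15  # MP2020 energy correction per atom

# subtract the legacy correction, then add the MP2020 one
e_form_pred_mp2020 = e_form_pred - legacy_correction + mp2020_correction
print(f"{e_form_pred_mp2020=:.2f}")  # -1.30
```
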
diff --git a/models/alignn/train_alignn.py b/models/alignn/train_alignn.py
new file mode 100644
index 00000000..e6243cd2
--- /dev/null
+++ b/models/alignn/train_alignn.py
@@ -0,0 +1,19 @@
+"""Train an ALIGNN model on the MP formation energies exported by make_train_data.py."""
+
+
+# %%
+from alignn.train_folder import train_for_folder
+
+__author__ = "Philipp Benner"
+__date__ = "2023-06-02"
+
+
+# %%
+train_for_folder(
+    root_dir="train-data",
+    config_name="alignn-config.json",
+    keep_data_order=False,
+    output_dir="data-train-result",
+    epochs=1000,
+    file_format="poscar",
+)
diff --git a/models/cgcnn/test_cgcnn.py b/models/cgcnn/test_cgcnn.py
index 1f2c7f30..55b52cf8 100644
--- a/models/cgcnn/test_cgcnn.py
+++ b/models/cgcnn/test_cgcnn.py
@@ -60,7 +60,7 @@
 df[input_col] = [x["structure"] for x in df.computed_structure_entry]
 assert input_col in df, f"{input_col=} not in {list(df)}"
 
-df[input_col] = [Structure.from_dict(x) for x in tqdm(df[input_col], disable=None)]
+df[input_col] = [Structure.from_dict(dct) for dct in tqdm(df[input_col], disable=None)]
 
 filters = {
     # "display_name": {"$regex": "^train-cgcnn-robust-augment=3-"},
@@ -117,7 +117,7 @@
 )
 
 slurm_job_id = os.getenv("SLURM_JOB_ID", "debug")
-df.round(4).to_csv(f"{out_dir}/{job_name}-preds-{slurm_job_id}.csv")
+df.round(4).to_csv(f"{out_dir}/{job_name}-preds-{slurm_job_id}.csv.gz")
 pred_col = f"{e_form_col}_pred_ens"
 assert pred_col in df, f"{pred_col=} not in {list(df)}"
 table = wandb.Table(dataframe=df[[e_form_col, pred_col]].reset_index())
diff --git a/models/cgcnn/train_cgcnn.py b/models/cgcnn/train_cgcnn.py
index dad4c803..b3c34f0f 100644
--- a/models/cgcnn/train_cgcnn.py
+++ b/models/cgcnn/train_cgcnn.py
@@ -29,9 +29,10 @@
 target_col = "formation_energy_per_atom"
 input_col = "structure"
 id_col = "material_id"
-perturb = 0  # 0 for no perturbation, n>1 means train on n perturbations of each crystal
+# 0 for no perturbation, n>1 means train on n perturbations of each crystal
 # in the training set all assigned the same original target energy
-job_name = f"train-cgcnn-robust-{perturb=}{'-debug' if DEBUG else ''}"
+n_perturb = 0
+job_name = f"train-cgcnn-robust-{n_perturb=}{'-debug' if DEBUG else ''}"
 print(f"{job_name=}")
 robust = "robust" in job_name.lower()
 ensemble_size = 10
@@ -61,19 +62,23 @@
 # %% data
 data_path = DATA_FILES.mp_energies
 print(f"{data_path=}")
-df = pd.read_json(data_path).set_index(id_col)
-df[input_col] = [Structure.from_dict(s) for s in tqdm(df[input_col], disable=None)]
-assert target_col in df
+df_in = pd.read_json(data_path).set_index(id_col)
+df_in[input_col] = [
+    Structure.from_dict(s) for s in tqdm(df_in[input_col], disable=None)
+]
+assert target_col in df_in
 
-df_aug = df.copy()
+df_aug = df_in.copy()
 structs = df_aug.pop(input_col)
-for idx in trange(perturb, desc="Generating perturbed structures"):
+for idx in trange(n_perturb, desc="Generating perturbed structures"):
     df_aug[input_col] = [perturb_structure(x) for x in structs]
-    df = pd.concat([df, df_aug.set_index(f"{x}-aug={idx+1}" for x in df_aug.index)])
+    df_in = pd.concat(
+        [df_in, df_aug.set_index(f"{x}-aug={idx+1}" for x in df_aug.index)]
+    )
 
 del df_aug
 
-train_df, test_df = df_train_test_split(df, test_size=0.05)
+train_df, test_df = df_train_test_split(df_in, test_size=0.05)
 print(f"{train_df.shape=}")
 
 train_data = CrystalGraphData(train_df, task_dict={target_col: task_type})
@@ -88,7 +93,7 @@
 )
 
 # 1 for regression, n_classes for classification
-n_targets = [1 if task_type == "regression" else df[target_col].max() + 1]
+n_targets = [1 if task_type == "regression" else df_in[target_col].max() + 1]
 
 model_params = dict(
     n_targets=n_targets,
@@ -106,7 +111,7 @@
     train_df=dict(shape=str(train_data.df.shape), columns=", ".join(train_df)),
     test_df=dict(shape=str(test_data.df.shape), columns=", ".join(test_df)),
     slurm_vars=slurm_vars,
-    perturb=perturb,
+    n_perturb=n_perturb,
     input_col=input_col,
 )
 
diff --git a/models/chgnet/join_chgnet_results.py b/models/chgnet/join_chgnet_results.py
index 691a1242..ea26fc41 100644
--- a/models/chgnet/join_chgnet_results.py
+++ b/models/chgnet/join_chgnet_results.py
@@ -66,7 +66,7 @@
 # %%
 out_path = f"{module_dir}/{today}-chgnet-wbm-{task_type}"
 df_chgnet = df_chgnet.round(4)
-df_chgnet.select_dtypes("number").to_csv(f"{out_path}.csv")
+df_chgnet.select_dtypes("number").to_csv(f"{out_path}.csv.gz")
 df_chgnet.reset_index().to_json(f"{out_path}.json.gz", default_handler=as_dict_handler)
 
 # in_path = f"{module_dir}/2023-03-04-chgnet-wbm-IS2RE.json.gz"
diff --git a/models/m3gnet/join_m3gnet_results.py b/models/m3gnet/join_m3gnet_results.py
index 8396934e..1ef88e39 100644
--- a/models/m3gnet/join_m3gnet_results.py
+++ b/models/m3gnet/join_m3gnet_results.py
@@ -93,7 +93,7 @@
 # %%
 out_path = f"{module_dir}/{today}-m3gnet-{model_type}-wbm-{task_type}"
 df_m3gnet = df_m3gnet.round(4)
-df_m3gnet.select_dtypes("number").to_csv(f"{out_path}.csv")
+df_m3gnet.select_dtypes("number").to_csv(f"{out_path}.csv.gz")
 df_m3gnet.reset_index().to_json(f"{out_path}.json.gz", default_handler=as_dict_handler)
diff --git a/models/megnet/test_megnet.py b/models/megnet/test_megnet.py
index f143e687..34e28b01 100644
--- a/models/megnet/test_megnet.py
+++ b/models/megnet/test_megnet.py
@@ -51,7 +51,7 @@
 # %%
 slurm_array_task_id = int(os.getenv("SLURM_ARRAY_TASK_ID", "0"))
 
-out_path = f"{out_dir}/megnet-e-form-preds.csv"
+out_path = f"{out_dir}/megnet-e-form-preds.csv.gz"
 if os.path.isfile(out_path):
     raise SystemExit(f"{out_path = } already exists, exiting early")
 
@@ -121,8 +121,7 @@
     + df_wbm.e_correction_per_atom_mp2020
 ).to_frame(name=pred_col)
 
-df_megnet.round(4).to_csv("2022-11-18-megnet-wbm-IS2RE/megnet-e-form-preds.csv")
-
+df_megnet.round(4).to_csv(out_path)
 # df_megnet = pd.read_csv(f"{ROOT}/models/{PRED_FILES.megnet}").set_index("material_id")
 
 
diff --git a/models/voronoi/train_test_voronoi_rf.py b/models/voronoi/train_test_voronoi_rf.py
index 60f7635f..20a878c9 100644
--- a/models/voronoi/train_test_voronoi_rf.py
+++ b/models/voronoi/train_test_voronoi_rf.py
@@ -28,7 +28,7 @@
 print(f"{task_type=}")
 
 out_dir = f"{module_dir}/{today}-train-test"
-out_path = f"{out_dir}/e-form-preds-{task_type}.csv"
+out_path = f"{out_dir}/e-form-preds-{task_type}.csv.gz"
 if os.path.isfile(out_path):
     raise SystemExit(f"{out_path = } already exists, exiting early")
 
diff --git a/models/voronoi/voronoi_featurize_dataset.py b/models/voronoi/voronoi_featurize_dataset.py
index 839ea7a0..964e9d25 100644
--- a/models/voronoi/voronoi_featurize_dataset.py
+++ b/models/voronoi/voronoi_featurize_dataset.py
@@ -70,7 +70,9 @@
 elif data_name == "wbm" and input_col == "initial_structure":
     struct_dicts = df_in.initial_structure
 
-df_in[input_col] = [Structure.from_dict(x) for x in tqdm(struct_dicts, disable=None)]
+df_in[input_col] = [
+    Structure.from_dict(dct) for dct in tqdm(struct_dicts, disable=None)
+]
 
 
 # %%
diff --git a/models/wrenformer/test_wrenformer.py b/models/wrenformer/test_wrenformer.py
index 25180974..1399ea51 100644
--- a/models/wrenformer/test_wrenformer.py
+++ b/models/wrenformer/test_wrenformer.py
@@ -104,7 +104,7 @@
 df = df.round(4)
 
 slurm_job_id = os.getenv("SLURM_JOB_ID", "debug")
-df.to_csv(f"{out_dir}/{job_name}-preds-{slurm_job_id}.csv")
+df.to_csv(f"{out_dir}/{job_name}-preds-{slurm_job_id}.csv.gz")
 
 
 # %%
diff --git a/pyproject.toml b/pyproject.toml
index 8b8e8225..e00c4851 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -55,7 +55,9 @@ Package = "https://pypi.org/project/matbench-discovery"
 
 test = ["pytest", "pytest-cov"]
 # how to specify git deps: https://stackoverflow.com/a/73572379
 running-models = [
+    "alignn",
     "chgnet",
+    "jarvis-tools",
     # torch needs to install before aviary
     "torch",
diff --git a/scripts/calc_wandb_model_runtimes.py b/scripts/calc_wandb_model_runtimes.py
index 9889b995..aeba003b 100644
--- a/scripts/calc_wandb_model_runtimes.py
+++ b/scripts/calc_wandb_model_runtimes.py
@@ -18,7 +18,7 @@
 from pymatviz.utils import save_fig
 from tqdm import tqdm
 
-from matbench_discovery import FIGS, PDF_FIGS, WANDB_PATH
+from matbench_discovery import FIGS, MODELS, PDF_FIGS, WANDB_PATH
 from matbench_discovery.preds import df_metrics, df_preds
 
 __author__ = "Janosh Riebesell"
@@ -122,7 +122,7 @@
 df_stats.attrs["All Models Run Time"] = df_stats[time_col].sum()
 print(f"{df_stats[time_col].sum()=:.0f} hours")
 
-# df_stats.round(2).to_json(f"{MODELS}/model-stats.json", orient="index")
+df_stats.round(2).to_json(f"{MODELS}/model-stats.json", orient="index")
 df_time = (
     df_stats.sort_index()
     .filter(like=time_col)
diff --git a/scripts/make_metrics_tables.py b/scripts/make_metrics_tables.py
index 0d07a910..49f0b121 100644
--- a/scripts/make_metrics_tables.py
+++ b/scripts/make_metrics_tables.py
@@ -46,7 +46,8 @@
 
 
 # %%
-ontology = {
+ontology = {  # (training type, test type, model type)
+    "ALIGNN": ("RS2RE", "IS2RE", "GNN"),
     "CHGNet": ("S2EFSM", "IS2RE-SR", "UIP-GNN"),
     "M3GNet": ("S2EFS", "IS2RE-SR", "UIP-GNN"),
     "MEGNet": ("RS2RE", "IS2E", "GNN"),
@@ -74,8 +75,8 @@
 
 for label, df, extra_hide_metrics in (
     # hide redundant metrics (TPR = Recall, FPR = 1 - TNR, FNR = 1 - TPR)
-    ("", df_metrics, []),
     ("-first-10k", df_metrics_10k, ["TPR", "TNR"]),
+    ("", df_metrics, []),
 ):
     df_table = pd.concat([df, df_ont]).rename(index={"R2": R2_col})
     df_table.index.name = "Model"
diff --git a/scripts/readme.md b/scripts/readme.md
index 0e571a4b..e95c6947 100644
--- a/scripts/readme.md
+++ b/scripts/readme.md
@@ -1,8 +1 @@
-# About `scripts/`
-
-This folder contains
-
-- plotting scripts
-- [`ctk_structure_viewer.py`](ctk_structure_viewer.py) a [Crystal Toolkit app](https://github.com/materialsproject/crystaltoolkit) to 3d-render crystals when hovering corresponding points in a `plotly` figure
-- [`update_wandb_runs.py`](update_wandb_runs.py) use the [WandB](https://wandb.ai/janosh/matbench-discovery) API to update run metadata
-- [`make_api_docs.py`](make_api_docs.py): [auto-generate markdown API docs](https://github.com/ml-tooling/lazydocs) for the `matbench-discovery` PyPI package
+See the module docstrings for the purpose of each script.
diff --git a/scripts/scatter_e_above_hull_models.py b/scripts/scatter_e_above_hull_models.py
index c2c6105e..f9927dd8 100644
--- a/scripts/scatter_e_above_hull_models.py
+++ b/scripts/scatter_e_above_hull_models.py
@@ -52,7 +52,7 @@
 legend_order = list(df_metrics.T.MAE.sort_values().index)
 
 
-# %% determine each point's classification to color them by
+# determine each point's classification to color them by
 true_pos, false_neg, false_pos, true_neg = classify_stable(
     df_bin[each_true_col], df_bin[each_pred_col]
 )
@@ -220,7 +220,7 @@
     **axis_titles,
 )
 # fig.layout.height = 1000
-fig.layout.width = 1100
+# fig.layout.width = 1100
 fig.layout.margin.update(l=40, r=10, t=10, b=50)
 fig.update_xaxes(matches=None)
 fig.update_yaxes(matches=None)
diff --git a/site/src/app.css b/site/src/app.css
index 6fb74511..f5f2b95a 100644
--- a/site/src/app.css
+++ b/site/src/app.css
@@ -68,7 +68,6 @@ button {
   border-radius: 3pt;
   background: teal;
   padding: 2pt 4pt;
-  font-size: 12pt;
 }
 a {
   color: var(--blue);
diff --git a/site/src/figs/metrics-table-first-10k.svelte b/site/src/figs/metrics-table-first-10k.svelte
index b96c8cb3..b5937210 100644
--- a/site/src/figs/metrics-table-first-10k.svelte
+++ b/site/src/figs/metrics-table-first-10k.svelte
@@ -51,70 +51,81 @@