janosh · janosh · Jun 4, 2023 · Jun 2, 2023 · Jun 2, 2023 · Jun 2, 2023
diff --git a/matbench_discovery/plots.py b/matbench_discovery/plots.py
@@ -63,6 +63,7 @@ def unit(text: str) -> str:
     formula="Formula",
 )
 model_labels = dict(
+    alignn="ALIGNN",
     bowsr_megnet="BOWSR + MEGNet",
     chgnet="CHGNet",
     chgnet_megnet="CHGNet + MEGNet",

diff --git a/matbench_discovery/preds.py b/matbench_discovery/preds.py
@@ -60,6 +60,9 @@ class PredFiles(Files):
     # wrenformer 10-member ensemble
     wrenformer = "wrenformer/2022-11-15-wrenformer-IS2RE-preds.csv"
 
+    alignn = "alignn/2023-06-02-alignn-wbm-IS2RE.csv.gz"
+    alignn_pretrained = "alignn/2023-06-03-mp-e-form-alignn-wbm-IS2RE.csv.gz"
+
 
 # model_labels remaps model keys to pretty plot labels (see Files)
 PRED_FILES = PredFiles(root=f"{ROOT}/models", key_map=model_labels)

diff --git a/matbench_discovery/structure.py b/matbench_discovery/structure.py
@@ -13,6 +13,9 @@ def perturb_structure(struct: Structure, gamma: float = 1.5) -> Structure:
     """Perturb the atomic coordinates of a pymatgen structure. Used for CGCNN+P
     training set augmentation.
 
+    Not identical but very similar to the perturbation method used in
+    https://nature.com/articles/s41524-022-00891-8#Fig5.
+
     Args:
         struct (Structure): pymatgen structure to be perturbed
         gamma (float, optional): Weibull distribution parameter. Defaults to 1.5.

diff --git a/models/alignn/2023-06-02-alignn-wbm-IS2RE.csv.gz b/models/alignn/2023-06-02-alignn-wbm-IS2RE.csv.gz
diff --git a/models/alignn/2023-06-03-mp-e-form-alignn-wbm-IS2RE.csv.gz b/models/alignn/2023-06-03-mp-e-form-alignn-wbm-IS2RE.csv.gz
diff --git a/models/alignn/alignn-2023.01.10.patch b/models/alignn/alignn-2023.01.10.patch
@@ -0,0 +1,179 @@
+diff --git a/alignn/data.py b/alignn/data.py
+index 175b915..42ce26e 100644
+--- a/alignn/data.py
++++ b/alignn/data.py
+@@ -171,8 +171,9 @@ def get_id_train_val_test(
+     # full train/val test split
+     # ids = ids[::-1]
+     id_train = ids[:n_train]
+-    id_val = ids[-(n_val + n_test) : -n_test]  # noqa:E203
+-    id_test = ids[-n_test:]
++    id_val = ids[-(n_val + n_test) : -n_test] if n_test > 0 else ids[-(n_val + n_test) :]  # noqa:E203
++    id_test = ids[-n_test:] if n_test > 0 else []  # last n_test ids; ids[-0:] would be the whole list
++
+     return id_train, id_val, id_test
+
+
+@@ -508,7 +509,7 @@ def get_train_val_loaders(
+             classification=classification_threshold is not None,
+             output_dir=output_dir,
+             tmp_name="test_data",
+-        )
++        ) if len(dataset_test) > 0 else None
+
+         collate_fn = train_data.collate
+         # print("line_graph,line_dih_graph", line_graph, line_dih_graph)
+@@ -528,7 +529,7 @@ def get_train_val_loaders(
+
+         val_loader = DataLoader(
+             val_data,
+-            batch_size=batch_size,
++            batch_size=1,
+             shuffle=False,
+             collate_fn=collate_fn,
+             drop_last=True,
+@@ -549,9 +550,13 @@ def get_train_val_loaders(
+             torch.save(train_loader, train_sample)
+             torch.save(val_loader, val_sample)
+             torch.save(test_loader, test_sample)
++
+     print("n_train:", len(train_loader.dataset))
+-    print("n_val:", len(val_loader.dataset))
+-    print("n_test:", len(test_loader.dataset))
++    if val_loader.dataset is not None:
++        print("n_val:", len(val_loader.dataset))
++    if test_loader.dataset is not None:
++        print("n_test:", len(test_loader.dataset))
++
+     return (
+         train_loader,
+         val_loader,
+diff --git a/alignn/train.py b/alignn/train.py
+index 4fa072e..0b6330a 100644
+--- a/alignn/train.py
++++ b/alignn/train.py
+@@ -69,7 +69,7 @@ torch.set_default_dtype(torch.float32)
+
+ device = "cpu"
+ if torch.cuda.is_available():
+-    device = torch.device("cuda")
++    device = torch.device("cuda:0")
+
+
+ def activated_output_transform(output):
+@@ -817,6 +817,8 @@ def train_dgl(
+         train_eos = EpochOutputStore()
+         train_eos.attach(train_evaluator)
+
++    best_mae = np.inf
++
+     # collect evaluation performance
+     @trainer.on(Events.EPOCH_COMPLETED)
+     def log_results(engine):
+@@ -839,6 +841,20 @@ def train_dgl(
+             history["train"][metric].append(tm)
+             history["validation"][metric].append(vm)
+
++## Adapted so that the best model is saved
++## ----------------------------------------------------------------------------
++            nonlocal best_mae
++
++            if metric == 'mae' and vm < best_mae:
++
++                best_mae = vm
++
++                out_path = os.path.join(config.output_dir, 'best-model.pth')
++                print(f'Saving model with MAE={vm} to file "{out_path}"')
++                torch.save(net.state_dict(), out_path)
++
++## ----------------------------------------------------------------------------
++
+         # for metric in metrics.keys():
+         #    history["train"][metric].append(tmetrics[metric])
+         #    history["validation"][metric].append(vmetrics[metric])
+@@ -978,39 +994,40 @@ def train_dgl(
+         and not classification
+         and config.model.output_features == 1
+     ):
+-        net.eval()
+-        f = open(
+-            os.path.join(config.output_dir, "prediction_results_test_set.csv"),
+-            "w",
+-        )
+-        f.write("id,target,prediction\n")
+-        targets = []
+-        predictions = []
+-        with torch.no_grad():
+-            ids = test_loader.dataset.ids  # [test_loader.dataset.indices]
+-            for dat, id in zip(test_loader, ids):
+-                g, lg, target = dat
+-                out_data = net([g.to(device), lg.to(device)])
+-                out_data = out_data.cpu().numpy().tolist()
+-                if config.standard_scalar_and_pca:
+-                    sc = pk.load(
+-                        open(os.path.join(tmp_output_dir, "sc.pkl"), "rb")
+-                    )
+-                    out_data = sc.transform(np.array(out_data).reshape(-1, 1))[
+-                        0
+-                    ][0]
+-                target = target.cpu().numpy().flatten().tolist()
+-                if len(target) == 1:
+-                    target = target[0]
+-                f.write("%s, %6f, %6f\n" % (id, target, out_data))
+-                targets.append(target)
+-                predictions.append(out_data)
+-        f.close()
++        if test_loader is not None:
++            net.eval()
++            f = open(
++                os.path.join(config.output_dir, "prediction_results_test_set.csv"),
++                "w",
++            )
++            f.write("id,target,prediction\n")
++            targets = []
++            predictions = []
++            with torch.no_grad():
++                ids = test_loader.dataset.ids  # [test_loader.dataset.indices]
++                for dat, id in zip(test_loader, ids):
++                    g, lg, target = dat
++                    out_data = net([g.to(device), lg.to(device)])
++                    out_data = out_data.cpu().numpy().tolist()
++                    if config.standard_scalar_and_pca:
++                        sc = pk.load(
++                            open(os.path.join(tmp_output_dir, "sc.pkl"), "rb")
++                        )
++                        out_data = sc.transform(np.array(out_data).reshape(-1, 1))[
++                            0
++                        ][0]
++                    target = target.cpu().numpy().flatten().tolist()
++                    if len(target) == 1:
++                        target = target[0]
++                    f.write("%s, %6f, %6f\n" % (id, target, out_data))
++                    targets.append(target)
++                    predictions.append(out_data)
++            f.close()
+
+-        print(
+-            "Test MAE:",
+-            mean_absolute_error(np.array(targets), np.array(predictions)),
+-        )
++            print(
++                "Test MAE:",
++                mean_absolute_error(np.array(targets), np.array(predictions)),
++            )
+         if config.store_outputs and not classification:
+             x = []
+             y = []
+diff --git a/alignn/train_folder.py b/alignn/train_folder.py
+index b532d4e..c9eb4a5 100644
+--- a/alignn/train_folder.py
++++ b/alignn/train_folder.py
+@@ -180,7 +180,7 @@ def train_for_folder(
+         train_val_test_loaders=[
+             train_loader,
+             val_loader,
+-            test_loader,
++            None,
+             prepare_batch,
+         ],
+     )
diff --git a/models/alignn/alignn-config.json b/models/alignn/alignn-config.json
@@ -0,0 +1,54 @@
+{
+  "version": "112bbedebdaecf59fb18e11c929080fb2f358246",
+  "dataset": "user_data",
+  "target": "target",
+  "atom_features": "cgcnn",
+  "neighbor_strategy": "k-nearest",
+  "id_tag": "jid",
+  "random_seed": 123,
+  "classification_threshold": null,
+  "n_val": null,
+  "n_test": null,
+  "n_train": null,
+  "train_ratio": 0.9,
+  "val_ratio": 0.1,
+  "test_ratio": 0.0,
+  "target_multiplication_factor": null,
+  "epochs": 1000,
+  "n_early_stopping": 100,
+  "batch_size": 128,
+  "weight_decay": 1e-5,
+  "learning_rate": 0.001,
+  "filename": "sample",
+  "warmup_steps": 2000,
+  "criterion": "l1",
+  "optimizer": "adamw",
+  "scheduler": "onecycle",
+  "pin_memory": true,
+  "save_dataloader": false,
+  "write_checkpoint": true,
+  "write_predictions": true,
+  "store_outputs": true,
+  "progress": true,
+  "log_tensorboard": false,
+  "standard_scalar_and_pca": false,
+  "use_canonize": true,
+  "num_workers": 4,
+  "cutoff": 8.0,
+  "max_neighbors": 12,
+  "keep_data_order": false,
+  "model": {
+    "name": "alignn",
+    "alignn_layers": 4,
+    "gcn_layers": 4,
+    "atom_input_features": 92,
+    "edge_input_features": 80,
+    "triplet_input_features": 40,
+    "embedding_features": 64,
+    "hidden_features": 256,
+    "output_features": 1,
+    "link": "identity",
+    "zero_inflated": false,
+    "classification": false
+  }
+}
diff --git a/models/alignn/make_train_data.py b/models/alignn/make_train_data.py
@@ -0,0 +1,60 @@
+# %% Imports
+import os
+
+import pandas as pd
+from pymatgen.core import Structure
+from sklearn.model_selection import train_test_split
+from tqdm import tqdm, trange
+
+from matbench_discovery.data import DATA_FILES
+from matbench_discovery.structure import perturb_structure
+
+__author__ = "Philipp Benner"
+__date__ = "2023-06-02"
+
+
+# %% column names and settings (n_perturb = 0 makes the augmentation loop below a no-op)
+target_col = "formation_energy_per_atom"
+input_col = "structure"
+id_col = "material_id"
+n_perturb = 0
+
+
+# %% load structures: turn each entry's "structure" dict into a pymatgen Structure
+df_cse = pd.read_json(DATA_FILES.mp_computed_structure_entries).set_index(id_col)
+df_cse[input_col] = [
+    Structure.from_dict(cse[input_col]) for cse in tqdm(df_cse.entry, disable=None)
+]
+
+# load energies and attach the structures (both frames are indexed by material_id)
+df = pd.read_csv(DATA_FILES.mp_energies).set_index(id_col)
+df[input_col] = df_cse[input_col]
+assert target_col in df  # fail early if the target column is missing
+
+
+# %% augment with randomly perturbed structures (one perturbed copy of all rows per pass)
+df_aug = df.copy()
+structs = df_aug.pop(input_col)
+for idx in trange(n_perturb, desc="Generating perturbed structures"):
+    df_aug[input_col] = [perturb_structure(x) for x in structs]
+    df = pd.concat([df, df_aug.set_index(f"{x}-aug={idx+1}" for x in df_aug.index)])
+
+del df_aug
+
+
+# %% export data: 95/5 train/test split, then targets.csv plus one POSCAR per structure
+X_train, X_test, y_train, y_test = train_test_split(
+    df[input_col], df[target_col], test_size=0.05, random_state=42
+)
+
+for samples, targets, label in ((X_train, y_train, "train"), (X_test, y_test, "test")):
+    out_dir = f"{label}-data"  # "train-data" / "test-data", relative to the working dir
+    os.makedirs(out_dir, exist_ok=True)
+
+    targets.to_csv(f"{out_dir}/targets.csv")
+
+    struct: Structure  # annotation only, for type checkers
+    for mat_id, struct in tqdm(
+        samples.items(), desc="Saving structures", total=len(samples)
+    ):
+        struct.to(f"{out_dir}/{mat_id}.poscar", fmt="POSCAR")
diff --git a/models/alignn/metadata.yml b/models/alignn/metadata.yml
@@ -0,0 +1,31 @@
+model_name: ALIGNN
+model_version: 2023.01.10
+matbench_discovery_version: 1.0
+date_added: "2023-06-02"
+date_published: "2021-02-22"
+authors:
+  - name: Kamal Choudhary
+    affiliation: National Institute of Standards and Technology
+    email: kamal.choudhary@nist.gov
+    orcid: https://orcid.org/0000-0001-9737-8074
+  - name: Brian DeCost
+    affiliation: National Institute of Standards and Technology
+    orcid: https://orcid.org/0000-0002-3459-5888
+    email: zhongpc@berkeley.edu # NOTE(review): does not match this author/affiliation (NIST); looks copy-pasted, confirm
+  - name: Philipp Benner
+    affiliation: Bundesanstalt für Materialforschung und -prüfung BAM
+    orcid: https://orcid.org/0000-0002-0912-8137
+    github: https://github.com/pbenner
+repo: https://github.com/usnistgov/alignn
+url: https://jarvis.nist.gov/jalignn
+doi: https://nature.com/articles/s41524-021-00650-1
+preprint: https://arxiv.org/abs/2209.05554
+requirements:
+  ase: 3.22.0
+  dgl-cu111: 0.6.1
+  numpy: 1.24.3
+  pandas: 2.0.1
+  scikit-learn: 1.2.2
+  torch: 1.9.0+cu111
+trained_for_benchmark: true
+# hyperparams: see alignn-config.json
diff --git a/models/alignn/readme.md b/models/alignn/readme.md
@@ -0,0 +1,21 @@
+## ALIGNN formation energy predictions on WBM test set
+
+ALIGNN is trained with an L1 loss for 1000 epochs. The model that performs best on the validation set is saved and used for predictions (this requires a minor adaptation of the ALIGNN source). The modifications to the ALIGNN source code are provided as the patch `alignn-2023.01.10.patch`, which was applied to ALIGNN version `2023.01.10`. In addition, all Python requirements are listed in `requirements.txt`.
+
+To reproduce the `alignn` package state used for this submission, run
+
+```bash
+pip install alignn==2023.01.10
+alignn_dir=$(python -c "import alignn; print(alignn.__path__[0])")
+cd $alignn_dir
+git apply /path/to/alignn-2023.01.10.patch
+```
+
+Replace `/path/to/` with the actual path to the patch file.
+
+The directory contains the following files, which must be executed in the given order to reproduce the results:
+
+1. `make_train_data.py`: Export Matbench Discovery training data to ALIGNN-compatible format. This script writes the training data to the directory `train-data`. In addition, a small test set (5% of the data) is held out and stored in the directory `test-data`.
+1. `train_alignn.py`: Train an ALIGNN model on previously exported data. The resulting model is stored in the directory `data-train-result`
+1. `test_data.py`: Export WBM test data in ALIGNN-compatible format. The data is stored in the directory `data-test-wbm`
+1. `test_alignn.py`: Test a trained ALIGNN model on the WBM data. Predictions are stored in the file `test_alignn_result.json`