remove svelte config prerender.handleHttpError: warn (site now builds error free)

- add pydocstyle pre-commit hook and fix legacy violations
- rename target_col to e_form_col across train/test scripts
- tweak site Footer
janosh · Jun 20, 2023 · f84171b
1 parent 387722a
commit f84171b
Show file tree

Hide file tree

Showing 24 changed files with 267 additions and 107 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -88,3 +88,9 @@ repos:
           - eslint-plugin-svelte3
           - "@typescript-eslint/eslint-plugin"
           - "@typescript-eslint/parser"
+
+  - repo: https://github.com/PyCQA/pydocstyle
+    rev: 6.1.1
+    hooks:
+      - id: pydocstyle
+        exclude: tests
diff --git a/matbench_discovery/__init__.py b/matbench_discovery/__init__.py
@@ -1,3 +1,5 @@
+"""Global variables used all across the matbench_discovery package."""
+
 from __future__ import annotations
 
 import os

diff --git a/matbench_discovery/data.py b/matbench_discovery/data.py
@@ -98,11 +98,11 @@ def load_train_test(
 
         cache_path = f"{cache_dir}/{version}/{file}"
         if os.path.isfile(cache_path):
-            print(f"Loading '{key}' from cached file at '{cache_path}'")
+            print(f"Loading {key!r} from cached file at {cache_path!r}")
             df = reader(cache_path, **kwargs)
         else:
             url = f"{RAW_REPO_URL}/{version}/data/{file}"
-            print(f"Downloading '{key}' from {url}")
+            print(f"Downloading {key!r} from {url}")
             try:
                 df = reader(url)
             except urllib.error.HTTPError as exc:

diff --git a/matbench_discovery/plots.py b/matbench_discovery/plots.py
@@ -90,8 +90,7 @@ def hist_classified_stable_vs_hull_dist(
     y_label: str = "Number of materials",
     **kwargs: Any,
 ) -> tuple[plt.Axes | go.Figure, dict[str, float]]:
-    """
-    Histogram of the energy difference (either according to DFT ground truth [default]
+    """Histogram of the energy difference (either according to DFT ground truth [default]
     or model predicted energy) to the convex hull for materials in the WBM data set. The
     histogram is broken down into true positives, false negatives, false positives, and
     true negatives based on whether the model predicts candidates to be below the known
@@ -258,13 +257,13 @@ def rolling_mae_vs_hull_dist(
     e_above_hull_error: pd.Series,
     window: float = 0.02,
     bin_width: float = 0.001,
-    x_lim: tuple[float, float] = (-0.2, 0.3),
-    y_lim: tuple[float, float] = (0.0, 0.14),
+    x_lim: tuple[float, float] = (-0.2, 0.2),
+    y_lim: tuple[float, float] = (0, 0.15),
     ax: plt.Axes = None,
     backend: Backend = "plotly",
     y_label: str = "rolling MAE (eV/atom)",
     **kwargs: Any,
-) -> plt.Axes:
+) -> plt.Axes | go.Figure:
     """Rolling mean absolute error as the energy to the convex hull is varied. A scale
     bar is shown for the windowing period of 40 meV per atom used when calculating the
     rolling MAE. The standard error in the mean is shaded around each curve. The
@@ -288,9 +287,8 @@ def rolling_mae_vs_hull_dist(
         y_label (str, optional): y-axis label. Defaults to "rolling MAE (eV/atom)".
 
     Returns:
-        plt.Axes: _description_
+        plt.Axes | go.Figure: matplotlib Axes or plotly Figure depending on backend.
     """
-
     bins = np.arange(*x_lim, bin_width)
 
     rolling_maes = np.zeros_like(bins)
@@ -309,6 +307,8 @@ def rolling_mae_vs_hull_dist(
     # cancellation among similar chemistries, supporting ref:
     # https://journals.aps.org/prb/abstract/10.1103/PhysRevB.85.155208
     dft_acc = 0.025
+    # used by plotly branch of this function, unrecognized by matplotlib
+    fig = kwargs.pop("fig", None)
 
     if backend == "matplotlib":
         ax = ax or plt.gca()
@@ -385,25 +385,33 @@ def rolling_mae_vs_hull_dist(
             title=title,
             **kwargs,
         )
+        line_color = ax.data[0].line.color
         ax_std = go.Scatter(
             x=list(bins) + list(bins)[::-1],  # bins, then bins reversed
             y=list(rolling_maes + 2 * rolling_stds)
             + list(rolling_maes - 2 * rolling_stds)[::-1],  # upper, then lower reversed
             fill="toself",
             line_color="white",
-            fillcolor=ax.data[0].line.color,
+            fillcolor=line_color,
             opacity=0.3,
             hoverinfo="skip",
             showlegend=False,
         )
         ax.add_trace(ax_std)
 
+        if isinstance(fig, go.Figure):
+            # if passed existing plotly figure, add traces to it
+            # return without changing layout and adding annotations
+            fig.add_traces(ax.data)
+            return fig
+
+        legend = dict(title=None, xanchor="right", x=1, yanchor="bottom", y=0)
         ax.update_layout(
             dict(
-                xaxis_title="E<sub>above hull</sub> (eV/atom)",
+                xaxis_title="E<sub>above MP hull</sub> (eV/atom)",
                 yaxis_title="rolling MAE (eV/atom)",
             ),
-            legend=dict(title=None, xanchor="right", x=1, yanchor="bottom", y=0),
+            legend=legend,
         )
         ax.update_xaxes(range=x_lim)
         ax.update_yaxes(range=y_lim)
@@ -424,31 +432,39 @@ def rolling_mae_vs_hull_dist(
         )
         ax.add_traces([err_gt_each_region, ml_err_lt_dft_err_region])
         ax.add_annotation(
-            x=4 * dft_acc,
+            x=dft_acc,
             y=dft_acc,
-            text="Corrected GGA DFT Accuracy",
+            text="<a href='https://doi.org/10.1103/PhysRevB.85.155208'>Corrected GGA DFT "
+            "Accuracy</a>",
             showarrow=True,
-            # arrowhead=1,
-            ax=-dft_acc,
+            xshift=10,
+            arrowhead=1,
+            ax=4 * dft_acc,
             ay=dft_acc,
+            axref="x",
+            ayref="y",
         )
 
         ax.data = ax.data[::-1]  # bring px.line() to front
-        # show MAE window size
+        # plot rectangle to indicate MAE window size
         x0, y0 = x_lim[0] + 0.01, y_lim[0] + 0.01
         ax.add_annotation(
-            x=x0 + 0.05,
-            y=y0 + 0.01,
-            text=f"rolling MAE window<br>{window} eV/atom",
+            x=x0 + window,
+            y=y0,
+            text=f"rolling {window=} eV/atom",
             showarrow=False,
+            xshift=8,
+            yshift=-4,
+            yanchor="bottom",
+            xanchor="left",
         )
         ax.add_shape(
             type="rect",
             x0=x0,
             y0=y0,
             x1=x0 + window,
             y1=y0 + window / 5,
-            fillcolor="black",
+            fillcolor=line_color,
         )
 
     return ax
@@ -592,7 +608,7 @@ def cumulative_precision_recall(
             facet_col_wrap=3,
             facet_col_spacing=0.03,
             # pivot df in case we want to show all 3 metrics in each plot's hover
-            # requires fixing index mismatch due to df subsampling above
+            # requires fixing index mismatch due to df sub-sampling above
             # customdata=dict(
             #     df_cum.reset_index()
             #     .pivot(index="index", columns="metric")["Voronoi RF above hull pred"]

diff --git a/models/cgcnn/test_cgcnn.py b/models/cgcnn/test_cgcnn.py
@@ -61,8 +61,8 @@
 
 df = pd.read_json(data_path).set_index("material_id")
 
-target_col = "e_form_per_atom_mp2020_corrected"
-df[target_col] = df_wbm[target_col]
+e_form_col = "e_form_per_atom_mp2020_corrected"
+df[e_form_col] = df_wbm[e_form_col]
 if task_type == "RS2RE":
     df[input_col] = [x["structure"] for x in df.computed_structure_entry]
 assert input_col in df, f"{input_col=} not in {list(df)}"
@@ -92,7 +92,7 @@
     torch_version=version("torch"),
     ensemble_size=len(runs),
     task_type=task_type,
-    target_col=target_col,
+    target_col=e_form_col,
     input_col=input_col,
     wandb_run_filters=filters,
     slurm_vars=slurm_vars,
@@ -101,7 +101,7 @@
 wandb.init(project="matbench-discovery", name=job_name, config=run_params)
 
 cg_data = CrystalGraphData(
-    df, task_dict={target_col: "regression"}, structure_col=input_col
+    df, task_dict={e_form_col: "regression"}, structure_col=input_col
 )
 data_loader = DataLoader(
     cg_data, batch_size=1024, shuffle=False, collate_fn=collate_batch
@@ -114,16 +114,16 @@
     # dropping isolated-atom structs means len(cg_data.df) < len(df)
     cache_dir=CHECKPOINT_DIR,
     df=cg_data.df.drop(columns=input_col),
-    target_col=target_col,
+    target_col=e_form_col,
     model_cls=CrystalGraphConvNet,
     data_loader=data_loader,
 )
 
 slurm_job_id = os.environ.get("SLURM_JOB_ID", "debug")
 df.round(4).to_csv(f"{out_dir}/{job_name}-preds-{slurm_job_id}.csv")
-pred_col = f"{target_col}_pred_ens"
+pred_col = f"{e_form_col}_pred_ens"
 assert pred_col in df, f"{pred_col=} not in {list(df)}"
-table = wandb.Table(dataframe=df[[target_col, pred_col]].reset_index())
+table = wandb.Table(dataframe=df[[e_form_col, pred_col]].reset_index())
 
 
 # %%
@@ -132,4 +132,4 @@
 
 title = f"CGCNN {task_type} ensemble={len(runs)} {MAE=:.4} {R2=:.4}"
 
-wandb_scatter(table, fields=dict(x=target_col, y=pred_col), title=title)
+wandb_scatter(table, fields=dict(x=e_form_col, y=pred_col), title=title)
diff --git a/models/megnet/test_megnet.py b/models/megnet/test_megnet.py
@@ -50,8 +50,8 @@
 data_path = f"{ROOT}/data/wbm/2022-10-19-wbm-init-structs.json.bz2"
 print(f"\nJob started running {timestamp}")
 print(f"{data_path=}")
-target_col = "e_form_per_atom_mp2020_corrected"
-assert target_col in df_wbm, f"{target_col=} not in {list(df_wbm)=}"
+e_form_col = "e_form_per_atom_mp2020_corrected"
+assert e_form_col in df_wbm, f"{e_form_col=} not in {list(df_wbm)=}"
 
 df_wbm_structs = pd.read_json(data_path).set_index("material_id")
 megnet_mp_e_form = load_model(model_name := "Eform_MP_2019")
@@ -64,7 +64,7 @@
     numpy_version=version("numpy"),
     model_name=model_name,
     task_type=task_type,
-    target_col=target_col,
+    target_col=e_form_col,
     df=dict(shape=str(df_wbm_structs.shape), columns=", ".join(df_wbm_structs)),
     slurm_vars=slurm_vars,
 )
@@ -109,11 +109,11 @@
 
 
 # %%
-table = wandb.Table(dataframe=df_wbm[[target_col, pred_col]].reset_index())
+table = wandb.Table(dataframe=df_wbm[[e_form_col, pred_col]].reset_index())
 
-MAE = (df_wbm[target_col] - df_wbm[pred_col]).abs().mean()
-R2 = r2_score(df_wbm[target_col], df_wbm[pred_col])
+MAE = (df_wbm[e_form_col] - df_wbm[pred_col]).abs().mean()
+R2 = r2_score(df_wbm[e_form_col], df_wbm[pred_col])
 title = f"{model_name} {task_type} {MAE=:.4} {R2=:.4}"
 print(title)
 
-wandb_scatter(table, fields=dict(x=target_col, y=pred_col), title=title)
+wandb_scatter(table, fields=dict(x=e_form_col, y=pred_col), title=title)
diff --git a/models/voronoi/__init__.py b/models/voronoi/__init__.py
@@ -1,10 +1,12 @@
+"""
+Recreate the featurizer used by Ward et al. in
+https://journals.aps.org/prb/abstract/10.1103/PhysRevB.96.024104
+"""
+
 import matminer.featurizers.composition as fc
 import matminer.featurizers.structure as fs
 from matminer.featurizers.base import MultipleFeaturizer
 
-# Create the featurizer: Ward et al. use a variety of different featurizers
-# https://journals.aps.org/prb/abstract/10.1103/PhysRevB.96.024104
-
 composition_features = [
     # Ward+Wolverton' Magpie https://rdcu.be/c3jug
     fc.ElementProperty.from_preset("magpie"),

diff --git a/models/voronoi/train_test_voronoi_rf.py b/models/voronoi/train_test_voronoi_rf.py
@@ -47,20 +47,20 @@
 
 mp_energies_path = f"{ROOT}/data/mp/2022-08-13-mp-energies.json.gz"
 df_mp = pd.read_json(mp_energies_path).set_index("material_id")
-train_target_col = "formation_energy_per_atom"
+train_e_form_col = "formation_energy_per_atom"
 
 test_path = f"{module_dir}/2022-11-18-features-wbm-{task_type}.csv.bz2"
 df_test = pd.read_csv(test_path).set_index("material_id")
 print(f"{df_test.shape=}")
 
-test_target_col = "e_form_per_atom_mp2020_corrected"
+test_e_form_col = "e_form_per_atom_mp2020_corrected"
 
 
 for df, df_tar, col in (
-    (df_train, df_mp, train_target_col),
-    (df_test, df_wbm, test_target_col),
+    (df_train, df_mp, train_e_form_col),
+    (df_test, df_wbm, test_e_form_col),
 ):
-    df[train_target_col] = df_tar[train_target_col]
+    df[train_e_form_col] = df_tar[train_e_form_col]
     nans = df_tar[col].isna().sum()
     assert nans == 0, f"{nans} NaNs in {col} targets"
 
@@ -74,8 +74,8 @@
     matminer_version=version("matminer"),
     numpy_version=version("numpy"),
     model_name=model_name,
-    train_target_col=train_target_col,
-    test_target_col=test_target_col,
+    train_target_col=train_e_form_col,
+    test_target_col=test_e_form_col,
     df_train=dict(shape=str(df_train.shape)),
     df_test=dict(shape=str(df_test.shape)),
     slurm_vars=slurm_vars,
@@ -103,7 +103,7 @@
 
 
 # %%
-model.fit(df_train[feature_names], df_train[train_target_col])
+model.fit(df_train[feature_names], df_train[train_e_form_col])
 
 
 # %%
@@ -121,13 +121,13 @@
 df_wbm[pred_col].round(4).to_csv(out_path)
 
 table = wandb.Table(
-    dataframe=df_wbm[["formula", test_target_col, pred_col]].reset_index()
+    dataframe=df_wbm[["formula", test_e_form_col, pred_col]].reset_index()
 )
 
 df_wbm[pred_col].isna().sum()
-MAE = (df_wbm[test_target_col] - df_wbm[pred_col]).abs().mean()
-R2 = r2_score(*df_wbm[[test_target_col, pred_col]].dropna().to_numpy().T)
+MAE = (df_wbm[test_e_form_col] - df_wbm[pred_col]).abs().mean()
+R2 = r2_score(*df_wbm[[test_e_form_col, pred_col]].dropna().to_numpy().T)
 title = f"{model_name} {task_type} {MAE=:.3} {R2=:.3}"
 print(title)
 
-wandb_scatter(table, fields=dict(x=test_target_col, y=pred_col), title=title)
+wandb_scatter(table, fields=dict(x=test_e_form_col, y=pred_col), title=title)