Skip to content

Commit

Permalink
remove svelte config prerender.handleHttpError: warn (site now builds…
Browse files Browse the repository at this point in the history
… error free)

add pydocstyle pre-commit hook and fix legacy violations
rename target_col to e_form_col across train/test scripts
tweak site Footer
  • Loading branch information
janosh committed Jun 20, 2023
1 parent 387722a commit f84171b
Show file tree
Hide file tree
Showing 24 changed files with 267 additions and 107 deletions.
6 changes: 6 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -88,3 +88,9 @@ repos:
- eslint-plugin-svelte3
- "@typescript-eslint/eslint-plugin"
- "@typescript-eslint/parser"

- repo: https://github.com/PyCQA/pydocstyle
rev: 6.1.1
hooks:
- id: pydocstyle
exclude: tests
2 changes: 2 additions & 0 deletions matbench_discovery/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
"""Global variables used all across the matbench_discovery package."""

from __future__ import annotations

import os
Expand Down
4 changes: 2 additions & 2 deletions matbench_discovery/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,11 +98,11 @@ def load_train_test(

cache_path = f"{cache_dir}/{version}/{file}"
if os.path.isfile(cache_path):
print(f"Loading '{key}' from cached file at '{cache_path}'")
print(f"Loading {key!r} from cached file at {cache_path!r}")
df = reader(cache_path, **kwargs)
else:
url = f"{RAW_REPO_URL}/{version}/data/{file}"
print(f"Downloading '{key}' from {url}")
print(f"Downloading {key!r} from {url}")
try:
df = reader(url)
except urllib.error.HTTPError as exc:
Expand Down
56 changes: 36 additions & 20 deletions matbench_discovery/plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,7 @@ def hist_classified_stable_vs_hull_dist(
y_label: str = "Number of materials",
**kwargs: Any,
) -> tuple[plt.Axes | go.Figure, dict[str, float]]:
"""
Histogram of the energy difference (either according to DFT ground truth [default]
"""Histogram of the energy difference (either according to DFT ground truth [default]
or model predicted energy) to the convex hull for materials in the WBM data set. The
histogram is broken down into true positives, false negatives, false positives, and
true negatives based on whether the model predicts candidates to be below the known
Expand Down Expand Up @@ -258,13 +257,13 @@ def rolling_mae_vs_hull_dist(
e_above_hull_error: pd.Series,
window: float = 0.02,
bin_width: float = 0.001,
x_lim: tuple[float, float] = (-0.2, 0.3),
y_lim: tuple[float, float] = (0.0, 0.14),
x_lim: tuple[float, float] = (-0.2, 0.2),
y_lim: tuple[float, float] = (0, 0.15),
ax: plt.Axes = None,
backend: Backend = "plotly",
y_label: str = "rolling MAE (eV/atom)",
**kwargs: Any,
) -> plt.Axes:
) -> plt.Axes | go.Figure:
"""Rolling mean absolute error as the energy to the convex hull is varied. A scale
bar is shown for the windowing period of 40 meV per atom used when calculating the
rolling MAE. The standard error in the mean is shaded around each curve. The
Expand All @@ -288,9 +287,8 @@ def rolling_mae_vs_hull_dist(
y_label (str, optional): y-axis label. Defaults to "rolling MAE (eV/atom)".
Returns:
plt.Axes: _description_
plt.Axes | go.Figure: matplotlib Axes or plotly Figure depending on backend.
"""

bins = np.arange(*x_lim, bin_width)

rolling_maes = np.zeros_like(bins)
Expand All @@ -309,6 +307,8 @@ def rolling_mae_vs_hull_dist(
# cancellation among similar chemistries, supporting ref:
# https://journals.aps.org/prb/abstract/10.1103/PhysRevB.85.155208
dft_acc = 0.025
# used by plotly branch of this function, unrecognized by matplotlib
fig = kwargs.pop("fig", None)

if backend == "matplotlib":
ax = ax or plt.gca()
Expand Down Expand Up @@ -385,25 +385,33 @@ def rolling_mae_vs_hull_dist(
title=title,
**kwargs,
)
line_color = ax.data[0].line.color
ax_std = go.Scatter(
x=list(bins) + list(bins)[::-1], # bins, then bins reversed
y=list(rolling_maes + 2 * rolling_stds)
+ list(rolling_maes - 2 * rolling_stds)[::-1], # upper, then lower reversed
fill="toself",
line_color="white",
fillcolor=ax.data[0].line.color,
fillcolor=line_color,
opacity=0.3,
hoverinfo="skip",
showlegend=False,
)
ax.add_trace(ax_std)

if isinstance(fig, go.Figure):
# if passed existing plotly figure, add traces to it
# return without changing layout and adding annotations
fig.add_traces(ax.data)
return fig

legend = dict(title=None, xanchor="right", x=1, yanchor="bottom", y=0)
ax.update_layout(
dict(
xaxis_title="E<sub>above hull</sub> (eV/atom)",
xaxis_title="E<sub>above MP hull</sub> (eV/atom)",
yaxis_title="rolling MAE (eV/atom)",
),
legend=dict(title=None, xanchor="right", x=1, yanchor="bottom", y=0),
legend=legend,
)
ax.update_xaxes(range=x_lim)
ax.update_yaxes(range=y_lim)
Expand All @@ -424,31 +432,39 @@ def rolling_mae_vs_hull_dist(
)
ax.add_traces([err_gt_each_region, ml_err_lt_dft_err_region])
ax.add_annotation(
x=4 * dft_acc,
x=dft_acc,
y=dft_acc,
text="Corrected GGA DFT Accuracy",
text="<a href='https://doi.org/10.1103/PhysRevB.85.155208'>Corrected GGA DFT "
"Accuracy</a>",
showarrow=True,
# arrowhead=1,
ax=-dft_acc,
xshift=10,
arrowhead=1,
ax=4 * dft_acc,
ay=dft_acc,
axref="x",
ayref="y",
)

ax.data = ax.data[::-1] # bring px.line() to front
# show MAE window size
# plot rectangle to indicate MAE window size
x0, y0 = x_lim[0] + 0.01, y_lim[0] + 0.01
ax.add_annotation(
x=x0 + 0.05,
y=y0 + 0.01,
text=f"rolling MAE window<br>{window} eV/atom",
x=x0 + window,
y=y0,
text=f"rolling {window=} eV/atom",
showarrow=False,
xshift=8,
yshift=-4,
yanchor="bottom",
xanchor="left",
)
ax.add_shape(
type="rect",
x0=x0,
y0=y0,
x1=x0 + window,
y1=y0 + window / 5,
fillcolor="black",
fillcolor=line_color,
)

return ax
Expand Down Expand Up @@ -592,7 +608,7 @@ def cumulative_precision_recall(
facet_col_wrap=3,
facet_col_spacing=0.03,
# pivot df in case we want to show all 3 metrics in each plot's hover
# requires fixing index mismatch due to df subsampling above
# requires fixing index mismatch due to df sub-sampling above
# customdata=dict(
# df_cum.reset_index()
# .pivot(index="index", columns="metric")["Voronoi RF above hull pred"]
Expand Down
16 changes: 8 additions & 8 deletions models/cgcnn/test_cgcnn.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,8 @@

df = pd.read_json(data_path).set_index("material_id")

target_col = "e_form_per_atom_mp2020_corrected"
df[target_col] = df_wbm[target_col]
e_form_col = "e_form_per_atom_mp2020_corrected"
df[e_form_col] = df_wbm[e_form_col]
if task_type == "RS2RE":
df[input_col] = [x["structure"] for x in df.computed_structure_entry]
assert input_col in df, f"{input_col=} not in {list(df)}"
Expand Down Expand Up @@ -92,7 +92,7 @@
torch_version=version("torch"),
ensemble_size=len(runs),
task_type=task_type,
target_col=target_col,
target_col=e_form_col,
input_col=input_col,
wandb_run_filters=filters,
slurm_vars=slurm_vars,
Expand All @@ -101,7 +101,7 @@
wandb.init(project="matbench-discovery", name=job_name, config=run_params)

cg_data = CrystalGraphData(
df, task_dict={target_col: "regression"}, structure_col=input_col
df, task_dict={e_form_col: "regression"}, structure_col=input_col
)
data_loader = DataLoader(
cg_data, batch_size=1024, shuffle=False, collate_fn=collate_batch
Expand All @@ -114,16 +114,16 @@
# dropping isolated-atom structs means len(cg_data.df) < len(df)
cache_dir=CHECKPOINT_DIR,
df=cg_data.df.drop(columns=input_col),
target_col=target_col,
target_col=e_form_col,
model_cls=CrystalGraphConvNet,
data_loader=data_loader,
)

slurm_job_id = os.environ.get("SLURM_JOB_ID", "debug")
df.round(4).to_csv(f"{out_dir}/{job_name}-preds-{slurm_job_id}.csv")
pred_col = f"{target_col}_pred_ens"
pred_col = f"{e_form_col}_pred_ens"
assert pred_col in df, f"{pred_col=} not in {list(df)}"
table = wandb.Table(dataframe=df[[target_col, pred_col]].reset_index())
table = wandb.Table(dataframe=df[[e_form_col, pred_col]].reset_index())


# %%
Expand All @@ -132,4 +132,4 @@

title = f"CGCNN {task_type} ensemble={len(runs)} {MAE=:.4} {R2=:.4}"

wandb_scatter(table, fields=dict(x=target_col, y=pred_col), title=title)
wandb_scatter(table, fields=dict(x=e_form_col, y=pred_col), title=title)
14 changes: 7 additions & 7 deletions models/megnet/test_megnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,8 @@
data_path = f"{ROOT}/data/wbm/2022-10-19-wbm-init-structs.json.bz2"
print(f"\nJob started running {timestamp}")
print(f"{data_path=}")
target_col = "e_form_per_atom_mp2020_corrected"
assert target_col in df_wbm, f"{target_col=} not in {list(df_wbm)=}"
e_form_col = "e_form_per_atom_mp2020_corrected"
assert e_form_col in df_wbm, f"{e_form_col=} not in {list(df_wbm)=}"

df_wbm_structs = pd.read_json(data_path).set_index("material_id")
megnet_mp_e_form = load_model(model_name := "Eform_MP_2019")
Expand All @@ -64,7 +64,7 @@
numpy_version=version("numpy"),
model_name=model_name,
task_type=task_type,
target_col=target_col,
target_col=e_form_col,
df=dict(shape=str(df_wbm_structs.shape), columns=", ".join(df_wbm_structs)),
slurm_vars=slurm_vars,
)
Expand Down Expand Up @@ -109,11 +109,11 @@


# %%
table = wandb.Table(dataframe=df_wbm[[target_col, pred_col]].reset_index())
table = wandb.Table(dataframe=df_wbm[[e_form_col, pred_col]].reset_index())

MAE = (df_wbm[target_col] - df_wbm[pred_col]).abs().mean()
R2 = r2_score(df_wbm[target_col], df_wbm[pred_col])
MAE = (df_wbm[e_form_col] - df_wbm[pred_col]).abs().mean()
R2 = r2_score(df_wbm[e_form_col], df_wbm[pred_col])
title = f"{model_name} {task_type} {MAE=:.4} {R2=:.4}"
print(title)

wandb_scatter(table, fields=dict(x=target_col, y=pred_col), title=title)
wandb_scatter(table, fields=dict(x=e_form_col, y=pred_col), title=title)
8 changes: 5 additions & 3 deletions models/voronoi/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
"""
Recreate the featurizer used by Ward et al. in
https://journals.aps.org/prb/abstract/10.1103/PhysRevB.96.024104
"""

import matminer.featurizers.composition as fc
import matminer.featurizers.structure as fs
from matminer.featurizers.base import MultipleFeaturizer

# Create the featurizer: Ward et al. use a variety of different featurizers
# https://journals.aps.org/prb/abstract/10.1103/PhysRevB.96.024104

composition_features = [
# Ward+Wolverton's Magpie https://rdcu.be/c3jug
fc.ElementProperty.from_preset("magpie"),
Expand Down
24 changes: 12 additions & 12 deletions models/voronoi/train_test_voronoi_rf.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,20 +47,20 @@

mp_energies_path = f"{ROOT}/data/mp/2022-08-13-mp-energies.json.gz"
df_mp = pd.read_json(mp_energies_path).set_index("material_id")
train_target_col = "formation_energy_per_atom"
train_e_form_col = "formation_energy_per_atom"

test_path = f"{module_dir}/2022-11-18-features-wbm-{task_type}.csv.bz2"
df_test = pd.read_csv(test_path).set_index("material_id")
print(f"{df_test.shape=}")

test_target_col = "e_form_per_atom_mp2020_corrected"
test_e_form_col = "e_form_per_atom_mp2020_corrected"


for df, df_tar, col in (
(df_train, df_mp, train_target_col),
(df_test, df_wbm, test_target_col),
(df_train, df_mp, train_e_form_col),
(df_test, df_wbm, test_e_form_col),
):
df[train_target_col] = df_tar[train_target_col]
df[train_e_form_col] = df_tar[train_e_form_col]
nans = df_tar[col].isna().sum()
assert nans == 0, f"{nans} NaNs in {col} targets"

Expand All @@ -74,8 +74,8 @@
matminer_version=version("matminer"),
numpy_version=version("numpy"),
model_name=model_name,
train_target_col=train_target_col,
test_target_col=test_target_col,
train_target_col=train_e_form_col,
test_target_col=test_e_form_col,
df_train=dict(shape=str(df_train.shape)),
df_test=dict(shape=str(df_test.shape)),
slurm_vars=slurm_vars,
Expand Down Expand Up @@ -103,7 +103,7 @@


# %%
model.fit(df_train[feature_names], df_train[train_target_col])
model.fit(df_train[feature_names], df_train[train_e_form_col])


# %%
Expand All @@ -121,13 +121,13 @@
df_wbm[pred_col].round(4).to_csv(out_path)

table = wandb.Table(
dataframe=df_wbm[["formula", test_target_col, pred_col]].reset_index()
dataframe=df_wbm[["formula", test_e_form_col, pred_col]].reset_index()
)

df_wbm[pred_col].isna().sum()
MAE = (df_wbm[test_target_col] - df_wbm[pred_col]).abs().mean()
R2 = r2_score(*df_wbm[[test_target_col, pred_col]].dropna().to_numpy().T)
MAE = (df_wbm[test_e_form_col] - df_wbm[pred_col]).abs().mean()
R2 = r2_score(*df_wbm[[test_e_form_col, pred_col]].dropna().to_numpy().T)
title = f"{model_name} {task_type} {MAE=:.3} {R2=:.3}"
print(title)

wandb_scatter(table, fields=dict(x=test_target_col, y=pred_col), title=title)
wandb_scatter(table, fields=dict(x=test_e_form_col, y=pred_col), title=title)
Loading

0 comments on commit f84171b

Please sign in to comment.