Skip to content

Commit

Permalink
add site/src/figs/mp-vs-mp-trj-vs-wbm-arity-hist.svelte and site/src/…
Browse files Browse the repository at this point in the history
…figs/mp-trj-n-sites-hist.svelte to /data page
  • Loading branch information
janosh committed Dec 1, 2023
1 parent 9376630 commit 46366d1
Show file tree
Hide file tree
Showing 12 changed files with 188 additions and 28 deletions.
126 changes: 112 additions & 14 deletions data/mp/eda_mp_trj.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import numpy as np
import pandas as pd
import plotly.express as px
from pymatgen.core import Composition
from pymatviz import count_elements, ptable_heatmap, ptable_heatmap_ratio, ptable_hists
from pymatviz.io import save_fig
from pymatviz.utils import si_fmt
Expand All @@ -22,10 +23,12 @@
ROOT,
SITE_FIGS,
formula_col,
id_col,
n_sites_col,
stress_col,
stress_trace_col,
)
from matbench_discovery.data import DATA_FILES
from matbench_discovery.data import DATA_FILES, df_wbm

__author__ = "Janosh Riebesell"
__date__ = "2023-11-22"
Expand All @@ -36,6 +39,22 @@
forces_col = "forces"


# %% load MP element counts by occurrence to compute ratio with MPtrj
# The per-element counts are stored as a JSON mapping and read back as a Series.
mp_occu_counts = pd.read_json(
    f"{data_page}/mp-element-counts-by-occurrence.json",
    typ="series",
)
# na_filter=False turns off pandas' missing-value detection for this CSV, so
# cells are kept exactly as written in the file.
df_mp = pd.read_csv(DATA_FILES.mp_energies, na_filter=False)
df_mp = df_mp.set_index(id_col)


# %% --- load preprocessed MPtrj summary data ---
# The summary file is written by a later cell in this script; when it is
# missing, only a hint is printed and df_mp_trj is left undefined.
mp_trj_summary_path = f"{DATA_DIR}/mp/mp-trj-2022-09-summary.json.bz2"
if not os.path.isfile(mp_trj_summary_path):
    print("MPtrj summary data not found, run cell below to generate")
else:
    df_mp_trj = pd.read_json(mp_trj_summary_path)
    df_mp_trj.index.name = "frame_id"


# %% downloaded mptrj-gga-ggapu.tar.gz from https://drive.google.com/drive/folders/1JQ-ry1RHvNliVg1Ut5OuyUxne51RHiT_
# and extracted the mptrj-gga-ggapu directory (6.2 GB) to data/mp using macOS Finder
# then zipped it to mp-trj-extxyz.zip (also using Finder, 1.6 GB)
Expand Down Expand Up @@ -84,12 +103,7 @@


# %%
df_mp_trj.to_json(f"{DATA_DIR}/mp/mp-trj-2022-09-summary.json.bz2")


# %% --- load preprocessed MPtrj summary data ---
df_mp_trj = pd.read_json(f"{DATA_DIR}/mp/mp-trj-2022-09-summary.json.bz2")
df_mp_trj.index.name = "frame_id"
df_mp_trj.to_json(mp_trj_summary_path)


# %% plot per-element magmom histograms
Expand Down Expand Up @@ -164,13 +178,6 @@
save_fig(ax_ptable, f"{PDF_FIGS}/{img_name}.pdf")


# %% load MP element counts by occurrence to compute ratio with MPtrj
mp_occu_counts = pd.read_json(
f"{data_page}/mp-element-counts-by-occurrence.json", typ="series"
)
df_mp = pd.read_csv(DATA_FILES.mp_energies, na_filter=False)


# %%
normalized = True
ax_ptable = ptable_heatmap_ratio(
Expand Down Expand Up @@ -271,3 +278,94 @@
fig.show()
save_fig(fig, f"{PDF_FIGS}/mp-trj-magmoms-hist.pdf", **pdf_kwds)
save_fig(fig, f"{SITE_FIGS}/mp-trj-magmoms-hist.svelte")


# %% add an arity column (length of each formula's Composition) to every dataframe
arity_col = "arity"
for df in (df_mp_trj, df_mp, df_wbm):
    if arity_col in df:
        continue  # column already present, e.g. when this cell is re-run
    df[arity_col] = df[formula_col].map(lambda formula: len(Composition(formula)))


# %% grouped bar chart comparing arity distributions of MP, MPtrj and WBM
# One column per dataset; each value is the share of that dataset's rows with
# the given arity (counts divided by the dataset's total row count).
df_arity = pd.DataFrame(
    {
        label: data[arity_col].value_counts().sort_index() / len(data)
        for label, data in (("MP", df_mp), ("MPtrj", df_mp_trj), ("WBM", df_wbm))
    }
).query("0 < index < 7")  # keep only arities 1 through 6

fig = px.bar(df_arity, barmode="group")
fig.update_traces(marker_line_width=0)

fig.layout.xaxis.title = "Number of Elements in Formula"
fig.layout.yaxis.title = "Fraction of Structures in Dataset"
fig.layout.margin = dict(l=0, r=0, b=0, t=0)
# pin the untitled legend to the top-right corner of the plot area
fig.layout.legend.update(x=1, y=1, xanchor="right", yanchor="top", title=None)

fig.show()

# export the same figure for the website and as a fixed-size PDF
img_name = "mp-vs-mp-trj-vs-wbm-arity-hist"
save_fig(fig, f"{SITE_FIGS}/{img_name}.svelte")
save_fig(fig, f"{PDF_FIGS}/{img_name}.pdf", width=450, height=280)


# %% calc n_sites from forces len
# n_sites is taken as the length of each frame's forces entry
# (presumably one force vector per site -- TODO confirm against the MPtrj schema)
df_mp_trj[n_sites_col] = df_mp_trj[forces_col].map(len)
log_y = False  # read by the plotting cell: toggles log y-axis and "-log" file suffix

# np.histogram closes the last bin on both sides, so the edges must extend one
# past the largest site count. With edges ending at max, frames with max - 1
# and max sites would be lumped into the same final bin. Edges 1, 2, ..., max + 1
# give exactly one unit-width bin per integer site count (and still work when
# the largest site count is 1).
n_sites_hist, n_sites_bins = np.histogram(
    df_mp_trj[n_sites_col], bins=range(1, df_mp_trj[n_sites_col].max() + 2)
)

n_struct_col = "Number of Structures"
# the left edge of each bin is the site count that bin represents
df_n_sites = pd.DataFrame({n_sites_col: n_sites_bins[:-1], n_struct_col: n_sites_hist})


# %% plot n_sites distribution
# Main panel: bar chart of structure counts per site count, x-range limited to 1-200.
fig = px.bar(df_n_sites, x=n_sites_col, y=n_struct_col, log_y=log_y, range_x=(1, 200))

# Inset panel: the same bars drawn again on a second axis pair (x2/y2),
# reusing the main trace's color and hidden from the legend.
fig.add_bar(
    x=df_n_sites[n_sites_col],
    y=df_n_sites[n_struct_col],
    showlegend=False,
    xaxis="x2",
    yaxis="y2",
    marker={"color": fig.data[0].marker.color},
)

# Style both bar traces. This runs before the line trace below is added,
# so only the two bar traces receive the width/outline settings.
bin_width = n_sites_bins[1] - n_sites_bins[0]
fig.update_traces(width=bin_width, marker_line_width=0)

# Cumulative fraction of structures, drawn as a line on a third axis pair (x3/y3)
# that is overlaid on the inset.
fig.add_scatter(
    x=df_n_sites[n_sites_col],
    y=df_n_sites[n_struct_col].cumsum() / df_n_sites[n_struct_col].sum(),
    mode="lines",
    name="Cumulative",
    xaxis="x3",
    yaxis="y3",
    hovertemplate="x: %{x}<br>y: %{y:.1%}",
)

# Place the inset in the upper-right region of the figure; its log-scaled
# y axis carries the explanatory title.
inset_domain = [0.4, 1]
fig.layout.xaxis2 = {"domain": inset_domain, "anchor": "y2"}
fig.layout.yaxis2 = {
    "domain": inset_domain,
    "anchor": "x2",
    "type": "log",
    "title": "log-scaled to show tail",
    "title_standoff": 0,
}

# x3/y3 overlay the inset axes: y3 gets percent ticks on the right side,
# x3 is hidden.
fig.layout.xaxis3 = {"overlaying": "x2", "visible": False}
fig.layout.yaxis3 = {"overlaying": "y2", "side": "right", "tickformat": ".0%"}

fig.layout.margin = {"l": 5, "r": 5, "b": 5, "t": 5}
fig.layout.legend.update(x=0.96, y=0.25, xanchor="right")
fig.show()

# file name gets a "-log" suffix when the main panel uses a log y-axis
img_name = "mp-trj-n-sites-hist" + ("-log" if log_y else "")
save_fig(fig, f"{SITE_FIGS}/{img_name}.svelte")
# PDF export is currently disabled:
# save_fig(fig, f"{PDF_FIGS}/{img_name}.pdf", width=450, height=300)
12 changes: 10 additions & 2 deletions data/mp/get_mp_energies.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,13 @@
from pymatviz.utils import annotate_metrics
from tqdm import tqdm

from matbench_discovery import STABILITY_THRESHOLD, id_col, today
from matbench_discovery import (
STABILITY_THRESHOLD,
formula_col,
id_col,
n_sites_col,
today,
)
from matbench_discovery.data import DATA_FILES

"""
Expand All @@ -34,6 +40,7 @@
"energy_above_hull",
"decomposition_enthalpy",
"energy_type",
"nsites",
}

with MPRester(use_document_model=False) as mpr:
Expand All @@ -47,6 +54,7 @@

# %%
df = pd.DataFrame(docs).set_index(id_col)
df = df.rename(columns={"formula_pretty": formula_col, "nsites": n_sites_col})

df_spg = pd.json_normalize(df.pop("symmetry"))[["number", "symbol"]]
df["spacegroup_symbol"] = df_spg.symbol.to_numpy()
Expand Down Expand Up @@ -106,7 +114,7 @@
y="energy_above_hull",
color=mask_above_line.map({True: "red", False: "blue"}),
# backend="plotly",
# hover_data=["index", "formula_pretty", "formation_energy_per_atom"],
hover_data=["index", formula_col, "formation_energy_per_atom"],
)
# most points lie on line y=x for x > 0 and y = 0 for x < 0.
n_above_line = sum(mask_above_line)
Expand Down
34 changes: 29 additions & 5 deletions data/wbm/eda_wbm.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,9 @@


# %% load MP training set
df_mp = pd.read_csv(DATA_FILES.mp_energies, na_filter=False)
df_mp = pd.read_csv(DATA_FILES.mp_energies, na_filter=False, na_values=[])

df_mp[df_mp[formula_col].isna()]


# %%
Expand All @@ -50,10 +52,8 @@
)
wbm_comp_counts = count_elements(df_wbm[formula_col], count_mode="composition")

mp_occu_counts = count_elements(df_mp.formula_pretty, count_mode="occurrence").astype(
int
)
mp_comp_counts = count_elements(df_mp.formula_pretty, count_mode="composition")
mp_occu_counts = count_elements(df_mp[formula_col], count_mode="occurrence").astype(int)
mp_comp_counts = count_elements(df_mp[formula_col], count_mode="composition")

all_counts = (
("wbm", "occurrence", wbm_occu_counts),
Expand Down Expand Up @@ -315,3 +315,27 @@
# https://github.com/plotly/plotly.py/issues/4115
# https://github.com/plotly/plotly.js/issues/5341
# https://github.com/plotly/plotly.js/issues/4728


# %% compute compositional arity histograms
# arity = length of the Composition parsed from each formula string
arity_col = "arity"
df_wbm[arity_col] = df_wbm[formula_col].map(lambda formula: len(Composition(formula)))
df_mp[arity_col] = df_mp[formula_col].map(lambda formula: len(Composition(formula)))

# share of each dataset's rows per arity value (counts divided by total row count)
mp_arity_counts = df_mp[arity_col].value_counts().sort_index() / len(df_mp)
wbm_arity_counts = df_wbm[arity_col].value_counts().sort_index() / len(df_wbm)

df_arity = pd.DataFrame({"MP": mp_arity_counts, "WBM": wbm_arity_counts})
df_arity = df_arity.query("0 < index < 7")  # keep only arities 1 through 6

fig = px.bar(df_arity, barmode="group")
fig.layout.xaxis.title = "Number of Elements in Formula"
fig.layout.yaxis.title = "Fraction of Structures in Dataset"
fig.layout.margin = dict(l=0, r=0, b=0, t=0)
# pin the untitled legend to the top-right corner of the plot area
fig.layout.legend.update(x=1, y=1, xanchor="right", yanchor="top", title=None)

fig.show()

# export the same figure for the website and as a fixed-size PDF
img_name = "mp-vs-wbm-arity-hist"
save_fig(fig, f"{SITE_FIGS}/{img_name}.svelte")
save_fig(fig, f"{PDF_FIGS}/{img_name}.pdf", width=450, height=280)
3 changes: 2 additions & 1 deletion matbench_discovery/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
formula_col = "formula"
stress_col = "stress"
stress_trace_col = "stress_trace"
n_sites_col = "n_sites"

# load figshare 1.0.0
with open(f"{FIGSHARE}/1.0.0.json") as file:
Expand All @@ -65,7 +66,7 @@
crystal_sys="Crystal system",
spg_num="Space group",
n_wyckoff="Number of Wyckoff positions",
n_sites="Lattice site count",
n_sites="Number of atoms",
energy_per_atom=f"Energy {ev_per_atom}",
e_form=f"DFT E<sub>form</sub> {ev_per_atom}",
e_above_hull=f"E<sub>hull dist</sub> {ev_per_atom}",
Expand Down
2 changes: 1 addition & 1 deletion scripts/analyze_model_failure_cases.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@
# %%
df_mp = pd.read_csv(DATA_FILES.mp_energies, na_filter=False).set_index(id_col)
train_count_col = "MP Occurrences"
df_elem_counts = count_elements(df_mp.formula_pretty, count_mode="occurrence").to_frame(
df_elem_counts = count_elements(df_mp[formula_col], count_mode="occurrence").to_frame(
name=train_count_col
)
n_examp_for_rarest_elem_col = "Examples for rarest element in structure"
Expand Down
5 changes: 2 additions & 3 deletions scripts/project_compositions.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from pymatgen.core import Composition
from tqdm import tqdm

from matbench_discovery import DATA_DIR, id_col
from matbench_discovery import DATA_DIR, formula_col, id_col
from matbench_discovery.data import DATA_FILES
from matbench_discovery.slurm import slurm_submit

Expand Down Expand Up @@ -80,9 +80,8 @@ def sum_one_hot_elem(formula: str) -> np.ndarray[Any, np.int64]:
return sum(identity[el.Z - 1] * amt for el, amt in Composition(formula).items())


in_col = {"wbm": "formula", "mp": "formula_pretty"}[data_name]
one_hot_encoding = np.array(
[sum_one_hot_elem(formula) for formula in tqdm(df_in[in_col])]
[sum_one_hot_elem(formula) for formula in tqdm(df_in[formula_col])]
)

projections = projector.fit_transform(one_hot_encoding)
Expand Down
1 change: 1 addition & 0 deletions site/src/figs/mp-trj-n-sites-hist.svelte

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions site/src/figs/mp-vs-mp-trj-vs-wbm-arity-hist.svelte

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 46366d1

Please sign in to comment.