add site/src/figs/mp-vs-mp-trj-vs-wbm-arity-hist.svelte and site/src/figs/mp-trj-n-sites-hist.svelte to /data page
janosh · Dec 1, 2023 · 46366d1 · 46366d1
1 parent 9376630
commit 46366d1
Show file tree

Hide file tree

Showing 12 changed files with 188 additions and 28 deletions.
diff --git a/data/mp/eda_mp_trj.py b/data/mp/eda_mp_trj.py
@@ -11,6 +11,7 @@
 import numpy as np
 import pandas as pd
 import plotly.express as px
+from pymatgen.core import Composition
 from pymatviz import count_elements, ptable_heatmap, ptable_heatmap_ratio, ptable_hists
 from pymatviz.io import save_fig
 from pymatviz.utils import si_fmt
@@ -22,10 +23,12 @@
     ROOT,
     SITE_FIGS,
     formula_col,
+    id_col,
+    n_sites_col,
     stress_col,
     stress_trace_col,
 )
-from matbench_discovery.data import DATA_FILES
+from matbench_discovery.data import DATA_FILES, df_wbm
 
 __author__ = "Janosh Riebesell"
 __date__ = "2023-11-22"
@@ -36,6 +39,22 @@
 forces_col = "forces"
 
 
+# %% load MP element counts by occurrence to compute ratio with MPtrj
+mp_occu_counts = pd.read_json(
+    f"{data_page}/mp-element-counts-by-occurrence.json", typ="series"
+)
+df_mp = pd.read_csv(DATA_FILES.mp_energies, na_filter=False).set_index(id_col)
+
+
+# %% --- load preprocessed MPtrj summary data ---
+mp_trj_summary_path = f"{DATA_DIR}/mp/mp-trj-2022-09-summary.json.bz2"
+if os.path.isfile(mp_trj_summary_path):
+    df_mp_trj = pd.read_json(mp_trj_summary_path)
+    df_mp_trj.index.name = "frame_id"
+else:
+    print("MPtrj summary data not found, run cell below to generate")
+
+
 # %% downloaded mptrj-gga-ggapu.tar.gz from https://drive.google.com/drive/folders/1JQ-ry1RHvNliVg1Ut5OuyUxne51RHiT_
 # and extracted the mptrj-gga-ggapu directory (6.2 GB) to data/mp using macOS Finder
 # then zipped it to mp-trj-extxyz.zip (also using Finder, 1.6 GB)
@@ -84,12 +103,7 @@
 
 
 # %%
-df_mp_trj.to_json(f"{DATA_DIR}/mp/mp-trj-2022-09-summary.json.bz2")
-
-
-# %% --- load preprocessed MPtrj summary data ---
-df_mp_trj = pd.read_json(f"{DATA_DIR}/mp/mp-trj-2022-09-summary.json.bz2")
-df_mp_trj.index.name = "frame_id"
+df_mp_trj.to_json(mp_trj_summary_path)
 
 
 # %% plot per-element magmom histograms
@@ -164,13 +178,6 @@
 save_fig(ax_ptable, f"{PDF_FIGS}/{img_name}.pdf")
 
 
-# %% load MP element counts by occurrence to compute ratio with MPtrj
-mp_occu_counts = pd.read_json(
-    f"{data_page}/mp-element-counts-by-occurrence.json", typ="series"
-)
-df_mp = pd.read_csv(DATA_FILES.mp_energies, na_filter=False)
-
-
 # %%
 normalized = True
 ax_ptable = ptable_heatmap_ratio(
@@ -271,3 +278,94 @@
 fig.show()
 save_fig(fig, f"{PDF_FIGS}/mp-trj-magmoms-hist.pdf", **pdf_kwds)
 save_fig(fig, f"{SITE_FIGS}/mp-trj-magmoms-hist.svelte")
+
+
+# %%
+arity_col = "arity"
+for df in (df_mp_trj, df_mp, df_wbm):
+    if arity_col not in df:
+        df[arity_col] = df[formula_col].map(Composition).map(len)
+
+
+# %%
+df_arity = pd.DataFrame(
+    {
+        key: df[arity_col].value_counts().sort_index() / len(df)
+        for key, df in (("MP", df_mp), ("MPtrj", df_mp_trj), ("WBM", df_wbm))
+    }
+)
+df_arity = df_arity.query("0 < index < 7")
+
+fig = px.bar(df_arity, barmode="group")
+fig.update_traces(marker_line_width=0)
+fig.layout.legend.update(x=1, y=1, xanchor="right", yanchor="top", title=None)
+fig.layout.margin = dict(l=0, r=0, b=0, t=0)
+fig.layout.yaxis.title = "Fraction of Structures in Dataset"
+fig.layout.xaxis.title = "Number of Elements in Formula"
+
+fig.show()
+img_name = "mp-vs-mp-trj-vs-wbm-arity-hist"
+save_fig(fig, f"{SITE_FIGS}/{img_name}.svelte")
+save_fig(fig, f"{PDF_FIGS}/{img_name}.pdf", width=450, height=280)
+
+
+# %% calc n_sites from forces len
+df_mp_trj[n_sites_col] = df_mp_trj[forces_col].map(len)
+log_y = False
+n_sites_hist, n_sites_bins = np.histogram(
+    df_mp_trj[n_sites_col], bins=range(1, df_mp_trj[n_sites_col].max() + 1)
+)
+
+n_struct_col = "Number of Structures"
+df_n_sites = pd.DataFrame({n_sites_col: n_sites_bins[:-1], n_struct_col: n_sites_hist})
+
+
+# %% plot n_sites distribution
+fig = px.bar(df_n_sites, x=n_sites_col, y=n_struct_col, log_y=log_y, range_x=(1, 200))
+# add inset plot with log scale
+fig.add_bar(
+    x=df_n_sites[n_sites_col],
+    y=df_n_sites[n_struct_col],
+    showlegend=False,
+    xaxis="x2",
+    yaxis="y2",
+    marker=dict(color=fig.data[0].marker.color),  # same color as main plot
+)
+
+bin_width = n_sites_bins[1] - n_sites_bins[0]
+fig.update_traces(width=bin_width, marker_line_width=0)
+# add cumulative distribution as 2nd y axis
+fig.add_scatter(
+    x=df_n_sites[n_sites_col],
+    y=df_n_sites[n_struct_col].cumsum() / df_n_sites[n_struct_col].sum(),
+    mode="lines",
+    name="Cumulative",
+    xaxis="x3",
+    yaxis="y3",
+    hovertemplate="x: %{x}<br>y: %{y:.1%}",
+)
+# add inset title 'log-scaled to show tail'
+inset_domain = [0.4, 1]
+fig.layout.xaxis2 = dict(domain=inset_domain, anchor="y2")
+fig.layout.yaxis2 = dict(
+    domain=inset_domain,
+    anchor="x2",
+    type="log",
+    title="log-scaled to show tail",
+    title_standoff=0,
+)
+
+fig.layout.yaxis3 = dict(  # move y3 axis to right side of y2
+    overlaying="y2", side="right", tickformat=".0%"
+)
+fig.layout.xaxis3 = dict(overlaying="x2", visible=False)
+
+fig.layout.margin = dict(l=5, r=5, b=5, t=5)
+fig.layout.legend.update(x=0.96, y=0.25, xanchor="right")
+fig.show()
+
+img_name = "mp-trj-n-sites-hist"
+if log_y:
+    img_name += "-log"
+save_fig(fig, f"{SITE_FIGS}/{img_name}.svelte")
+# save_fig(fig, f"{PDF_FIGS}/{img_name}.pdf", width=450, height=300)
diff --git a/data/mp/get_mp_energies.py b/data/mp/get_mp_energies.py
@@ -8,7 +8,13 @@
 from pymatviz.utils import annotate_metrics
 from tqdm import tqdm
 
-from matbench_discovery import STABILITY_THRESHOLD, id_col, today
+from matbench_discovery import (
+    STABILITY_THRESHOLD,
+    formula_col,
+    id_col,
+    n_sites_col,
+    today,
+)
 from matbench_discovery.data import DATA_FILES
 
 """
@@ -34,6 +40,7 @@
     "energy_above_hull",
     "decomposition_enthalpy",
     "energy_type",
+    "nsites",
 }
 
 with MPRester(use_document_model=False) as mpr:
@@ -47,6 +54,7 @@
 
 # %%
 df = pd.DataFrame(docs).set_index(id_col)
+df = df.rename(columns={"formula_pretty": formula_col, "nsites": n_sites_col})
 
 df_spg = pd.json_normalize(df.pop("symmetry"))[["number", "symbol"]]
 df["spacegroup_symbol"] = df_spg.symbol.to_numpy()
@@ -106,7 +114,7 @@
     y="energy_above_hull",
     color=mask_above_line.map({True: "red", False: "blue"}),
     # backend="plotly",
-    # hover_data=["index", "formula_pretty", "formation_energy_per_atom"],
+    hover_data=["index", formula_col, "formation_energy_per_atom"],
 )
 # most points lie on line y=x for x > 0 and y = 0 for x < 0.
 n_above_line = sum(mask_above_line)

diff --git a/data/wbm/eda_wbm.py b/data/wbm/eda_wbm.py
@@ -41,7 +41,9 @@
 
 
 # %% load MP training set
-df_mp = pd.read_csv(DATA_FILES.mp_energies, na_filter=False)
+df_mp = pd.read_csv(DATA_FILES.mp_energies, na_filter=False, na_values=[])
+
+df_mp[df_mp[formula_col].isna()]  # interactive check: rows missing a formula
 
 
 # %%
@@ -50,10 +52,8 @@
 )
 wbm_comp_counts = count_elements(df_wbm[formula_col], count_mode="composition")
 
-mp_occu_counts = count_elements(df_mp.formula_pretty, count_mode="occurrence").astype(
-    int
-)
-mp_comp_counts = count_elements(df_mp.formula_pretty, count_mode="composition")
+mp_occu_counts = count_elements(df_mp[formula_col], count_mode="occurrence").astype(int)
+mp_comp_counts = count_elements(df_mp[formula_col], count_mode="composition")
 
 all_counts = (
     ("wbm", "occurrence", wbm_occu_counts),
@@ -315,3 +315,27 @@
 # https://github.com/plotly/plotly.py/issues/4115
 # https://github.com/plotly/plotly.js/issues/5341
 # https://github.com/plotly/plotly.js/issues/4728
+
+
+# %% compute compositional arity histograms
+arity_col = "arity"
+df_wbm[arity_col] = df_wbm[formula_col].map(Composition).map(len)
+df_mp[arity_col] = df_mp[formula_col].map(Composition).map(len)
+
+mp_arity_counts = df_mp[arity_col].value_counts().sort_index() / len(df_mp)
+wbm_arity_counts = df_wbm[arity_col].value_counts().sort_index() / len(df_wbm)
+
+df_arity = pd.DataFrame({"MP": mp_arity_counts, "WBM": wbm_arity_counts}).query(
+    "0 < index < 7"
+)
+
+fig = px.bar(df_arity, barmode="group")
+fig.layout.legend.update(x=1, y=1, xanchor="right", yanchor="top", title=None)
+fig.layout.margin = dict(l=0, r=0, b=0, t=0)
+fig.layout.yaxis.title = "Fraction of Structures in Dataset"
+fig.layout.xaxis.title = "Number of Elements in Formula"
+
+fig.show()
+img_name = "mp-vs-wbm-arity-hist"
+save_fig(fig, f"{SITE_FIGS}/{img_name}.svelte")
+save_fig(fig, f"{PDF_FIGS}/{img_name}.pdf", width=450, height=280)
diff --git a/matbench_discovery/__init__.py b/matbench_discovery/__init__.py
@@ -48,6 +48,7 @@
 formula_col = "formula"
 stress_col = "stress"
 stress_trace_col = "stress_trace"
+n_sites_col = "n_sites"
 
 # load figshare 1.0.0
 with open(f"{FIGSHARE}/1.0.0.json") as file:
@@ -65,7 +66,7 @@
     crystal_sys="Crystal system",
     spg_num="Space group",
     n_wyckoff="Number of Wyckoff positions",
-    n_sites="Lattice site count",
+    n_sites="Number of atoms",
     energy_per_atom=f"Energy {ev_per_atom}",
     e_form=f"DFT E<sub>form</sub> {ev_per_atom}",
     e_above_hull=f"E<sub>hull dist</sub> {ev_per_atom}",

diff --git a/scripts/analyze_model_failure_cases.py b/scripts/analyze_model_failure_cases.py
@@ -155,7 +155,7 @@
 # %%
 df_mp = pd.read_csv(DATA_FILES.mp_energies, na_filter=False).set_index(id_col)
 train_count_col = "MP Occurrences"
-df_elem_counts = count_elements(df_mp.formula_pretty, count_mode="occurrence").to_frame(
+df_elem_counts = count_elements(df_mp[formula_col], count_mode="occurrence").to_frame(
     name=train_count_col
 )
 n_examp_for_rarest_elem_col = "Examples for rarest element in structure"

diff --git a/scripts/project_compositions.py b/scripts/project_compositions.py
@@ -11,7 +11,7 @@
 from pymatgen.core import Composition
 from tqdm import tqdm
 
-from matbench_discovery import DATA_DIR, id_col
+from matbench_discovery import DATA_DIR, formula_col, id_col
 from matbench_discovery.data import DATA_FILES
 from matbench_discovery.slurm import slurm_submit
 
@@ -80,9 +80,8 @@ def sum_one_hot_elem(formula: str) -> np.ndarray[Any, np.int64]:
     return sum(identity[el.Z - 1] * amt for el, amt in Composition(formula).items())
 
 
-in_col = {"wbm": "formula", "mp": "formula_pretty"}[data_name]
 one_hot_encoding = np.array(
-    [sum_one_hot_elem(formula) for formula in tqdm(df_in[in_col])]
+    [sum_one_hot_elem(formula) for formula in tqdm(df_in[formula_col])]
 )
 
 projections = projector.fit_transform(one_hot_encoding)

diff --git a/site/src/figs/mp-trj-n-sites-hist.svelte b/site/src/figs/mp-trj-n-sites-hist.svelte
diff --git a/site/src/figs/mp-vs-mp-trj-vs-wbm-arity-hist.svelte b/site/src/figs/mp-vs-mp-trj-vs-wbm-arity-hist.svelte