diff --git a/data/skipatom_mp2022/mp_2022_10_28.oxi.dim1000.model b/data/skipatom_mp2022/mp_2022_10_28.oxi.dim1000.model deleted file mode 100644 index ae30501..0000000 Binary files a/data/skipatom_mp2022/mp_2022_10_28.oxi.dim1000.model and /dev/null differ diff --git a/data/skipatom_mp2022/mp_2022_10_28.oxi.dim150.model b/data/skipatom_mp2022/mp_2022_10_28.oxi.dim150.model deleted file mode 100644 index cf22daf..0000000 Binary files a/data/skipatom_mp2022/mp_2022_10_28.oxi.dim150.model and /dev/null differ diff --git a/data/skipatom_mp2022/mp_2022_10_28.oxi.dim200.model b/data/skipatom_mp2022/mp_2022_10_28.oxi.dim200.model deleted file mode 100644 index caef15c..0000000 Binary files a/data/skipatom_mp2022/mp_2022_10_28.oxi.dim200.model and /dev/null differ diff --git a/data/skipatom_mp2022/mp_2022_10_28.oxi.dim200_c.model b/data/skipatom_mp2022/mp_2022_10_28.oxi.dim200_c.model deleted file mode 100644 index 01b8584..0000000 Binary files a/data/skipatom_mp2022/mp_2022_10_28.oxi.dim200_c.model and /dev/null differ diff --git a/data/skipatom_mp2022/mp_2022_10_28.oxi.dim250.model b/data/skipatom_mp2022/mp_2022_10_28.oxi.dim250.model deleted file mode 100644 index 6550f17..0000000 Binary files a/data/skipatom_mp2022/mp_2022_10_28.oxi.dim250.model and /dev/null differ diff --git a/data/skipatom_mp2022/mp_2022_10_28.oxi.dim260.model b/data/skipatom_mp2022/mp_2022_10_28.oxi.dim260.model deleted file mode 100644 index 6e1835d..0000000 Binary files a/data/skipatom_mp2022/mp_2022_10_28.oxi.dim260.model and /dev/null differ diff --git a/data/skipatom_mp2022/mp_2022_10_28.oxi.dim30.model b/data/skipatom_mp2022/mp_2022_10_28.oxi.dim30.model deleted file mode 100644 index 2508c1b..0000000 Binary files a/data/skipatom_mp2022/mp_2022_10_28.oxi.dim30.model and /dev/null differ diff --git a/data/skipatom_mp2022/mp_2022_10_28.oxi.dim300.model b/data/skipatom_mp2022/mp_2022_10_28.oxi.dim300.model deleted file mode 100644 index bba95ae..0000000 Binary files a/data/skipatom_mp2022/mp_2022_10_28.oxi.dim300.model and /dev/null differ diff --git a/data/skipatom_mp2022/mp_2022_10_28.oxi.dim400.model b/data/skipatom_mp2022/mp_2022_10_28.oxi.dim400.model deleted file mode 100644 index 968f621..0000000 Binary files a/data/skipatom_mp2022/mp_2022_10_28.oxi.dim400.model and /dev/null differ diff --git a/data/skipatom_mp2022/mp_2022_10_28.oxi.dim800.model b/data/skipatom_mp2022/mp_2022_10_28.oxi.dim800.model deleted file mode 100644 index 6c5c58f..0000000 Binary files a/data/skipatom_mp2022/mp_2022_10_28.oxi.dim800.model and /dev/null differ diff --git a/data/skipatom_mp2022/mp_2022_10_28.oxi.dim86.model b/data/skipatom_mp2022/mp_2022_10_28.oxi.dim86.model deleted file mode 100644 index 5cf7a2e..0000000 Binary files a/data/skipatom_mp2022/mp_2022_10_28.oxi.dim86.model and /dev/null differ diff --git a/data_prep/MP_skip_training_dataset.pdf b/data_prep/MP_skip_training_dataset.pdf deleted file mode 100644 index 9d4e78e..0000000 Binary files a/data_prep/MP_skip_training_dataset.pdf and /dev/null differ diff --git a/data_prep/MP_skip_training_dataset_alt.pdf b/data_prep/MP_skip_training_dataset_alt.pdf deleted file mode 100644 index 45c07b3..0000000 Binary files a/data_prep/MP_skip_training_dataset_alt.pdf and /dev/null differ diff --git a/data_prep/Property_dataset.pdf b/data_prep/Property_dataset.pdf deleted file mode 100644 index 87e14c4..0000000 Binary files a/data_prep/Property_dataset.pdf and /dev/null differ diff --git a/data_prep/mp_dataset_distribution.pdf b/data_prep/mp_dataset_distribution.pdf deleted file mode 100644 index dc0851a..0000000 Binary files a/data_prep/mp_dataset_distribution.pdf and /dev/null differ diff --git a/data_prep/oxi-mp_property_dataset_unique_formulas.json.gz b/data_prep/oxi-mp_property_dataset_unique_formulas.json.gz deleted file mode 100644 index 57ea9cd..0000000 Binary files a/data_prep/oxi-mp_property_dataset_unique_formulas.json.gz and /dev/null differ diff --git a/data_prep/process_dataset_latest.ipynb b/data_prep/process_dataset_latest.ipynb deleted file mode 100644 index bd18698..0000000 --- a/data_prep/process_dataset_latest.ipynb +++ /dev/null @@ -1,431 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from typing import Dict\n", - "import numpy as np\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "from monty.serialization import loadfn, dumpfn\n", - "import os\n", - "import pandas as pd\n", - "from matplotlib.colors import LogNorm\n", - "from matplotlib.ticker import LogFormatter" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data_path = \"../data/oxi-mp_property_dataset.json.gz\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = loadfn(data_path)\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def num_els(comp_dict):\n", - " return len(comp_dict.keys())\n", - "\n", - "\n", - "df[\"num_els\"] = df.composition.apply(num_els)\n", - "df[\"num_sp\"] = df.oxi_composition.apply(num_els)\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.query(\"num_els==4 & num_sp==6\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[\"oxi_composition\"][110018]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sns.set_theme(\n", - " context=\"paper\",\n", - " style=\"ticks\",\n", - " font_scale=1.3,\n", - ")\n", - "fig, axes = plt.subplots(1, 2, figsize=(12, 0.7 * 9), sharey=True)\n", - "\n", - "for ax, dat in zip(axes.flatten(), [\"num_els\", \"num_sp\"]):\n", - " sns.countplot(df, x=dat, ax=ax)\n", - " ax.bar_label(ax.containers[0])\n", - "\n", - "# sns.scatterplot(df,x=\"num_els\",y=\"num_sp\",alpha=0.4,s=100,ax=axes[2])\n", - "\n", - "axes[0].set_xlabel(\"Number of elements\")\n", - "axes[1].set_xlabel(\"Number of species\")\n", - "axes[0].set_ylabel(\"Number of compounds\")\n", - "axes[1].set_ylabel(\"\")\n", - "plt.tight_layout()\n", - "plt.savefig(\n", - " \"mp_dataset_distribution.pdf\", dpi=300, bbox_inches=\"tight\", transparent=True\n", - ")\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Create a layout with subplot2grid\n", - "fig = plt.figure(figsize=(12, 10))\n", - "\n", - "# Define the grid for subplots\n", - "ax1 = plt.subplot2grid((2, 4), (0, 0), colspan=2)\n", - "ax2 = plt.subplot2grid((2, 4), (0, 2), colspan=2, sharey=ax1)\n", - "ax3 = plt.subplot2grid((2, 4), (1, 1), colspan=2)\n", - "axes = [ax1, ax2, ax3]\n", - "# Use Seaborn to enhance the plots\n", - "sns.scatterplot(df, x=\"num_els\", y=\"num_sp\", alpha=0.4, s=100, ax=ax3, rasterized=True)\n", - "for ax, dat in zip([ax1, ax2], [\"num_els\", \"num_sp\"]):\n", - " sns.countplot(df, x=dat, ax=ax)\n", - " ax.bar_label(ax.containers[0])\n", - "\n", - "# Add labels, legends, etc. as needed\n", - "axes[0].set_xlabel(\"Number of elements\")\n", - "axes[1].set_xlabel(\"Number of species\")\n", - "axes[0].set_ylabel(\"Number of compounds\")\n", - "axes[1].set_ylabel(\"\")\n", - "\n", - "axes[2].set_xlabel(\"Number of elements\")\n", - "axes[2].set_ylabel(\"Number of species\")\n", - "axes[2].set_yticks(range(0, 11))\n", - "axes[2].set_xticks(range(0, 10))\n", - "axes[2].plot(\n", - " range(0, 11),\n", - " range(0, 11),\n", - " \"k:\",\n", - " label=\"$N_{elements}=N_{species}$\",\n", - ")\n", - "axes[2].legend()\n", - "fig.text(0.05, 1, \"(a)\", weight=\"bold\")\n", - "fig.text(0.52, 1, \"(b)\", weight=\"bold\")\n", - "fig.text(0.26, 0.46, \"(c)\", weight=\"bold\")\n", - "# Adjust spacing\n", - "# plt.subplots_adjust(hspace=0.5)\n", - "plt.tight_layout()\n", - "# Show the plot\n", - "plt.savefig(\n", - " \"MP_skip_training_dataset.pdf\", dpi=300, bbox_inches=\"tight\", transparent=True\n", - ")\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Create a layout with subplot2grid\n", - "fig = plt.figure(figsize=(1.2 * 12, 1.2 * 10))\n", - "\n", - "# Define the grid for subplots\n", - "ax1 = plt.subplot2grid((2, 4), (0, 0), colspan=2)\n", - "ax2 = plt.subplot2grid((2, 4), (0, 2), colspan=2, sharey=ax1)\n", - "ax3 = plt.subplot2grid((2, 4), (1, 1), colspan=2)\n", - "axes = [ax1, ax2, ax3]\n", - "# Use Seaborn to enhance the plots\n", - "for ax, dat in zip([ax1, ax2], [\"num_els\", \"num_sp\"]):\n", - " sns.countplot(df, x=dat, ax=ax)\n", - " ax.bar_label(ax.containers[0])\n", - "\n", - "sns.heatmap(\n", - " pivot,\n", - " annot=True,\n", - " cmap=\"Blues\",\n", - " fmt=\".0f\",\n", - " robust=True,\n", - " linewidth=0.5,\n", - " cbar_kws={\"label\": \"Number of structures\"},\n", - " norm=LogNorm(),\n", - " ax=ax3,\n", - ")\n", - "cbar = ax3.collections[0].colorbar\n", - "cbar.set_ticks([1, 10, 100, 1000, 10000]) # Customize ticks based on your data\n", - "cbar.set_ticklabels(\n", - " [\"1\", \"10\", \"100\", \"1000\", \"10000\"]\n", - ") # Customize labels based on your data\n", - "\n", - "\n", - "# Add labels, legends, etc. as needed\n", - "axes[0].set_xlabel(\"Number of elements\")\n", - "axes[1].set_xlabel(\"Number of species\")\n", - "axes[0].set_ylabel(\"Number of compounds\")\n", - "axes[1].set_ylabel(\"\")\n", - "\n", - "axes[2].set_xlabel(\"Number of elements\")\n", - "axes[2].set_ylabel(\"Number of species\")\n", - "axes[2].tick_params(axis=\"y\", rotation=0)\n", - "fig.text(0.05, 1, \"(a)\", weight=\"bold\")\n", - "fig.text(0.52, 1, \"(b)\", weight=\"bold\")\n", - "fig.text(0.26, 0.46, \"(c)\", weight=\"bold\")\n", - "# Adjust spacing\n", - "# plt.subplots_adjust(hspace=0.5)\n", - "plt.tight_layout()\n", - "# Show the plot\n", - "plt.savefig(\n", - " \"MP_skip_training_dataset_alt.pdf\", dpi=300, bbox_inches=\"tight\", transparent=True\n", - ")\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pivot = df.groupby([\"num_sp\", \"num_els\"]).size().unstack().fillna(0)\n", - "pivot.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fig, ax = plt.subplots(figsize=(12, 12))\n", - "sns.heatmap(\n", - " pivot,\n", - " annot=True,\n", - " cmap=\"Blues\",\n", - " fmt=\".0f\",\n", - " robust=True,\n", - " linewidth=0.5,\n", - " cbar_kws={\"label\": \"Number of structures\"},\n", - " norm=LogNorm(),\n", - " ax=ax,\n", - ")\n", - "cbar = ax.collections[0].colorbar\n", - "cbar.set_ticks([1, 10, 100, 1000, 10000]) # Customize ticks based on your data\n", - "cbar.set_ticklabels(\n", - " [\"1\", \"10\", \"100\", \"1000\", \"10000\"]\n", - ") # Customize labels based on your data\n", - "plt.yticks(rotation=0)\n", - "plt.tight_layout()\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(f\"Data set size: {df.shape[0]}\")\n", - "print(f\"Unique formula: {df.formula_pretty.nunique()}\")\n", - "unique_dicts_count_comp = len(df[\"composition\"].apply(str).unique())\n", - "\n", - "unique_dicts_count_oxi_comp = len(df[\"oxi_composition\"].apply(str).unique())\n", - "print(f\"Unique elemental compositions: {unique_dicts_count_comp}\")\n", - "\n", - "print(f\"Unique ionic compositions: {unique_dicts_count_oxi_comp}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Normalise compositions\n", - "\n", - "\n", - "def _get_fractional_composition(el_dict: str) -> Dict[str, float]:\n", - " elamt = {}\n", - " natoms = 0\n", - " for el, v in el_dict.items():\n", - " elamt[el] = v\n", - " natoms += abs(v)\n", - " return {el: elamt[el] / natoms for el in elamt}\n", - "\n", - "\n", - "_get_fractional_composition(df[\"oxi_composition\"][0])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "len(df[\"oxi_composition\"].apply(_get_fractional_composition).apply(str).unique())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[\"oxi_composition\"].apply(_get_fractional_composition).apply(str).value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "unique_df = df.sort_values(\n", - " by=[\"formula_pretty\", \"formation_energy_per_atom\"]\n", - ").drop_duplicates(subset=\"formula_pretty\", ignore_index=True)\n", - "print(f\"Data size with only lowest energy polymorphs: {len(unique_df)}\")\n", - "unique_df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "unique_df.is_magnetic.sum()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "unique_df.band_gap.mean()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "unique_df.query(\"band_gap > 0\").plot()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\"is_metal\".replace(\"_\", \" \").capitalize()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "props = [\"formation_energy_per_atom\", \"band_gap\", \"is_metal\", \"is_magnetic\"]\n", - "class_props = [\"is_metal\", \"is_magnetic\"]\n", - "units = {\"formation_energy_per_atom\": \"eV/atom\", \"band_gap\": \"eV\"}\n", - "fig, axes = plt.subplots(2, 2, figsize=(12, 9))\n", - "\n", - "for prop, ax in zip(props, axes.flatten()):\n", - " if prop in class_props:\n", - " sns.countplot(unique_df, x=prop, ax=ax)\n", - " ax.set_xlabel(prop.replace(\"_\", \" \").capitalize())\n", - " else:\n", - " sns.histplot(unique_df, x=prop, ax=ax, rasterized=True)\n", - " ax.set_xlabel(prop.replace(\"_\", \" \").capitalize() + f\" [{units[prop]}]\")\n", - " ax.set_yscale(\"log\")\n", - " # ax.set_xlabel(prop.replace(\"_\", \" \").capitalize())\n", - " ax.set_ylabel(\"Number of compounds\")\n", - "\n", - "fig.text(0.05, 1, \"(a)\", weight=\"bold\")\n", - "fig.text(0.52, 1, \"(b)\", weight=\"bold\")\n", - "fig.text(0.05, 0.52, \"(c)\", weight=\"bold\")\n", - "fig.text(0.52, 0.52, \"(d)\", weight=\"bold\")\n", - "\n", - "plt.tight_layout()\n", - "plt.savefig(\"Property_dataset.pdf\", dpi=300, bbox_inches=\"tight\", transparent=True)\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Export the unique formula dataset\n", - "dumpfn(unique_df, fn=\"../data/oxi-mp_property_dataset_unique_formulas.json.gz\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python [conda env:skipspecies]", - "language": "python", - "name": "conda-env-skipspecies-py" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.16" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}