fix direct data file download URLs on /contribute page
add tqdm progress bars to upload_to_figshare.py
add prompt to delete new Figshare article if error occurs
use gzip CLI in get_mp_traj.py to check all files for archive corruption
rename compute_projections.py to project_compositions.py
janosh committed Jun 20, 2023
1 parent cc08c78 commit 83736ad
Showing 7 changed files with 130 additions and 47 deletions.
40 changes: 32 additions & 8 deletions data/figshare/1.0.0.json
@@ -1,10 +1,34 @@
{
"mp_computed_structure_entries": "https://figshare.com/ndownloader/files/40344436",
"mp_elemental_ref_entries": "https://figshare.com/ndownloader/files/40387775",
"mp_energies": "https://figshare.com/ndownloader/files/40344448",
"mp_patched_phase_diagram": "https://figshare.com/ndownloader/files/40344451",
"wbm_computed_structure_entries": "https://figshare.com/ndownloader/files/40344463",
"wbm_initial_structures": "https://figshare.com/ndownloader/files/40344466",
"wbm_cses_plus_init_structs": "https://figshare.com/ndownloader/files/40344469",
"wbm_summary": "https://figshare.com/ndownloader/files/40407575"
"mp_computed_structure_entries": [
"https://figshare.com/ndownloader/files/40344436",
"2023-02-07-mp-computed-structure-entries.json.gz"
],
"mp_elemental_ref_entries": [
"https://figshare.com/ndownloader/files/40387775",
"2023-02-07-mp-elemental-reference-entries.json.gz"
],
"mp_energies": [
"https://figshare.com/ndownloader/files/40344448",
"2023-01-10-mp-energies.csv"
],
"mp_patched_phase_diagram": [
"https://figshare.com/ndownloader/files/40344451",
"2023-02-07-ppd-mp.pkl.gz"
],
"wbm_computed_structure_entries": [
"https://figshare.com/ndownloader/files/40344463",
"2022-10-19-wbm-computed-structure-entries.json.bz2"
],
"wbm_initial_structures": [
"https://figshare.com/ndownloader/files/40344466",
"2022-10-19-wbm-init-structs.json.bz2"
],
"wbm_cses_plus_init_structs": [
"https://figshare.com/ndownloader/files/40344469",
"2022-10-19-wbm-computed-structure-entries+init-structs.json.bz2"
],
"wbm_summary": [
"https://figshare.com/ndownloader/files/40407575",
"2022-10-19-wbm-summary.csv"
]
}
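
Each key now maps to a two-element array of download URL and original filename instead of a bare URL string, so consumers can restore a file under its canonical name without an extra API call. A minimal sketch of reading the new format (paths assume the repo root as working directory):

```python
import json
import urllib.request

with open("data/figshare/1.0.0.json") as file:
    figshare_urls = json.load(file)

# every value is now a (download URL, original filename) pair
url, file_name = figshare_urls["wbm_summary"]
urllib.request.urlretrieve(url, file_name)  # saves 2022-10-19-wbm-summary.csv
```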
30 changes: 24 additions & 6 deletions data/mp/get_mp_traj.py
@@ -1,14 +1,20 @@
"""Download all MP ionic steps on 2023-03-15."""
"""Download all MP ionic steps using direct read-access to the mp_core DB.
Gzipped JSON is ~15GB.
On a good connection, takes about 15 min per batch * 140 batches = 35 h.
"""


# %%
import os
import subprocess
from glob import glob

import pandas as pd
from emmet.core.tasks import TaskDoc
from pymongo import MongoClient
from pymongo.database import Database
from tqdm import trange
from tqdm import tqdm, trange

from matbench_discovery import ROOT, today

@@ -36,8 +42,8 @@
fields = "task_id formula_pretty run_type nsites task_type tags completed_at".split()

if os.path.isfile(ids_path):
print(f"Found existing list of task IDs to query at {ids_path=}")
df_tasks = pd.read_csv(ids_path).set_index("task_id")
print(f"Found existing list of task IDs to query at\n{ids_path=}")
df_tasks = pd.read_csv(ids_path, low_memory=False).set_index("task_id")
else:
print(f"Querying all task docs from {db_name}\n{fields=}.\nThis takes a while...")
task_docs = sorted(
@@ -97,6 +103,18 @@


# %% inspect saved task docs for expected data
df_10k = pd.read_json(
f"{module_dir}/mp-tasks/mp-1708653__mp-1735769.json.gz"
df_batch = pd.read_json(
f"{module_dir}/mp-tasks/mp-531529__mp-568116.json.gz"
).set_index("task_id")

print(f"{len(df_batch)=}")
df_batch.head()


# %% use gzip CLI to check all files for archive corruption
for path in tqdm(glob(f"{module_dir}/mp-tasks/*.json.gz")):
    try:
        # capture stderr as text so exc.stderr below is a string, not None
        subprocess.run(["gzip", "-t", path], check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as exc:
        print(f"{path} failed gzip integrity check: {exc.stderr}")
        # os.remove(path)
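
For machines without the gzip CLI, the same corruption check can run in pure Python by decompressing each archive and discarding the output — a sketch, with `is_valid_gzip` being a hypothetical helper, not part of the repo:

```python
import gzip
import zlib

def is_valid_gzip(path: str) -> bool:
    """Return True if the gzip archive decompresses cleanly, False on corruption."""
    try:
        with gzip.open(path, "rb") as gz_file:
            while gz_file.read(2**20):  # decompress in 1 MiB chunks, discard output
                pass
    except (OSError, EOFError, zlib.error):
        return False
    return True
```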
2 changes: 1 addition & 1 deletion matbench_discovery/data.py
@@ -92,7 +92,7 @@ def load(

cache_path = f"{cache_dir}/{file}"
if not os.path.isfile(cache_path): # download from Figshare URL
url = file_urls[data_key]
url = file_urls[data_key][0]
print(f"Downloading {data_key!r} from {url}")
try:
# ensure directory exists
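
With each `file_urls` entry now a `(URL, filename)` pair, indexing `[0]` recovers the download URL while keeping the original filename available. Caller-side usage is unchanged — a hedged sketch assuming the `load(data_key)` signature implied by the hunk above:

```python
from matbench_discovery.data import load

# first call downloads the file from its Figshare URL and caches it;
# subsequent calls read straight from the local cache
df_wbm_summary = load("wbm_summary")
```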
2 changes: 1 addition & 1 deletion compute_projections.py → project_compositions.py
@@ -1,4 +1,4 @@
"""Compute t-SNE and UMAP projections of the WBM and MP datasets."""
"""Compute t-SNE or UMAP projections of WBM and MP compositions."""


# %%
48 changes: 29 additions & 19 deletions scripts/upload_to_figshare.py
@@ -13,6 +13,7 @@
import requests
import tomllib # needs python 3.11
from requests.exceptions import HTTPError
from tqdm.auto import tqdm

from matbench_discovery import ROOT
from matbench_discovery.data import DATA_FILES
@@ -21,6 +22,7 @@
__date__ = "2023-04-27"

with open(f"{ROOT}/site/.env") as file:
# TOKEN: length 128, alphanumeric (e.g. 271431c6a94ff7...)
TOKEN = file.read().split("figshare_token=")[1].split("\n")[0]

BASE_URL = "https://api.figshare.com/v2"
@@ -51,7 +53,7 @@

def make_request(method: str, url: str, data: Any = None, binary: bool = False) -> Any:
"""Make a token-authorized HTTP request to the Figshare API."""
headers = {"Authorization": "token " + TOKEN}
headers = {"Authorization": f"token {TOKEN}"}
if data is not None and not binary:
data = json.dumps(data)
response = requests.request(method, url, headers=headers, data=data)
@@ -95,22 +97,20 @@ def upload_file_to_figshare(article_id: int, file_path: str) -> int:
data = dict(name=os.path.basename(file_path), md5=md5, size=size)
endpoint = f"{BASE_URL}/account/articles/{article_id}/files"
result = make_request("POST", endpoint, data=data)
print(f"Initiated file upload: {result['location']}\n")
file_info = make_request("GET", result["location"])

# Upload parts
url = file_info["upload_url"]
result = make_request("GET", url)
with open(file_path, "rb") as file:
for part in result["parts"]:
for part in tqdm(result["parts"], desc=file_path):
# Upload part
u_data = file_info.copy()
u_data.update(part)
url = f'{u_data["upload_url"]}/{part["partNo"]}'
url = f"{u_data['upload_url']}/{part['partNo']}"
file.seek(part["startOffset"])
chunk = file.read(part["endOffset"] - part["startOffset"] + 1)
make_request("PUT", url, data=chunk, binary=True)
print(f'\tUploaded part {part["partNo"]}')

# Complete upload
make_request("POST", f"{endpoint}/{file_info['id']}")
@@ -127,20 +127,30 @@ def main() -> int:
"categories": list(CATEGORIES),
"references": REFERENCES,
}
article_id = create_article(metadata)
uploaded_files: dict[str, str] = {}
for key, file_path in DATA_FILES.items():
file_id = upload_file_to_figshare(article_id, file_path)
file_url = f"https://figshare.com/ndownloader/files/{file_id}"
uploaded_files[key] = file_url

print("\nUploaded files:")
for file_path, file_url in uploaded_files.items():
print(f"{file_path}: {file_url}")

# write to JSON file
with open(file_urls_out_path, "w") as file:
json.dump(uploaded_files, file)
try:
article_id = create_article(metadata)
uploaded_files: dict[str, tuple[str, str]] = {}
pbar = tqdm(DATA_FILES.items(), desc="Uploading to Figshare")
for key, file_path in pbar:
pbar.set_postfix(file=key)
file_id = upload_file_to_figshare(article_id, file_path)
file_url = f"https://figshare.com/ndownloader/files/{file_id}"
uploaded_files[key] = (file_url, file_path.split("/")[-1])

print("\nUploaded files:")
for file_path, (file_url, _) in uploaded_files.items():
print(f"{file_path}: {file_url}")

# write uploaded file keys mapped to their URLs to JSON
with open(file_urls_out_path, "w") as file:
json.dump(uploaded_files, file)
except Exception as exc: # prompt to delete article if something went wrong
answer = ""
print(f"Encountered {exc=}")
while answer not in ("y", "n"):
answer = input("Delete article? [y/n] ")
if answer == "y":
make_request("DELETE", f"{BASE_URL}/account/articles/{article_id}")

return 0

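
Upload initiation (top of `upload_file_to_figshare`) posts the file's name, MD5 checksum, and byte size so Figshare can validate the assembled parts. A sketch of how `md5` and `size` are typically computed — the helper name is illustrative, not the repo's actual function:

```python
import hashlib

def get_md5_and_size(file_path: str, chunk_size: int = 10_000_000) -> tuple[str, int]:
    """Stream the file in chunks so multi-GB archives never fully load into memory."""
    md5, size = hashlib.md5(), 0
    with open(file_path, "rb") as file:
        while chunk := file.read(chunk_size):
            size += len(chunk)
            md5.update(chunk)
    return md5.hexdigest(), size
```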
51 changes: 41 additions & 10 deletions site/src/routes/contribute/+page.md
@@ -1,5 +1,32 @@
<script>
import { name, repository as repo, homepage } from "$site/package.json";
import { name, repository as repo, homepage } from "$site/package.json"
import figshare_urls from "$root/data/figshare/1.0.0.json"
import { Tooltip } from 'svelte-zoo'

const ppd_doc_url = `https://github.com/materialsproject/pymatgen/blob/v2023.5.10/pymatgen/analysis/phase_diagram.py#L1480-L1814`
const ppd_link = `<a href=${ppd_doc_url}>PatchedPhaseDiagram</a>`
const cse_doc_url = `https://github.com/materialsproject/pymatgen/blob/v2023.5.10/pymatgen/entries/computed_entries.py#L579-L722`
const cse_link = `<a href=${cse_doc_url}>ComputedStructureEntry</a>`

const descriptions = {
mp_computed_structure_entries:
`JSON-serialized MP ${cse_link} objects containing relaxed structures and DFT final energies`,
mp_elemental_ref_entries: `Minimum energy ComputedEntry for each element in MP`,
mp_energies: `Materials Project formation energies and energies above convex hull`,
mp_patched_phase_diagram:
`${ppd_link} constructed from all MP ComputedStructureEntries`,
wbm_computed_structure_entries: `DFT-relaxed WBM structures along with their final VASP energies, stored as ${cse_link}`,
wbm_initial_structures: `Unrelaxed WBM structures`,
wbm_cses_plus_init_structs: `Both unrelaxed and DFT-relaxed WBM structures, the latter stored with their final VASP energies as ${cse_link}`,
wbm_summary:
`Computed material properties only, no structures. Available properties are VASP energy, formation energy, energy above the convex hull, volume, band gap, number of sites per unit cell, and more.`,
}
const desc_keys = Object.keys(descriptions).sort()
const figshare_keys = Object.keys(figshare_urls).sort()
const missing = figshare_keys.filter((key) => !desc_keys.includes(key))
if (missing.length > 0) {
console.error(`descriptions must contain all figshare_urls keys, missing=${missing}`)
}
</script>

# How to contribute
@@ -75,15 +102,19 @@ assert list(df_wbm) == [

## 📥 &thinsp; Direct Download

You can also download the data files directly from GitHub:

1. [`2022-10-19-wbm-summary.csv`]({repo}/blob/-/data/wbm/2022-10-19-wbm-summary.csv): Computed material properties only, no structures. Available properties are VASP energy, formation energy, energy above the convex hull, volume, band gap, number of sites per unit cell, and more.
1. [`2022-10-19-wbm-init-structs.json`]({repo}/blob/-/data/wbm/2022-10-19-wbm-init-structs.json): Unrelaxed WBM structures
1. [`2022-10-19-wbm-cses.json`]({repo}/blob/-/data/wbm/2022-10-19-wbm-cses.json): Relaxed WBM structures along with final VASP energies
1. [`2023-01-10-mp-energies.json.gz`]({repo}/blob/-/data/mp/2023-01-10-mp-energies.json.gz): Materials Project formation energies and energies above convex hull
1. [`2023-02-07-mp-computed-structure-entries.json.gz`]({repo}/blob/-/data/mp/2023-02-07-mp-computed-structure-entries.json.gz): Materials Project computed structure entries
1. [`2023-02-07-ppd-mp.pkl.gz`]({repo}/blob/-/data/mp/2023-02-07-ppd-mp.pkl.gz): [PatchedPhaseDiagram](https://pymatgen.org/pymatgen.analysis.phase_diagram.html#pymatgen.analysis.phase_diagram.PatchedPhaseDiagram) constructed from all MP ComputedStructureEntries
1. [`2023-02-07-mp-elemental-reference-entries.json.gz`]({repo}/blob/-/data/mp/2023-02-07-mp-elemental-reference-entries.json.gz): Minimum energy PDEntries for each element present in the Materials Project
You can also download the data files directly from Figshare:

<ol>
{#each Object.entries(figshare_urls) as [key, lst]}
{@const [href, file_name] = lst}
<li>
<Tooltip text={file_name}>
<a {href}>{key}</a>:
</Tooltip>
{@html descriptions[key]}
</li>
{/each}
</ol>

[wbm paper]: https://nature.com/articles/s41524-020-00481-6

4 changes: 2 additions & 2 deletions tests/test_data.py
@@ -67,7 +67,7 @@ def test_load(

stdout, _stderr = capsys.readouterr()

assert f"Downloading {data_key!r} from {figshare_urls[data_key]}" in stdout
assert f"Downloading {data_key!r} from {figshare_urls[data_key][0]}" in stdout

# check we called read_csv/read_json once for each data_name
assert urlretrieve.call_count == 1
@@ -172,7 +172,7 @@ def test_load_no_mock(
rel_path = getattr(type(DATA_FILES), file_key)
cache_path = f"{tmp_path}/{rel_path}"
assert (
f"Downloading {file_key!r} from {figshare_urls[file_key]}\nCached "
f"Downloading {file_key!r} from {figshare_urls[file_key][0]}\nCached "
f"{file_key!r} to {cache_path!r}" in stdout
)

Expand Down
