fix direct data file download URLs on /contribute page
add tqdm progress bars to upload_to_figshare.py
add prompt to delete new Figshare article if error occurs
use gzip CLI in get_mp_traj.py to check all files for archive corruption
rename compute_projections.py to project_compositions.py
janosh committed Jun 20, 2023
1 parent cc08c78 commit 83736ad
Showing 7 changed files with 130 additions and 47 deletions.
40 changes: 32 additions & 8 deletions data/figshare/1.0.0.json
@@ -1,10 +1,34 @@
{
"mp_computed_structure_entries": "https://figshare.com/ndownloader/files/40344436",
"mp_elemental_ref_entries": "https://figshare.com/ndownloader/files/40387775",
"mp_energies": "https://figshare.com/ndownloader/files/40344448",
"mp_patched_phase_diagram": "https://figshare.com/ndownloader/files/40344451",
"wbm_computed_structure_entries": "https://figshare.com/ndownloader/files/40344463",
"wbm_initial_structures": "https://figshare.com/ndownloader/files/40344466",
"wbm_cses_plus_init_structs": "https://figshare.com/ndownloader/files/40344469",
"wbm_summary": "https://figshare.com/ndownloader/files/40407575"
"mp_computed_structure_entries": [
"https://figshare.com/ndownloader/files/40344436",
"2023-02-07-mp-computed-structure-entries.json.gz"
],
"mp_elemental_ref_entries": [
"https://figshare.com/ndownloader/files/40387775",
"2023-02-07-mp-elemental-reference-entries.json.gz"
],
"mp_energies": [
"https://figshare.com/ndownloader/files/40344448",
"2023-01-10-mp-energies.csv"
],
"mp_patched_phase_diagram": [
"https://figshare.com/ndownloader/files/40344451",
"2023-02-07-ppd-mp.pkl.gz"
],
"wbm_computed_structure_entries": [
"https://figshare.com/ndownloader/files/40344463",
"2022-10-19-wbm-computed-structure-entries.json.bz2"
],
"wbm_initial_structures": [
"https://figshare.com/ndownloader/files/40344466",
"2022-10-19-wbm-init-structs.json.bz2"
],
"wbm_cses_plus_init_structs": [
"https://figshare.com/ndownloader/files/40344469",
"2022-10-19-wbm-computed-structure-entries+init-structs.json.bz2"
],
"wbm_summary": [
"https://figshare.com/ndownloader/files/40407575",
"2022-10-19-wbm-summary.csv"
]
}
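
Each key now maps to a two-element array of download URL and original filename instead of a bare URL string, so consumers can restore a file under its canonical name without an extra API call. A minimal sketch of reading the new format (paths assume the repo root as working directory):

```python
import json
import urllib.request

with open("data/figshare/1.0.0.json") as file:
    figshare_urls = json.load(file)

# every value is now a (download URL, original filename) pair
url, file_name = figshare_urls["wbm_summary"]
urllib.request.urlretrieve(url, file_name)  # saves 2022-10-19-wbm-summary.csv
```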
30 changes: 24 additions & 6 deletions data/mp/get_mp_traj.py
@@ -1,14 +1,20 @@
"""Download all MP ionic steps on 2023-03-15."""
"""Download all MP ionic steps using direct read-access to the mp_core DB.
Gzipped JSON is ~15GB.
On a good connection, takes about 15 min per batch * 140 batches = 35 h.
"""


# %%
import os
import subprocess
from glob import glob

import pandas as pd
from emmet.core.tasks import TaskDoc
from pymongo import MongoClient
from pymongo.database import Database
from tqdm import trange
from tqdm import tqdm, trange

from matbench_discovery import ROOT, today

@@ -36,8 +42,8 @@
fields = "task_id formula_pretty run_type nsites task_type tags completed_at".split()

if os.path.isfile(ids_path):
print(f"Found existing list of task IDs to query at {ids_path=}")
df_tasks = pd.read_csv(ids_path).set_index("task_id")
print(f"Found existing list of task IDs to query at\n{ids_path=}")
df_tasks = pd.read_csv(ids_path, low_memory=False).set_index("task_id")
else:
print(f"Querying all task docs from {db_name}\n{fields=}.\nThis takes a while...")
task_docs = sorted(
@@ -97,6 +103,18 @@


# %% inspect saved task docs for expected data
df_10k = pd.read_json(
f"{module_dir}/mp-tasks/mp-1708653__mp-1735769.json.gz"
df_batch = pd.read_json(
f"{module_dir}/mp-tasks/mp-531529__mp-568116.json.gz"
).set_index("task_id")

print(f"{len(df_batch)=}")
df_batch.head()


# %% use gzip CLI to check all files for archive corruption
for path in tqdm(glob(f"{module_dir}/mp-tasks/*.json.gz")):
    try:
        # capture stderr as text so exc.stderr below is a string, not None
        subprocess.run(["gzip", "-t", path], check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as exc:
        print(f"{path} failed gzip integrity check: {exc.stderr}")
        # os.remove(path)
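
For machines without the gzip CLI, the same corruption check can run in pure Python by decompressing each archive and discarding the output — a sketch, with `is_valid_gzip` being a hypothetical helper, not part of the repo:

```python
import gzip
import zlib

def is_valid_gzip(path: str) -> bool:
    """Return True if the gzip archive decompresses cleanly, False on corruption."""
    try:
        with gzip.open(path, "rb") as gz_file:
            while gz_file.read(2**20):  # decompress in 1 MiB chunks, discard output
                pass
    except (OSError, EOFError, zlib.error):
        return False
    return True
```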
2 changes: 1 addition & 1 deletion matbench_discovery/data.py
@@ -92,7 +92,7 @@ def load(

cache_path = f"{cache_dir}/{file}"
if not os.path.isfile(cache_path): # download from Figshare URL
url = file_urls[data_key]
url = file_urls[data_key][0]
print(f"Downloading {data_key!r} from {url}")
try:
# ensure directory exists
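
With each `file_urls` entry now a `(URL, filename)` pair, indexing `[0]` recovers the download URL while keeping the original filename available. Caller-side usage is unchanged — a hedged sketch assuming the `load(data_key)` signature implied by the hunk above:

```python
from matbench_discovery.data import load

# first call downloads the file from its Figshare URL and caches it;
# subsequent calls read straight from the local cache
df_wbm_summary = load("wbm_summary")
```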
2 changes: 1 addition & 1 deletion compute_projections.py → project_compositions.py
@@ -1,4 +1,4 @@
"""Compute t-SNE and UMAP projections of the WBM and MP datasets."""
"""Compute t-SNE or UMAP projections of WBM and MP compositions."""


# %%
48 changes: 29 additions & 19 deletions scripts/upload_to_figshare.py
@@ -13,6 +13,7 @@
import requests
import tomllib # needs python 3.11
from requests.exceptions import HTTPError
from tqdm.auto import tqdm

from matbench_discovery import ROOT
from matbench_discovery.data import DATA_FILES
@@ -21,6 +22,7 @@
__date__ = "2023-04-27"

with open(f"{ROOT}/site/.env") as file:
# TOKEN: length 128, alphanumeric (e.g. 271431c6a94ff7...)
TOKEN = file.read().split("figshare_token=")[1].split("\n")[0]

BASE_URL = "https://api.figshare.com/v2"
@@ -51,7 +53,7 @@

def make_request(method: str, url: str, data: Any = None, binary: bool = False) -> Any:
"""Make a token-authorized HTTP request to the Figshare API."""
headers = {"Authorization": "token " + TOKEN}
headers = {"Authorization": f"token {TOKEN}"}
if data is not None and not binary:
data = json.dumps(data)
response = requests.request(method, url, headers=headers, data=data)
@@ -95,22 +97,20 @@ def upload_file_to_figshare(article_id: int, file_path: str) -> int:
data = dict(name=os.path.basename(file_path), md5=md5, size=size)
endpoint = f"{BASE_URL}/account/articles/{article_id}/files"
result = make_request("POST", endpoint, data=data)
print(f"Initiated file upload: {result['location']}\n")
file_info = make_request("GET", result["location"])

# Upload parts
url = file_info["upload_url"]
result = make_request("GET", url)
with open(file_path, "rb") as file:
for part in result["parts"]:
for part in tqdm(result["parts"], desc=file_path):
# Upload part
u_data = file_info.copy()
u_data.update(part)
url = f'{u_data["upload_url"]}/{part["partNo"]}'
url = f"{u_data['upload_url']}/{part['partNo']}"
file.seek(part["startOffset"])
chunk = file.read(part["endOffset"] - part["startOffset"] + 1)
make_request("PUT", url, data=chunk, binary=True)
print(f'\tUploaded part {part["partNo"]}')

# Complete upload
make_request("POST", f"{endpoint}/{file_info['id']}")
@@ -127,20 +127,30 @@ def main() -> int:
"categories": list(CATEGORIES),
"references": REFERENCES,
}
article_id = create_article(metadata)
uploaded_files: dict[str, str] = {}
for key, file_path in DATA_FILES.items():
file_id = upload_file_to_figshare(article_id, file_path)
file_url = f"https://figshare.com/ndownloader/files/{file_id}"
uploaded_files[key] = file_url

print("\nUploaded files:")
for file_path, file_url in uploaded_files.items():
print(f"{file_path}: {file_url}")

# write to JSON file
with open(file_urls_out_path, "w") as file:
json.dump(uploaded_files, file)
try:
article_id = create_article(metadata)
uploaded_files: dict[str, tuple[str, str]] = {}
pbar = tqdm(DATA_FILES.items(), desc="Uploading to Figshare")
for key, file_path in pbar:
pbar.set_postfix(file=key)
file_id = upload_file_to_figshare(article_id, file_path)
file_url = f"https://figshare.com/ndownloader/files/{file_id}"
uploaded_files[key] = (file_url, file_path.split("/")[-1])

print("\nUploaded files:")
for file_path, (file_url, _) in uploaded_files.items():
print(f"{file_path}: {file_url}")

# write uploaded file keys mapped to their URLs to JSON
with open(file_urls_out_path, "w") as file:
json.dump(uploaded_files, file)
except Exception as exc: # prompt to delete article if something went wrong
answer = ""
print(f"Encountered {exc=}")
while answer not in ("y", "n"):
answer = input("Delete article? [y/n] ")
if answer == "y":
make_request("DELETE", f"{BASE_URL}/account/articles/{article_id}")

return 0

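
Upload initiation (top of `upload_file_to_figshare`) posts the file's name, MD5 checksum, and byte size so Figshare can validate the assembled parts. A sketch of how `md5` and `size` are typically computed — the helper name is illustrative, not the repo's actual function:

```python
import hashlib

def get_md5_and_size(file_path: str, chunk_size: int = 10_000_000) -> tuple[str, int]:
    """Stream the file in chunks so multi-GB archives never fully load into memory."""
    md5, size = hashlib.md5(), 0
    with open(file_path, "rb") as file:
        while chunk := file.read(chunk_size):
            size += len(chunk)
            md5.update(chunk)
    return md5.hexdigest(), size
```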
51 changes: 41 additions & 10 deletions site/src/routes/contribute/+page.md
@@ -1,5 +1,32 @@
<script>
import { name, repository as repo, homepage } from "$site/package.json";
import { name, repository as repo, homepage } from "$site/package.json"
import figshare_urls from "$root/data/figshare/1.0.0.json"
import { Tooltip } from 'svelte-zoo'

const ppd_doc_url = `https://github.com/materialsproject/pymatgen/blob/v2023.5.10/pymatgen/analysis/phase_diagram.py#L1480-L1814`
const ppd_link = `<a href=${ppd_doc_url}>PatchedPhaseDiagram</a>`
const cse_doc_url = `https://github.com/materialsproject/pymatgen/blob/v2023.5.10/pymatgen/entries/computed_entries.py#L579-L722`
const cse_link = `<a href=${cse_doc_url}>ComputedStructureEntry</a>`

const descriptions = {
mp_computed_structure_entries:
`JSON-serialized MP ${cse_link} objects containing relaxed structures and DFT final energies`,
mp_elemental_ref_entries: `Minimum energy ComputedEntry for each element in MP`,
mp_energies: `Materials Project formation energies and energies above convex hull`,
mp_patched_phase_diagram:
`${ppd_link} constructed from all MP ComputedStructureEntries`,
wbm_computed_structure_entries: `DFT-relaxed WBM structures along with their final VASP energies, stored as ${cse_link}`,
wbm_initial_structures: `Unrelaxed WBM structures`,
wbm_cses_plus_init_structs: `Both unrelaxed and DFT-relaxed WBM structures, the latter stored with their final VASP energies as ${cse_link}`,
wbm_summary:
`Computed material properties only, no structures. Available properties are VASP energy, formation energy, energy above the convex hull, volume, band gap, number of sites per unit cell, and more.`,
}
const desc_keys = Object.keys(descriptions).sort()
const figshare_keys = Object.keys(figshare_urls).sort()
const missing = figshare_keys.filter((key) => !desc_keys.includes(key))
if (missing.length > 0) {
console.error(`descriptions must contain all figshare_urls keys, missing=${missing}`)
}
</script>

# How to contribute
@@ -75,15 +102,19 @@ assert list(df_wbm) == [

## 📥 &thinsp; Direct Download

You can also download the data files directly from GitHub:

1. [`2022-10-19-wbm-summary.csv`]({repo}/blob/-/data/wbm/2022-10-19-wbm-summary.csv): Computed material properties only, no structures. Available properties are VASP energy, formation energy, energy above the convex hull, volume, band gap, number of sites per unit cell, and more.
1. [`2022-10-19-wbm-init-structs.json`]({repo}/blob/-/data/wbm/2022-10-19-wbm-init-structs.json): Unrelaxed WBM structures
1. [`2022-10-19-wbm-cses.json`]({repo}/blob/-/data/wbm/2022-10-19-wbm-cses.json): Relaxed WBM structures along with final VASP energies
1. [`2023-01-10-mp-energies.json.gz`]({repo}/blob/-/data/mp/2023-01-10-mp-energies.json.gz): Materials Project formation energies and energies above convex hull
1. [`2023-02-07-mp-computed-structure-entries.json.gz`]({repo}/blob/-/data/mp/2023-02-07-mp-computed-structure-entries.json.gz): Materials Project computed structure entries
1. [`2023-02-07-ppd-mp.pkl.gz`]({repo}/blob/-/data/mp/2023-02-07-ppd-mp.pkl.gz): [PatchedPhaseDiagram](https://pymatgen.org/pymatgen.analysis.phase_diagram.html#pymatgen.analysis.phase_diagram.PatchedPhaseDiagram) constructed from all MP ComputedStructureEntries
1. [`2023-02-07-mp-elemental-reference-entries.json.gz`]({repo}/blob/-/data/mp/2023-02-07-mp-elemental-reference-entries.json.gz): Minimum energy PDEntries for each element present in the Materials Project
You can also download the data files directly from Figshare:

<ol>
{#each Object.entries(figshare_urls) as [key, lst]}
{@const [href, file_name] = lst}
<li>
<Tooltip text={file_name}>
<a {href}>{key}</a>:
</Tooltip>
{@html descriptions[key]}
</li>
{/each}
</ol>

[wbm paper]: https://nature.com/articles/s41524-020-00481-6

4 changes: 2 additions & 2 deletions tests/test_data.py
@@ -67,7 +67,7 @@ def test_load(

stdout, _stderr = capsys.readouterr()

assert f"Downloading {data_key!r} from {figshare_urls[data_key]}" in stdout
assert f"Downloading {data_key!r} from {figshare_urls[data_key][0]}" in stdout

# check we called read_csv/read_json once for each data_name
assert urlretrieve.call_count == 1
@@ -172,7 +172,7 @@ def test_load_no_mock(
rel_path = getattr(type(DATA_FILES), file_key)
cache_path = f"{tmp_path}/{rel_path}"
assert (
f"Downloading {file_key!r} from {figshare_urls[file_key]}\nCached "
f"Downloading {file_key!r} from {figshare_urls[file_key][0]}\nCached "
f"{file_key!r} to {cache_path!r}" in stdout
)

Expand Down
