Skip to content

Commit

Permalink
Download and split non-English Wiktionary JSON files from kaikki.org
Browse files Browse the repository at this point in the history
  • Loading branch information
xxyzz committed Jan 2, 2024
1 parent 9e72a74 commit 3726687
Show file tree
Hide file tree
Showing 6 changed files with 28 additions and 41 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ Wiktionary data come from kaikki.org and [Dbnary](https://kaiko.getalp.org/about

- [oxigraph](https://github.com/oxigraph/oxigraph)

- pigz or gzip

## Create files

```
Expand Down
3 changes: 1 addition & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "Proficiency"
version = "0.5.11"
version = "0.5.12"
authors = [
{name = "xxyzz"}
]
Expand All @@ -28,7 +28,6 @@ dev = [

[project.scripts]
proficiency = "proficiency.main:main"
split_jsonl = "proficiency.split_jsonl:main"

[tool.setuptools]
zip-safe = false
Expand Down
29 changes: 17 additions & 12 deletions src/proficiency/extract_kaikki.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,42 +59,47 @@ def download_kaikki_json(lang: str) -> Path:
return filepath


def download_kaikki_non_en_json(gloss_lang: str) -> Path:
    """Download a non-English Wiktionary extract from kaikki.org and split it.

    Fetches ``{gloss_lang}-extract.json.gz`` from kaikki.org into ``build/``,
    decompresses it (preferring the parallel ``pigz`` when available), then
    splits the combined JSONL file into one file per lemma language.

    :param gloss_lang: Wiktionary edition language code (e.g. ``"de"``).
    :return: path to the decompressed ``build/{gloss_lang}-extract.json``.
    :raises subprocess.CalledProcessError: if ``wget`` or the decompressor fails.
    """
    # Local import to avoid a circular dependency between this module and
    # split_jsonl at import time.
    from .split_jsonl import split_kaikki_non_en_jsonl

    jsonl_path = Path(f"build/{gloss_lang}-extract.json")
    # with_suffix replaces the final ".json" with ".json.gz".
    gz_path = jsonl_path.with_suffix(".json.gz")
    if not gz_path.exists() and not jsonl_path.exists():
        # Neither the archive nor the extracted file is present: download.
        subprocess.run(
            [
                "wget",
                "-nv",
                "-P",
                "build",
                f"https://kaikki.org/dictionary/downloads/{gloss_lang}/{gloss_lang}-extract.json.gz",
            ],
            check=True,
            capture_output=True,
            text=True,
        )
    if gz_path.exists() and not jsonl_path.exists():
        # Decompress in place; pigz is a parallel drop-in gzip replacement.
        subprocess.run(
            [
                "pigz" if which("pigz") is not None else "gzip",
                "-d",
                str(gz_path),
            ],
            check=True,
            capture_output=True,
            text=True,
        )
        # Split the freshly decompressed file into per-lemma-language files.
        # NOTE(review): placed inside this branch so splitting only happens
        # after a fresh decompression — confirm against upstream commit.
        split_kaikki_non_en_jsonl(jsonl_path, gloss_lang)

    return jsonl_path


def load_data(lemma_lang: str, gloss_lang: str) -> tuple[Path, dict[str, int]]:
    """Resolve the kaikki JSONL path and difficulty data for a language pair.

    English-gloss data is downloaded per lemma language; for non-English
    glosses the per-language file is expected to already exist in ``build/``
    (produced by splitting the full kaikki extract).

    :param lemma_lang: lemma language code; ``"hr"`` is mapped to ``"sh"``
        because kaikki uses the Serbo-Croatian code.
    :param gloss_lang: gloss (Wiktionary edition) language code.
    :return: tuple of (path to the kaikki JSONL file, difficulty data dict).
    """
    if gloss_lang == "en":
        kaikki_json_path = download_kaikki_json(lemma_lang)
    else:
        if lemma_lang == "hr":
            # kaikki files use the Serbo-Croatian ("sh") code for Croatian.
            lemma_lang = "sh"
        kaikki_json_path = Path(f"build/{lemma_lang}/{lemma_lang}_{gloss_lang}.jsonl")

    difficulty_data = load_difficulty_data(lemma_lang)
    return kaikki_json_path, difficulty_data
Expand Down
2 changes: 1 addition & 1 deletion src/proficiency/extract_kindle_lemmas.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def create_kindle_lemmas_db(db_path: Path) -> None:
enabled_sense_ids: set[int] = {data[1] for data in enabled_lemmas.values()}
conn = init_db(db_path, "en", True, False)

with (files("proficiency") / "en" / "kindle_all_lemmas.csv").open(
with (files("proficiency") / "en" / "kindle_all_lemmas.csv").open( # type: ignore
newline="", encoding="utf-8"
) as f:
csv_reader = csv.reader(f)
Expand Down
4 changes: 3 additions & 1 deletion src/proficiency/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
download_dbnary_files,
init_oxigraph_store,
)
from .extract_kaikki import create_lemmas_db_from_kaikki
from .extract_kaikki import create_lemmas_db_from_kaikki, download_kaikki_non_en_json
from .extract_kindle_lemmas import create_kindle_lemmas_db

VERSION = version("proficiency")
Expand Down Expand Up @@ -118,6 +118,8 @@ def main() -> None:
with ProcessPoolExecutor() as executor:
logging.info("Creating Wiktionary files")
if args.gloss_lang in WIKITEXTRACT_LANGUAGES:
if args.gloss_lang != "en":
download_kaikki_non_en_json(args.gloss_lang)
for _ in executor.map(
partial(
create_wiktionary_files_from_kaikki, gloss_lang=args.gloss_lang
Expand Down
29 changes: 4 additions & 25 deletions src/proficiency/split_jsonl.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,12 @@
import argparse
import json
import subprocess
from importlib.resources import files
from pathlib import Path
from shutil import which


def main() -> None:
def split_kaikki_non_en_jsonl(jsonl_path: Path, gloss_code: str) -> None:
"""
Split extracted jsonl file created by wiktextract to each language file.
"""
parser = argparse.ArgumentParser()
parser.add_argument("jsonl_path", type=Path)
parser.add_argument("gloss_code")
args = parser.parse_args()

with (files("proficiency") / "data" / "kaikki_languages.json").open(
encoding="utf-8"
) as f:
Expand All @@ -24,7 +16,7 @@ def main() -> None:
lang_codes["sh"] = "Serbo-Croatian"

out_file_paths = {
lemma_code: Path(f"build/{lemma_code}/{lemma_code}_{args.gloss_code}.jsonl")
lemma_code: Path(f"build/{lemma_code}/{lemma_code}_{gloss_code}.jsonl")
for lemma_code in lang_codes
}
for out_file_path in out_file_paths.values():
Expand All @@ -34,7 +26,7 @@ def main() -> None:
for lemma_code, out_file_path in zip(lang_codes.keys(), out_file_paths.values())
}

with args.jsonl_path.open(encoding="utf-8") as jsonl_f:
with jsonl_path.open(encoding="utf-8") as jsonl_f:
for line in jsonl_f:
data = json.loads(line)
if "lang_code" in data:
Expand All @@ -45,18 +37,5 @@ def main() -> None:
for out_f in out_files.values():
out_f.write(line)

for lemma_code, out_f in out_files.items():
for out_f in out_files.values():
out_f.close()
jsonl_path = out_file_paths[lemma_code]
jsonl_bz2_path = jsonl_path.with_suffix(".jsonl.bz2")
jsonl_bz2_path.unlink(missing_ok=True)
subprocess.run(
["lbzip2" if which("lbzip2") is not None else "bzip2", str(jsonl_path)],
check=True,
capture_output=True,
text=True,
)


if __name__ == "__main__":
main()

0 comments on commit 3726687

Please sign in to comment.