Skip to content

Commit

Permalink
Download and split non-English Wiktionary JSON files from kaikki.org
Browse files Browse the repository at this point in the history
  • Loading branch information
xxyzz committed Jan 2, 2024
1 parent 9e72a74 commit 3726687
Show file tree
Hide file tree
Showing 6 changed files with 28 additions and 41 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ Wiktionary data come from kaikki.org and [Dbnary](https://kaiko.getalp.org/about

- [oxigraph](https://github.com/oxigraph/oxigraph)

- pigz or gzip

## Create files

```
Expand Down
3 changes: 1 addition & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "Proficiency"
version = "0.5.11"
version = "0.5.12"
authors = [
{name = "xxyzz"}
]
Expand All @@ -28,7 +28,6 @@ dev = [

[project.scripts]
proficiency = "proficiency.main:main"
split_jsonl = "proficiency.split_jsonl:main"

[tool.setuptools]
zip-safe = false
Expand Down
29 changes: 17 additions & 12 deletions src/proficiency/extract_kaikki.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,42 +59,47 @@ def download_kaikki_json(lang: str) -> Path:
return filepath


def download_kaikki_non_en_json(gloss_lang: str) -> Path:
    """Download a non-English Wiktionary extract from kaikki.org and split it.

    Fetches ``{gloss_lang}-extract.json.gz`` from kaikki.org into ``build/``,
    decompresses it (preferring the parallel ``pigz`` when available), then
    splits the combined JSONL file into one file per lemma language.

    :param gloss_lang: Wiktionary edition language code (e.g. ``"de"``).
    :return: path to the decompressed ``build/{gloss_lang}-extract.json``.
    :raises subprocess.CalledProcessError: if ``wget`` or the decompressor fails.
    """
    # Local import to avoid a circular dependency between this module and
    # split_jsonl at import time.
    from .split_jsonl import split_kaikki_non_en_jsonl

    jsonl_path = Path(f"build/{gloss_lang}-extract.json")
    # with_suffix replaces the final ".json" with ".json.gz".
    gz_path = jsonl_path.with_suffix(".json.gz")
    if not gz_path.exists() and not jsonl_path.exists():
        # Neither the archive nor the extracted file is present: download.
        subprocess.run(
            [
                "wget",
                "-nv",
                "-P",
                "build",
                f"https://kaikki.org/dictionary/downloads/{gloss_lang}/{gloss_lang}-extract.json.gz",
            ],
            check=True,
            capture_output=True,
            text=True,
        )
    if gz_path.exists() and not jsonl_path.exists():
        # Decompress in place; pigz is a parallel drop-in gzip replacement.
        subprocess.run(
            [
                "pigz" if which("pigz") is not None else "gzip",
                "-d",
                str(gz_path),
            ],
            check=True,
            capture_output=True,
            text=True,
        )
        # Split the freshly decompressed file into per-lemma-language files.
        # NOTE(review): placed inside this branch so splitting only happens
        # after a fresh decompression — confirm against upstream commit.
        split_kaikki_non_en_jsonl(jsonl_path, gloss_lang)

    return jsonl_path


def load_data(lemma_lang: str, gloss_lang: str) -> tuple[Path, dict[str, int]]:
    """Resolve the kaikki JSONL path and difficulty data for a language pair.

    English-gloss data is downloaded per lemma language; for non-English
    glosses the per-language file is expected to already exist in ``build/``
    (produced by splitting the full kaikki extract).

    :param lemma_lang: lemma language code; ``"hr"`` is mapped to ``"sh"``
        because kaikki uses the Serbo-Croatian code.
    :param gloss_lang: gloss (Wiktionary edition) language code.
    :return: tuple of (path to the kaikki JSONL file, difficulty data dict).
    """
    if gloss_lang == "en":
        kaikki_json_path = download_kaikki_json(lemma_lang)
    else:
        if lemma_lang == "hr":
            # kaikki files use the Serbo-Croatian ("sh") code for Croatian.
            lemma_lang = "sh"
        kaikki_json_path = Path(f"build/{lemma_lang}/{lemma_lang}_{gloss_lang}.jsonl")

    difficulty_data = load_difficulty_data(lemma_lang)
    return kaikki_json_path, difficulty_data
Expand Down
2 changes: 1 addition & 1 deletion src/proficiency/extract_kindle_lemmas.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def create_kindle_lemmas_db(db_path: Path) -> None:
enabled_sense_ids: set[int] = {data[1] for data in enabled_lemmas.values()}
conn = init_db(db_path, "en", True, False)

with (files("proficiency") / "en" / "kindle_all_lemmas.csv").open(
with (files("proficiency") / "en" / "kindle_all_lemmas.csv").open( # type: ignore
newline="", encoding="utf-8"
) as f:
csv_reader = csv.reader(f)
Expand Down
4 changes: 3 additions & 1 deletion src/proficiency/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
download_dbnary_files,
init_oxigraph_store,
)
from .extract_kaikki import create_lemmas_db_from_kaikki
from .extract_kaikki import create_lemmas_db_from_kaikki, download_kaikki_non_en_json
from .extract_kindle_lemmas import create_kindle_lemmas_db

VERSION = version("proficiency")
Expand Down Expand Up @@ -118,6 +118,8 @@ def main() -> None:
with ProcessPoolExecutor() as executor:
logging.info("Creating Wiktionary files")
if args.gloss_lang in WIKITEXTRACT_LANGUAGES:
if args.gloss_lang != "en":
download_kaikki_non_en_json(args.gloss_lang)
for _ in executor.map(
partial(
create_wiktionary_files_from_kaikki, gloss_lang=args.gloss_lang
Expand Down
29 changes: 4 additions & 25 deletions src/proficiency/split_jsonl.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,12 @@
import argparse
import json
import subprocess
from importlib.resources import files
from pathlib import Path
from shutil import which


def main() -> None:
def split_kaikki_non_en_jsonl(jsonl_path: Path, gloss_code: str) -> None:
"""
Split extracted jsonl file created by wiktextract to each language file.
"""
parser = argparse.ArgumentParser()
parser.add_argument("jsonl_path", type=Path)
parser.add_argument("gloss_code")
args = parser.parse_args()

with (files("proficiency") / "data" / "kaikki_languages.json").open(
encoding="utf-8"
) as f:
Expand All @@ -24,7 +16,7 @@ def main() -> None:
lang_codes["sh"] = "Serbo-Croatian"

out_file_paths = {
lemma_code: Path(f"build/{lemma_code}/{lemma_code}_{args.gloss_code}.jsonl")
lemma_code: Path(f"build/{lemma_code}/{lemma_code}_{gloss_code}.jsonl")
for lemma_code in lang_codes
}
for out_file_path in out_file_paths.values():
Expand All @@ -34,7 +26,7 @@ def main() -> None:
for lemma_code, out_file_path in zip(lang_codes.keys(), out_file_paths.values())
}

with args.jsonl_path.open(encoding="utf-8") as jsonl_f:
with jsonl_path.open(encoding="utf-8") as jsonl_f:
for line in jsonl_f:
data = json.loads(line)
if "lang_code" in data:
Expand All @@ -45,18 +37,5 @@ def main() -> None:
for out_f in out_files.values():
out_f.write(line)

for lemma_code, out_f in out_files.items():
for out_f in out_files.values():
out_f.close()
jsonl_path = out_file_paths[lemma_code]
jsonl_bz2_path = jsonl_path.with_suffix(".jsonl.bz2")
jsonl_bz2_path.unlink(missing_ok=True)
subprocess.run(
["lbzip2" if which("lbzip2") is not None else "bzip2", str(jsonl_path)],
check=True,
capture_output=True,
text=True,
)


if __name__ == "__main__":
main()

0 comments on commit 3726687

Please sign in to comment.