Add Kindle lemma db to all "en_en" and "en_zh" tar files
xxyzz committed Sep 13, 2024
1 parent f1e808d commit d563acc
Showing 2 changed files with 36 additions and 38 deletions.
13 changes: 9 additions & 4 deletions src/proficiency/create_klld.py
@@ -89,14 +89,18 @@ def create_klld_tables(
     conn.executemany("INSERT INTO metadata VALUES(?, ?)", metadata.items())


-def create_klld_db(
-    wiktionary_db_path: Path, klld_path: Path, lemma_lang: str, gloss_lang: str
-) -> None:
+def create_klld_db(gloss_lang: str, lemma_lang: str) -> Path:
+    from .database import wiktionary_db_path
+    from .main import MAJOR_VERSION
+
+    klld_path = Path(
+        f"build/{lemma_lang}/kll.{lemma_lang}.{gloss_lang}_v{MAJOR_VERSION}.klld"
+    )
     if klld_path.exists():
         klld_path.unlink()

     klld_conn = sqlite3.connect(klld_path)
-    wiktionary_conn = sqlite3.connect(wiktionary_db_path)
+    wiktionary_conn = sqlite3.connect(wiktionary_db_path(lemma_lang, gloss_lang))
     create_klld_tables(klld_conn, lemma_lang, gloss_lang)

     for data in wiktionary_conn.execute("SELECT id, lemma FROM lemmas"):
@@ -152,6 +156,7 @@ def create_klld_db(
     klld_conn.commit()
     klld_conn.close()
     wiktionary_conn.close()
+    return klld_path


 def remove_rtl_pdi(text: str) -> str:
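After this refactor, create_klld_db resolves both the Wiktionary database path and the klld output path itself and returns the path it wrote, so callers only pass the two language codes. A minimal sketch of the new call site, assuming the package is importable as proficiency and that the build/ directory and the Wiktionary database from the earlier pipeline steps already exist:

from pathlib import Path

from proficiency.create_klld import create_klld_db

# Only the language codes are needed now; the function looks up the
# Wiktionary database via wiktionary_db_path(), deletes any stale klld
# file, and writes build/en/kll.en.en_v{MAJOR_VERSION}.klld.
klld_path: Path = create_klld_db(gloss_lang="en", lemma_lang="en")
print(klld_path)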
61 changes: 27 additions & 34 deletions src/proficiency/main.py
Expand Up @@ -10,7 +10,6 @@
from pathlib import Path

from .create_klld import create_klld_db
from .database import wiktionary_db_path
from .extract_dbnary import (
create_lemmas_db_from_dbnary,
download_dbnary_files,
@@ -51,26 +50,6 @@ def create_wiktionary_files_from_dbnary(
     return db_paths


-def create_kindle_files(lemma_lang: str, gloss_lang: str) -> list[Path]:
-    db_paths = []
-    if lemma_lang == "en" and gloss_lang == "en":
-        db_path = Path(f"build/en/kindle_en_en_v{MAJOR_VERSION}.db")
-        create_kindle_lemmas_db(db_path)
-        db_paths.append(db_path)
-
-    klld_path = Path(
-        f"build/{lemma_lang}/kll.{lemma_lang}.{gloss_lang}_v{MAJOR_VERSION}.klld"
-    )
-    create_klld_db(
-        wiktionary_db_path(lemma_lang, gloss_lang),
-        klld_path,
-        lemma_lang,
-        gloss_lang,
-    )
-    db_paths.append(klld_path)
-    return db_paths
-
-
 def main() -> None:
     logging.basicConfig(
         format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO
Expand Down Expand Up @@ -131,37 +110,51 @@ def main() -> None:
logging.info("Wiktionary files created")

logging.info("Creating Kindle files")
kindle_db_path = Path()
if "en" in args.lemma_lang_codes and args.gloss_lang in ["en", "zh"]:
kindle_db_path = Path(f"build/en/kindle_en_en_v{MAJOR_VERSION}.db")
create_kindle_lemmas_db(kindle_db_path)

no_zh_cn_paths = file_paths.copy()
for db_paths in executor.map(
partial(create_kindle_files, gloss_lang=args.gloss_lang),
for db_path in executor.map(
partial(create_klld_db, args.gloss_lang),
args.lemma_lang_codes,
):
no_zh_cn_paths.extend(db_paths)
archive_files(no_zh_cn_paths)
no_zh_cn_paths.append(db_path)
archive_files(no_zh_cn_paths, kindle_db_path)
if args.gloss_lang == "zh":
for db_paths in executor.map(
partial(create_kindle_files, gloss_lang="zh_cn"),
for db_path in executor.map(
partial(create_klld_db, "zh_cn"),
args.lemma_lang_codes,
):
file_paths.extend(db_paths)
archive_files(file_paths)
file_paths.append(db_path)
archive_files(file_paths, kindle_db_path, True)
logging.info("Kindle files created")


-def archive_files(file_paths: list[Path]) -> None:
+def archive_files(
+    file_paths: list[Path], kindle_db_path: Path, is_zh_cn: bool = False
+) -> None:
     grouped_paths = defaultdict(list)
+    lemma_code = ""
+    gloss_code = ""
     for path in file_paths:
+        if (is_zh_cn and "zh_cn" not in path.name) or (
+            not is_zh_cn and "zh_cn" in path.name
+        ):
+            continue
         _, lemma_code, gloss_code, _ = re.split(r"\.|_", path.name, 3)
-        grouped_paths[f"{lemma_code}_{gloss_code}"].append(path)
-    for tar_name, paths in grouped_paths.items():
-        if "zh_cn" in paths[-1].name:
+        tar_name = f"{lemma_code}_{gloss_code}"
+        if is_zh_cn:
             tar_name += "_cn"
-        tar_path = f"build/{tar_name}.tar.bz2"
+        grouped_paths[tar_name].append(path)
+    for tar_name, paths in grouped_paths.items():
+        tar_path = Path(f"build/{tar_name}.tar.bz2")
         with tarfile.open(name=tar_path, mode="x:bz2") as tar_f:
             for path in paths:
                 tar_f.add(path, path.name)
+            if tar_name.startswith(("en_en", "en_zh")):
+                tar_f.add(kindle_db_path)


if __name__ == "__main__":
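Taken together, this is how the commit title is realized: main() builds the Kindle lemma database once, collects the per-language klld paths returned by create_klld_db, and archive_files groups them into build/<lemma>_<gloss>.tar.bz2 archives (with a "_cn" suffix when is_zh_cn is set) while adding the Kindle lemma database to every archive whose name starts with "en_en" or "en_zh". A sketch of the call pattern, with hypothetical file names and version suffix:

from pathlib import Path

from proficiency.main import archive_files  # assumes the installed package name

# Hypothetical build outputs; real names come from earlier pipeline steps.
file_paths = [
    Path("build/en/kll.en.zh_v3.klld"),     # English lemmas, Chinese glosses
    Path("build/en/kll.en.zh_cn_v3.klld"),  # Simplified Chinese variant
    Path("build/de/kll.de.zh_v3.klld"),
]
kindle_db_path = Path("build/en/kindle_en_en_v3.db")

# Default run: zh_cn files are skipped, producing en_zh.tar.bz2 and
# de_zh.tar.bz2; the Kindle lemma db is added only to en_zh.tar.bz2.
archive_files(file_paths, kindle_db_path)

# zh_cn run: only zh_cn files are kept and "_cn" is appended to the
# archive name, so en_zh_cn.tar.bz2 is created and also receives the
# Kindle lemma db because its name starts with "en_zh".
archive_files(file_paths, kindle_db_path, True)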
