diff --git a/simplemma/strategies/defaultrules/__init__.py b/simplemma/strategies/defaultrules/__init__.py
index 2107dfd..2e72cb3 100644
--- a/simplemma/strategies/defaultrules/__init__.py
+++ b/simplemma/strategies/defaultrules/__init__.py
@@ -6,6 +6,7 @@
 from .en import apply_en
 from .et import apply_et
 from .fi import apply_fi
+from .lv import apply_lv
 from .nl import apply_nl
 from .pl import apply_pl
 from .ru import apply_ru
@@ -15,6 +16,7 @@
     "en": apply_en,
     "et": apply_et,
     "fi": apply_fi,
+    "lv": apply_lv,
     "nl": apply_nl,
     "pl": apply_pl,
     "ru": apply_ru,
diff --git a/simplemma/strategies/defaultrules/lv.py b/simplemma/strategies/defaultrules/lv.py
new file mode 100644
index 0000000..883b652
--- /dev/null
+++ b/simplemma/strategies/defaultrules/lv.py
@@ -0,0 +1,53 @@
+import re
+from typing import Optional
+
+from .generic import apply_rules
+
+# https://en.wiktionary.org/wiki/Category:Latvian_suffixes
+
+# Each compiled pattern matches a set of inflectional endings and maps to the
+# suffix of the lemma. Specific suffixes come before the generic fallbacks and
+# ambiguous endings (e.g. "āju") are resolved by whichever entry is listed
+# first -- this assumes apply_rules stops at the first match (see generic.py).
+DEFAULT_RULES = {
+    # feminine nouns
+    re.compile(r"(?:āju|ājas|ājai|ājām|ājās)$"): "āja",
+    re.compile(r"(?:ēju|ējas|ējai|ējām|ējās)$"): "ēja",
+    re.compile(r"(?:ieci|ieces|iecei|iecē|ieču|iecēm|iecēs)$"): "iece",
+    re.compile(r"(?:ieti|ietes|ietei|ietē|iešu|ietēm|ietēs)$"): "iete",
+    re.compile(r"(?:iju|ijas|ijai|ijām|ijās)$"): "ija",
+    re.compile(r"(?:ību|ības|ībai|ībām|ībās)$"): "ība",
+    re.compile(r"(?:īgu|īga|īgam|īgi|īgus|īgiem|īgos|īgas|īgai|īgā|īgām|īgās)$"): "īgs",
+    re.compile(r"(?:īva|īvu|īvam|īvas|īvai|īvus|īviem|īvos|īvā|īvām|īvās)$"): "īvs",
+    re.compile(r"(?:šanu|šanas|šanai|šanā|šanām|šanās)$"): "šana",
+    re.compile(r"(?:umu|uma|umam|umā|umām|umās)$"): "ums",  # |um
+    # masculine nouns
+    re.compile(r"(?:āju|āja|ājam|āj|āji|ājus|ājiem|ājos)$"): "ājs",
+    re.compile(r"(?:iņu|iņa|iņam|iņ|iņi|iņus|iņiem|iņos)$"): "iņš",
+    re.compile(
+        r"(?:isku|iska|iskam|iskā|iski|iskus|iskiem|isko|iskos|iskai|iskas|iskām|iskās)$"
+    ): "isks",
+    re.compile(r"(?:ismu|isma|ismam|ismā|ismi|ism)$"): "isms",
+    re.compile(r"(?:īti|īša|ītim|ītī|īt|īši|īšus|īšu|īšiem|īšos)$"): "ītis",
+    re.compile(r"(?:kli|kļa|klim|klī|kļi|kļus|kļiem|kļos)$"): "klis",
+    re.compile(r"(?:nieku|nieka|niekam|niekā|nieki|niekus|niekiem|niekos)$"): "nieks",
+    re.compile(r"(?:ni|ņa|nim|nī|ņi|ņus|ņu|ņiem|ņos)$"): "nis",
+    # fallback
+    re.compile(r"(?:as|ai|ā|ām|ās)$"): "a",
+    re.compile(r"(?:ei|es|ē|ēm|ēs)$"): "e",
+    re.compile(r"(?:is|im|ī|iem|īs)$"): "is",
+    # re.compile(r"(?:os|us)$"): "s",
+    # re.compile(r"(?:ēto|ēts)$"): "ēt",
+}
+
+
+def apply_lv(token: str) -> Optional[str]:
+    """Apply pre-defined rules for Latvian.
+
+    Return ``None`` for tokens shorter than five characters; otherwise
+    return whatever ``apply_rules`` yields for ``DEFAULT_RULES``.
+    """
+    if len(token) < 5:
+        return None
+
+    return apply_rules(token, DEFAULT_RULES)
diff --git a/simplemma/strategies/dictionaries/data/lv.plzma b/simplemma/strategies/dictionaries/data/lv.plzma
index 7298d8c..f9702c4 100644
Binary files a/simplemma/strategies/dictionaries/data/lv.plzma and b/simplemma/strategies/dictionaries/data/lv.plzma differ
diff --git a/simplemma/strategies/greedy_dictionary_lookup.py b/simplemma/strategies/greedy_dictionary_lookup.py
index 2d52244..fea33da 100644
--- a/simplemma/strategies/greedy_dictionary_lookup.py
+++ b/simplemma/strategies/greedy_dictionary_lookup.py
@@ -7,7 +7,7 @@
 from .dictionaries.dictionary_factory import DefaultDictionaryFactory, DictionaryFactory
 from .lemmatization_strategy import LemmatizationStrategy
 
-SHORTER_GREEDY = {"bg", "et", "fi"}
+SHORTER_GREEDY = {"bg", "et", "fi", "lv"}
 
 
 class GreedyDictionaryLookupStrategy(LemmatizationStrategy):
diff --git a/tests/strategies/defaultrules/test_rules.py b/tests/strategies/defaultrules/test_rules.py
index bc6c61f..0e4b607 100644
--- a/tests/strategies/defaultrules/test_rules.py
+++ b/tests/strategies/defaultrules/test_rules.py
@@ -19,4 +19,9 @@ def test_DEFAULT_RULES() -> None:
 
     assert rules_strategy.get_lemma("безгра́мотностью", "ru") == "безгра́мотность"
 
+    assert rules_strategy.get_lemma("Rīga", "lv") is None
+    assert rules_strategy.get_lemma("šķirkļiem", "lv") == "šķirklis"
+    assert rules_strategy.get_lemma("mācībām", "lv") == "mācība"
+    assert rules_strategy.get_lemma("skolotājam", "lv") == "skolotājs"
+
     assert rules_strategy.get_lemma("keelkondade", "et") == "keelkond"
diff --git a/tests/test_dictionary_pickler.py b/tests/test_dictionary_pickler.py
index 2fc806f..333484c 100644
--- a/tests/test_dictionary_pickler.py
+++ b/tests/test_dictionary_pickler.py
@@ -20,6 +20,7 @@ def test_logic() -> None:
     # log warning
     mydict = dictionary_pickler._read_dict(testfile, "zz", silent=False)
     assert len(mydict) == 3
+
     # different length
     mydict = dictionary_pickler._read_dict(testfile, "en", silent=True)
     assert len(mydict) == 5
diff --git a/training/dictionary_pickler.py b/training/dictionary_pickler.py
index 15345d1..a1eed9f 100644
--- a/training/dictionary_pickler.py
+++ b/training/dictionary_pickler.py
@@ -19,7 +19,7 @@
 
 LOGGER = logging.getLogger(__name__)
 
-INPUT_PUNCT = re.compile(r"[,:*/\+_]|^-|-\t")
+INPUT_PUNCT = re.compile(r"[,:*/\+_]|.+-$|.+-\t|^-.+")
 SAFE_LIMIT = {
     "cs",
     "da",
@@ -31,6 +31,7 @@
     "ga",
     "hu",
     "it",
+    "lv",
     "pl",
     "pt",
     "ru",
@@ -86,7 +87,7 @@ def _read_dict(
                 and columns[1] != columns[0]
             ):
                 rule = DEFAULT_RULES[langcode](columns[1])
-                if rule is not None and rule != columns[1]:
+                if rule and rule != columns[0]:
                     print(columns[1], columns[0], rule)
             # process
             if columns[1] in mydict and mydict[columns[1]] != columns[0]: