diff --git a/plugins/lute-khmer/.pytest.ini b/plugins/lute-khmer/.pytest.ini new file mode 100644 index 00000000..8b098e6f --- /dev/null +++ b/plugins/lute-khmer/.pytest.ini @@ -0,0 +1,10 @@ +[pytest] +testpaths = + tests + +# Acceptance tests were raising FutureWarning: +# FutureWarning: Deleting all cookies via CookieManager.delete() +# with no arguments has been deprecated. use CookieManager.delete_all(). +# This is internal to the package, so stopping that. +filterwarnings = + ignore::FutureWarning diff --git a/plugins/lute-khmer/README.md b/plugins/lute-khmer/README.md new file mode 100644 index 00000000..c7687741 --- /dev/null +++ b/plugins/lute-khmer/README.md @@ -0,0 +1,7 @@ +# README + +The Lute Khmer parser. + +See [the wiki](https://github.com/LuteOrg/lute-v3/wiki/Developing-language-parser-plugins) for development notes. + +See the [Pypi readme](./README_PyPi.md) for extra config notes. diff --git a/plugins/lute-khmer/README_PyPi.md b/plugins/lute-khmer/README_PyPi.md new file mode 100644 index 00000000..590eb9bd --- /dev/null +++ b/plugins/lute-khmer/README_PyPi.md @@ -0,0 +1,12 @@ +# `lute3-khmer` + +A Khmer parser for Lute (`lute3`). + +## Installation + +See the [Lute manual](https://luteorg.github.io/lute-manual/install/plugins.html). + +## Usage + +When this parser is installed, you can add "Khmer" as a +language to Lute, which comes with a simple story. 
diff --git a/plugins/lute-khmer/definition.yaml b/plugins/lute-khmer/definition.yaml
new file mode 100644
index 00000000..e3f2c0bd
--- /dev/null
+++ b/plugins/lute-khmer/definition.yaml
@@ -0,0 +1,22 @@
+name: Khmer
+dictionaries:
+  - for: terms
+    type: embedded
+    url: https://en.wiktionary.org/wiki/###
+  - for: terms
+    type: popup
+    url: https://glosbe.com/km/en/###
+  - for: terms
+    type: popup
+    url: https://www.kheng.info/search/?query=###
+  - for: sentences
+    type: embedded
+    url: https://www.bing.com/translator/?from=km&to=en&text=###
+show_romanization: true
+# right_to_left:
+
+parser_type: lute_khmer
+# character_substitutions:
+split_sentences: ។?៕
+# split_sentence_exceptions:
+word_chars: ក-៹
diff --git a/plugins/lute-khmer/lute_khmer_parser/__init__.py b/plugins/lute-khmer/lute_khmer_parser/__init__.py
new file mode 100644
index 00000000..cf5eb49a
--- /dev/null
+++ b/plugins/lute-khmer/lute_khmer_parser/__init__.py
@@ -0,0 +1,5 @@
+"""
+Lute Khmer Parser
+"""
+
+__version__ = "0.0.3"
diff --git a/plugins/lute-khmer/lute_khmer_parser/parser.py b/plugins/lute-khmer/lute_khmer_parser/parser.py
new file mode 100644
index 00000000..035cba5a
--- /dev/null
+++ b/plugins/lute-khmer/lute_khmer_parser/parser.py
@@ -0,0 +1,122 @@
+"""
+Parsing using khmer-nltk
+
+Includes classes:
+
+- KhmerParser
+
+"""
+
+import re
+
+from typing import List
+
+import khmernltk
+
+from lute.parse.base import ParsedToken, AbstractParser
+
+
+class KhmerParser(AbstractParser):
+    """
+    A parser for Khmer.
+    """
+
+    @classmethod
+    def name(cls):
+        return "Lute Khmer"
+
+    @classmethod
+    def uses_data_directory(cls):
+        "Uses the data_directory (defined in the AbstractParser)."
+        return False  # or True
+
+    # @classmethod
+    # def init_data_directory(cls):
+    #     "Set up necessary files."
+    #     pass
+
+    def _handle_special_token(self, token: str, special_char: str) -> List[str]:
+        """
+        Handle special token scenarios by replacing all special tokens with newline characters.
+
+        Example:
+        If \ is the special token then
+        "\hey\man\\\" will evaluate as
+        ["\n", "hey", "\n", "man", "\n", "\n", "\n"]
+        """
+        if token == special_char:
+            return ["\n"]
+
+        num_leading_slashes = len(token) - len(token.lstrip(special_char))
+        num_trailing_slashes = len(token) - len(token.rstrip(special_char))
+        output = []
+
+        # A str is iterable, so extend() adds one "\n" entry per slash.
+        output.extend("\n" * num_leading_slashes)
+
+        tokens = token.strip(special_char).split(special_char)
+
+        if len(tokens) == 1:
+            output.append(tokens[0])
+        else:
+            for token in tokens[:-1]:
+                output.append(token)
+                output.append("\n")
+            output.append(tokens[-1])
+
+        output.extend("\n" * num_trailing_slashes)
+
+        return output
+
+    def word_tokenize(self, text: str) -> List[str]:
+        """
+        Tokenize a text using khmernltk and handle the fact that khmernltk
+        completely omits newline characters by replacing all newline chars with
+        something that khmernltk won't omit.
+
+        NOTE(review): a literal backslash already present in the text will
+        also be treated as a newline -- assumed not to occur in Khmer text.
+        """
+        special_char = "\\"
+        text = text.replace("\n", special_char)
+        output = []
+
+        for token in khmernltk.word_tokenize(text):
+            if special_char in token:
+                output.extend(self._handle_special_token(token, special_char))
+                continue
+            output.append(token)
+        return output
+
+    def get_parsed_tokens(self, text: str, language) -> List[ParsedToken]:
+        """
+        Returns ParsedToken array for given language.
+        """
+
+        # Ensure standard carriage returns so that paragraph
+        # markers are used correctly.  Lute uses paragraph markers
+        # for rendering.
+        text = text.replace("\r\n", "\n")
+        words = self.word_tokenize(text)  # ... get words using parser.
+        pattern = f"[{language.word_characters}]"
+        tokens = []
+        for word in words:
+            is_end_of_sentence = word in language.regexp_split_sentences
+            is_word_char = (not is_end_of_sentence) and (
+                re.match(pattern, word) is not None
+            )
+            if word == "\n":
+                word = "¶"
+            if word == "¶":
+                is_word_char = False
+                is_end_of_sentence = True
+
+            t = ParsedToken(word, is_word_char, is_end_of_sentence)
+            tokens.append(t)
+        return tokens
+
+    def get_reading(self, text: str):
+        """
+        Get reading -- some parsers can return readings.
+        """
+        return None
diff --git a/plugins/lute-khmer/pyproject.toml b/plugins/lute-khmer/pyproject.toml
new file mode 100644
index 00000000..783bcea6
--- /dev/null
+++ b/plugins/lute-khmer/pyproject.toml
@@ -0,0 +1,26 @@
+# TODO fix names
+[build-system]
+requires = ["flit_core >=3.2,<4"]
+build-backend = "flit_core.buildapi"
+
+[tool.flit.module]
+name = "lute_khmer_parser"
+
+[project]
+name = "lute3-khmer"
+dynamic = ['version']
+description = "Learning Using Texts - Khmer Parser"
+requires-python = ">=3.8"
+authors = [
+    {name = "Justin Dom"}
+]
+readme = "README_PyPi.md"
+
+dependencies = [
+    "lute3>=3.4.2",
+    "khmer-nltk==1.6"
+]
+
+
+[project.entry-points."lute.plugin.parse"]
+lute_khmer = "lute_khmer_parser.parser:KhmerParser"
diff --git a/plugins/lute-khmer/requirements.txt b/plugins/lute-khmer/requirements.txt
new file mode 100644
index 00000000..cfb04a4a
--- /dev/null
+++ b/plugins/lute-khmer/requirements.txt
@@ -0,0 +1,5 @@
+# Required dependency for base classes.
+lute3>=3.4.2
+
+# extra requirements here.
+khmer-nltk==1.6
diff --git a/plugins/lute-khmer/tests/__init__.py b/plugins/lute-khmer/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/plugins/lute-khmer/tests/conftest.py b/plugins/lute-khmer/tests/conftest.py
new file mode 100644
index 00000000..1dcbd220
--- /dev/null
+++ b/plugins/lute-khmer/tests/conftest.py
@@ -0,0 +1,36 @@
+"""
+Common fixtures used by many tests.
+"""
+
+import os
+import yaml
+import pytest
+
+
+from lute.parse.registry import init_parser_plugins
+
+from lute.models.language import Language
+
+
+def pytest_sessionstart(session):  # pylint: disable=unused-argument
+    """
+    Initialize parser list
+    """
+    init_parser_plugins()
+
+
+def _get_test_language():
+    """
+    Retrieve the language definition file for testing this plugin from definition.yaml
+    """
+    thisdir = os.path.dirname(os.path.realpath(__file__))
+    definition_file = os.path.join(thisdir, "..", "definition.yaml")
+    with open(definition_file, "r", encoding="utf-8") as df:
+        d = yaml.safe_load(df)
+    lang = Language.from_dict(d)
+    return lang
+
+
+@pytest.fixture(name="khmer")
+def fixture_khmer():
+    return _get_test_language()
diff --git a/plugins/lute-khmer/tests/test_KhmerParser.py b/plugins/lute-khmer/tests/test_KhmerParser.py
new file mode 100644
index 00000000..d885193f
--- /dev/null
+++ b/plugins/lute-khmer/tests/test_KhmerParser.py
@@ -0,0 +1,101 @@
+"""
+KhmerParser tests.
+"""
+
+
+import pytest
+
+# pylint: disable=wrong-import-order
+from lute.models.term import Term
+from lute.parse.base import ParsedToken
+
+from lute_khmer_parser.parser import KhmerParser
+
+
+def test_dummy_test():
+    "A dummy test so that pytest doesn't complain in github ci."
+    s = "Hello"
+    assert s == "Hello", "TODO - fix these tests for your parser :-)"
+
+
+def test_token_count(khmer):
+    """
+    token_count checks.
+    """
+    cases = [
+        ("ជំរាបសួរ", 2),
+        ("ខ្ញុំ", 1),
+        ("ខ្ញុំស្រលាញ់អ្នក។", 4),
+        ("ខ្ញុំរៀនភាសាខ្មែរ", 4),
+        ("ខ្ញុំចូលចិត្តរៀនភាសាខ្មែរជាមួយមិត្តរបស់ខ្ញុំ", 9),
+    ]
+
+    for text, expected_count in cases:
+        t = Term(khmer, text)
+        assert t.token_count == expected_count, text
+        assert t.text_lc == t.text, "case"
+
+
+def assert_tokens_equals(text, lang, expected):
+    """
+    Parsing a text using a language should give the expected parsed tokens.
+
+    expected is given as array of:
+    [ original_text, is_word, is_end_of_sentence ]
+    """
+    p = KhmerParser()
+    actual = p.get_parsed_tokens(text, lang)
+    expected = [ParsedToken(*a) for a in expected]
+    assert [str(a) for a in actual] == [str(e) for e in expected]
+
+
+def test_end_of_sentence_stored_in_parsed_tokens(khmer):
+    """
+    ParsedToken is marked as EOS=True at ends of sentences.
+    """
+    s = "ខ្ញុំចូលចិត្តរៀនភាសាខ្មែរជាមួយមិត្តរបស់ខ្ញុំ។ ចុះអ្នកវិញ?"
+
+    expected = [
+        ("ខ្ញុំ", True),
+        ("ចូលចិត្ត", True),
+        ("រៀន", True),
+        ("ភាសា", True),
+        ("ខ្មែរ", True),
+        ("ជាមួយ", True),
+        ("មិត្ត", True),
+        ("របស់", True),
+        ("ខ្ញុំ", True),
+        ("។", False, True),
+        (" ", False),
+        ("ចុះ", True),
+        ("អ្នក", True),
+        ("វិញ", True),
+        ("?", False, True),
+    ]
+    assert_tokens_equals(s, khmer, expected)
+
+
+def test_carriage_returns_treated_as_reverse_p_character(khmer):
+    """
+    Returns need to be marked with the backwards P for rendering etc.
+    """
+    s = "ខ្ញុំចូលចិត្តរៀនភាសាខ្មែរជាមួយមិត្តរបស់ខ្ញុំ។\nចុះអ្នកវិញ?"
+
+    expected = [
+        ("ខ្ញុំ", True),
+        ("ចូលចិត្ត", True),
+        ("រៀន", True),
+        ("ភាសា", True),
+        ("ខ្មែរ", True),
+        ("ជាមួយ", True),
+        ("មិត្ត", True),
+        ("របស់", True),
+        ("ខ្ញុំ", True),
+        ("។", False, True),
+        ("¶", False, True),
+        ("ចុះ", True),
+        ("អ្នក", True),
+        ("វិញ", True),
+        ("?", False, True),
+    ]
+    assert_tokens_equals(s, khmer, expected)