LuteOrg · jzohrab · Nov 16, 2024 · Nov 9, 2024 · Nov 9, 2024 · Nov 9, 2024
diff --git a/plugins/lute-khmer/.pytest.ini b/plugins/lute-khmer/.pytest.ini
@@ -0,0 +1,10 @@
+[pytest]
+testpaths =
+    tests
+
+# Acceptance tests were raising FutureWarning:
+# FutureWarning: Deleting all cookies via CookieManager.delete()
+# with no arguments has been deprecated. use CookieManager.delete_all().
+# The warning is raised inside the package itself, so it is silenced here.
+filterwarnings =
+    ignore::FutureWarning
diff --git a/plugins/lute-khmer/README.md b/plugins/lute-khmer/README.md
@@ -0,0 +1,7 @@
+# README
+
+The Lute Khmer parser.
+
+See [the wiki](https://github.com/LuteOrg/lute-v3/wiki/Developing-language-parser-plugins) for development notes.
+
+See the [PyPI readme](./README_PyPi.md) for extra config notes.
diff --git a/plugins/lute-khmer/README_PyPi.md b/plugins/lute-khmer/README_PyPi.md
@@ -0,0 +1,12 @@
+# `lute3-khmer`
+
+A Khmer parser for Lute (`lute3`).
+
+## Installation
+
+See the [Lute manual](https://luteorg.github.io/lute-manual/install/plugins.html).
+
+## Usage
+
+When this parser is installed, you can add "Khmer" as a
+language to Lute, which comes with a simple story.
diff --git a/plugins/lute-khmer/definition.yaml b/plugins/lute-khmer/definition.yaml
@@ -0,0 +1,22 @@
+name: Khmer
+dictionaries:
+  - for: terms
+    type: embedded
+    url: https://en.wiktionary.org/wiki/###
+  - for: terms
+    type: popup
+    url: https://glosbe.com/km/en/###
+  - for: terms
+    type: popup
+    url: https://www.kheng.info/search/?query=###
+  # The Khmer language code is "km" (as in the glosbe URL above);
+  # "kh" is the country code for Cambodia, not a language code.
+  - for: sentences
+    type: embedded
+    url: https://www.bing.com/translator/?from=km&to=en&text=###
+show_romanization: true
+# right_to_left:
+
+parser_type: lute_khmer
+# character_substitutions:
+split_sentences: ។?៕
+# split_sentence_exceptions:
+word_chars: ក-៹
diff --git a/plugins/lute-khmer/lute_khmer_parser/__init__.py b/plugins/lute-khmer/lute_khmer_parser/__init__.py
@@ -0,0 +1,5 @@
+"""
+Lute Khmer Parser
+"""
+
+__version__ = "0.0.3"
diff --git a/plugins/lute-khmer/lute_khmer_parser/parser.py b/plugins/lute-khmer/lute_khmer_parser/parser.py
@@ -0,0 +1,86 @@
+"""
+Parsing using khmer-nltk
+
+Includes classes:
+
+- KhmerParser
+
+"""
+
+import re
+
+from typing import List
+
+import khmernltk
+
+from lute.parse.base import ParsedToken, AbstractParser
+
+
+class KhmerParser(AbstractParser):
+    """
+    A parser for KHMER
+    """
+
+    @classmethod
+    def name(cls):
+        return "Lute Khmer"
+
+    @classmethod
+    def uses_data_directory(cls):
+        "Uses the data_directory (defined in the AbstractParser)."
+        return False  # or True
+
+    # @classmethod
+    # def init_data_directory(cls):
+    #     "Set up necessary files."
+    #     pass
+
+    def get_parsed_tokens(self, text: str, language) -> List[ParsedToken]:
+        """
+        Returns ParsedToken array for given language.
+        """
+
+        # Ensure standard carriage returns so that paragraph
+        # markers are used correctly.  Lute uses paragraph markers
+        # for rendering.
+        text = text.replace("\r\n", "\n")
+        text = text.replace("\n", "\\")
+
+        words = khmernltk.word_tokenize(text)  # ... get words using parser.
+        tokens = []
+        pattern = f"[{language.word_characters}]"
+        whitespace_regex = r"[ \t]+"
+        for word in words:
+            is_end_of_sentence = word in language.regexp_split_sentences
+            is_whitespace = re.match(whitespace_regex, word) is not None
+            if is_whitespace:
+                continue
+
+            is_word_char = (not is_end_of_sentence) and (
+                re.match(pattern, word) is not None
+            )
+
+            if word == "\\":
+                word = "¶"
+            if word == "¶":
+                is_word_char = False
+                is_end_of_sentence = True
+
+            if word.startswith("\\"):
+                num_leading_slashes = len(word) - len(word.lstrip("\\"))
+                for _ in range(num_leading_slashes):
+                    tokens.append(ParsedToken("¶", False, True))
+
+                word = word.lstrip("\\")
+                is_word_char = True
+                is_end_of_sentence = False
+
+            t = ParsedToken(word, is_word_char, is_end_of_sentence)
+            tokens.append(t)
+        return tokens
+
+    def get_reading(self, text: str):
+        """
+        Get reading -- some parsers can return readings.
+        """
+        return None
diff --git a/plugins/lute-khmer/pyproject.toml b/plugins/lute-khmer/pyproject.toml
@@ -0,0 +1,26 @@
+# TODO fix names
+[build-system]
+requires = ["flit_core >=3.2,<4"]
+build-backend = "flit_core.buildapi"
+
+[tool.flit.module]
+name = "lute_khmer_parser"
+
+[project]
+name = "lute3-khmer"
+dynamic = ['version']
+description = "Learning Using Texts - Khmer Parser"
+requires-python = ">=3.8"
+authors = [
+  {name = "Justin Dom"}
+]
+readme = "README_PyPi.md"
+
+dependencies = [
+  "lute3>=3.4.2",
+  "khmer-nltk==1.6"
+]
+
+
+[project.entry-points."lute.plugin.parse"]
+lute_khmer = "lute_khmer_parser.parser:KhmerParser"
diff --git a/plugins/lute-khmer/requirements.txt b/plugins/lute-khmer/requirements.txt
@@ -0,0 +1,5 @@
+# Required dependency for base classes.
+lute3>=3.4.2
+
+# extra requirements here.
+khmer-nltk==1.6
diff --git a/plugins/lute-khmer/tests/__init__.py b/plugins/lute-khmer/tests/__init__.py
diff --git a/plugins/lute-khmer/tests/conftest.py b/plugins/lute-khmer/tests/conftest.py
@@ -0,0 +1,36 @@
+"""
+Common fixtures used by many tests.
+"""
+
+import os
+import yaml
+import pytest
+
+
+from lute.parse.registry import init_parser_plugins
+
+from lute.models.language import Language
+
+
+def pytest_sessionstart(session):  # pylint: disable=unused-argument
+    """
+    Pytest session-start hook: initialize the parser list.
+
+    Calls init_parser_plugins(); the session argument is not used.
+    """
+    init_parser_plugins()
+
+
+def _get_test_language():
+    """
+    Retrieve the language definition file for testing ths plugin from definition.yaml
+    """
+    thisdir = os.path.dirname(os.path.realpath(__file__))
+    definition_file = os.path.join(thisdir, "..", "definition.yaml")
+    with open(definition_file, "r", encoding="utf-8") as df:
+        d = yaml.safe_load(df)
+    lang = Language.from_dict(d)
+    return lang
+
+
+@pytest.fixture(name="khmer")
+def fixture_khmer():
+    "Fixture exposed as khmer: the Language returned by _get_test_language()."
+    return _get_test_language()
diff --git a/plugins/lute-khmer/tests/test_KhmerParser.py b/plugins/lute-khmer/tests/test_KhmerParser.py
@@ -0,0 +1,100 @@
+"""
+KhmerParser tests.
+"""
+
+
+import pytest
+
+# pylint: disable=wrong-import-order
+from lute.models.term import Term
+from lute.parse.base import ParsedToken
+
+from lute_khmer_parser.parser import KhmerParser
+
+
+def test_dummy_test():
+    "A dummy test so that pytest doesn't complain in github ci."
+    s = "Hello"
+    assert s == "Hello", "TODO - fix these tests for your parser :-)"
+
+
+def test_token_count(khmer):
+    """
+    token_count checks.
+    """
+    cases = [
+        ("ជំរាបសួរ", 2),
+        ("ខ្ញុំ", 1),
+        ("ខ្ញុំស្រលាញ់អ្នក។", 4),
+        ("ខ្ញុំរៀនភាសាខ្មែរ", 4),
+        ("ខ្ញុំចូលចិត្តរៀនភាសាខ្មែរជាមួយមិត្តរបស់ខ្ញុំ", 9),
+    ]
+
+    for text, expected_count in cases:
+        t = Term(khmer, text)
+        assert t.token_count == expected_count, text
+        assert t.text_lc == t.text, "case"
+
+
+def assert_tokens_equals(text, lang, expected):
+    """
+    Parsing a text using a language should give the expected parsed tokens.
+
+    expected is given as array of:
+    [ original_text, is_word, is_end_of_sentence ]
+    """
+    p = KhmerParser()
+    actual = p.get_parsed_tokens(text, lang)
+    expected = [ParsedToken(*a) for a in expected]
+    assert [str(a) for a in actual] == [str(e) for e in expected]
+
+
+def test_end_of_sentence_stored_in_parsed_tokens(khmer):
+    """
+    ParsedToken is marked as EOS=True at ends of sentences.
+    """
+    s = "ខ្ញុំចូលចិត្តរៀនភាសាខ្មែរជាមួយមិត្តរបស់ខ្ញុំ។ ចុះអ្នកវិញ?"
+
+    expected = [
+        ("ខ្ញុំ", True),
+        ("ចូលចិត្ត", True),
+        ("រៀន", True),
+        ("ភាសា", True),
+        ("ខ្មែរ", True),
+        ("ជាមួយ", True),
+        ("មិត្ត", True),
+        ("របស់", True),
+        ("ខ្ញុំ", True),
+        ("។", False, True),
+        ("ចុះ", True),
+        ("អ្នក", True),
+        ("វិញ", True),
+        ("?", False, True),
+    ]
+    assert_tokens_equals(s, khmer, expected)
+
+
+def test_carriage_returns_treated_as_reverse_p_character(khmer):
+    """
+    Returns need to be marked with the backwards P for rendering etc.
+    """
+    s = "ខ្ញុំចូលចិត្តរៀនភាសាខ្មែរជាមួយមិត្តរបស់ខ្ញុំ។\nចុះអ្នកវិញ?"
+
+    expected = [
+        ("ខ្ញុំ", True),
+        ("ចូលចិត្ត", True),
+        ("រៀន", True),
+        ("ភាសា", True),
+        ("ខ្មែរ", True),
+        ("ជាមួយ", True),
+        ("មិត្ត", True),
+        ("របស់", True),
+        ("ខ្ញុំ", True),
+        ("។", False, True),
+        ("¶", False, True),
+        ("ចុះ", True),
+        ("អ្នក", True),
+        ("វិញ", True),
+        ("?", False, True),
+    ]
+    assert_tokens_equals(s, khmer, expected)