Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added khmer parser plugin #513

Merged
merged 7 commits into from
Nov 16, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions plugins/lute-khmer/.pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
[pytest]
testpaths =
tests

# Acceptance tests were raising FutureWarning:
# FutureWarning: Deleting all cookies via CookieManager.delete()
# with no arguments has been deprecated. use CookieManager.delete_all().
# This is internal to the package, so stopping that.
filterwarnings =
ignore::FutureWarning
7 changes: 7 additions & 0 deletions plugins/lute-khmer/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# README

The Lute Khmer parser.

See [the wiki](https://github.com/LuteOrg/lute-v3/wiki/Developing-language-parser-plugins) for development notes.

See the [PyPI readme](./README_PyPi.md) for extra config notes.
12 changes: 12 additions & 0 deletions plugins/lute-khmer/README_PyPi.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# `lute3-khmer`

A Khmer parser for Lute (`lute3`).

## Installation

See the [Lute manual](https://luteorg.github.io/lute-manual/install/plugins.html).

## Usage

When this parser is installed, you can add "Khmer" as a
language in Lute. The Khmer language comes with a simple story.
22 changes: 22 additions & 0 deletions plugins/lute-khmer/definition.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
name: Khmer
dictionaries:
  - for: terms
    type: embedded
    url: https://en.wiktionary.org/wiki/###
  - for: terms
    type: popup
    url: https://glosbe.com/km/en/###
  - for: terms
    type: popup
    url: https://www.kheng.info/search/?query=###
  # Khmer's language code is "km" ("kh" is the country code for Cambodia).
  - for: sentences
    type: embedded
    url: https://www.bing.com/translator/?from=km&to=en&text=###
show_romanization: true
# right_to_left:

parser_type: lute_khmer
# character_substitutions:
split_sentences: ។?៕
# split_sentence_exceptions:
word_chars: ក-៹
5 changes: 5 additions & 0 deletions plugins/lute-khmer/lute_khmer_parser/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"""
Lute Khmer Parser
"""

__version__ = "0.0.3"
86 changes: 86 additions & 0 deletions plugins/lute-khmer/lute_khmer_parser/parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
"""
Parsing using khmer-nltk

Includes classes:

- KhmerParser

"""

import re

from typing import List

import khmernltk

from lute.parse.base import ParsedToken, AbstractParser


class KhmerParser(AbstractParser):
    """
    A parser for Khmer, using khmernltk to split text into words.
    """

    @classmethod
    def name(cls):
        "The parser name."
        return "Lute Khmer"

    @classmethod
    def uses_data_directory(cls):
        "Whether the data_directory (defined in the AbstractParser) is used: it is not."
        return False

    def get_parsed_tokens(self, text: str, language) -> List[ParsedToken]:
        """
        Returns ParsedToken array for given language.

        The text is split into words by khmernltk.  Whitespace pieces
        are dropped, every line break becomes a "¶" token flagged as
        end of sentence, and any other piece is flagged as a word when
        it starts with one of the language's word characters and is not
        a sentence terminator.
        """
        # Ensure standard carriage returns so that paragraph
        # markers are used correctly.  Lute uses paragraph markers
        # for rendering.  Each newline is swapped for a backslash
        # placeholder before the text is tokenized.
        text = text.replace("\r\n", "\n")
        text = text.replace("\n", "\\")

        # Loop invariants, built once.
        word_char_regex = re.compile(f"[{language.word_characters}]")
        whitespace_regex = re.compile(r"[ \t]+")
        placeholder_regex = re.compile(r"(\\)")
        sentence_terminators = language.regexp_split_sentences

        tokens = []
        for word in khmernltk.word_tokenize(text):
            # The tokenizer may return placeholders glued to other text
            # (e.g. a run of backslashes followed by a word), so split
            # every token on them; the capturing group keeps each
            # placeholder as its own piece.
            for piece in placeholder_regex.split(word):
                if piece == "":
                    # Artifact of splitting; never emit an empty token.
                    continue

                if piece in ("\\", "¶"):
                    tokens.append(ParsedToken("¶", False, True))
                    continue

                if whitespace_regex.match(piece):
                    continue

                is_end_of_sentence = piece in sentence_terminators
                is_word_char = (not is_end_of_sentence) and (
                    word_char_regex.match(piece) is not None
                )
                tokens.append(ParsedToken(piece, is_word_char, is_end_of_sentence))
        return tokens

    def get_reading(self, text: str):
        """
        Get reading -- some parsers can return readings.

        This parser does not provide one, so None is always returned.
        """
        return None
26 changes: 26 additions & 0 deletions plugins/lute-khmer/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# TODO fix names
[build-system]
requires = ["flit_core >=3.2,<4"]
build-backend = "flit_core.buildapi"

[tool.flit.module]
name = "lute_khmer_parser"

[project]
name = "lute3-khmer"
dynamic = ['version']
description = "Learning Using Texts - Khmer Parser"
requires-python = ">=3.8"
authors = [
{name = "Justin Dom"}
]
readme = "README_PyPi.md"

dependencies = [
"lute3>=3.4.2",
"khmer-nltk==1.6"
]


[project.entry-points."lute.plugin.parse"]
lute_khmer = "lute_khmer_parser.parser:KhmerParser"
5 changes: 5 additions & 0 deletions plugins/lute-khmer/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Required dependency for base classes.
lute3>=3.4.2

# Khmer word tokenizer used by the parser.
khmer-nltk==1.6
Empty file.
36 changes: 36 additions & 0 deletions plugins/lute-khmer/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""
Common fixtures used by many tests.
"""

import os
import yaml
import pytest


from lute.parse.registry import init_parser_plugins

from lute.models.language import Language


def pytest_sessionstart(session):  # pylint: disable=unused-argument
    """
    Initialize the parser plugin list when the pytest session starts.
    """
    init_parser_plugins()


def _get_test_language():
    """
    Build the Language used to test this plugin from the definition.yaml
    that sits one directory above this file.
    """
    tests_dir = os.path.dirname(os.path.realpath(__file__))
    yaml_path = os.path.join(tests_dir, "..", "definition.yaml")
    with open(yaml_path, "r", encoding="utf-8") as handle:
        definition = yaml.safe_load(handle)
    return Language.from_dict(definition)


@pytest.fixture(name="khmer")
def fixture_khmer():
    "The Khmer test language, exposed to tests as the 'khmer' fixture."
    khmer_language = _get_test_language()
    return khmer_language
100 changes: 100 additions & 0 deletions plugins/lute-khmer/tests/test_KhmerParser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
"""
KhmerParser tests.
"""


import pytest

# pylint: disable=wrong-import-order
from lute.models.term import Term
from lute.parse.base import ParsedToken

from lute_khmer_parser.parser import KhmerParser


def test_dummy_test():
    "Always-passing placeholder so that pytest doesn't complain in github ci."
    greeting = "Hello"
    assert greeting == "Hello", "TODO - fix these tests for your parser :-)"


def test_token_count(khmer):
    """
    Terms built from sample texts report the expected token_count,
    and their lowercased text equals the original text.
    """
    expected_counts = {
        "ជំរាបសួរ": 2,
        "ខ្ញុំ": 1,
        "ខ្ញុំស្រលាញ់អ្នក។": 4,
        "ខ្ញុំរៀនភាសាខ្មែរ": 4,
        "ខ្ញុំចូលចិត្តរៀនភាសាខ្មែរជាមួយមិត្តរបស់ខ្ញុំ": 9,
    }

    for term_text, count in expected_counts.items():
        term = Term(khmer, term_text)
        assert term.token_count == count, term_text
        assert term.text_lc == term.text, "case"


def assert_tokens_equals(text, lang, expected):
    """
    Assert that parsing text with lang yields the expected parsed tokens.

    expected is a list of ParsedToken argument tuples:
    [ original_text, is_word, is_end_of_sentence ]

    Tokens are compared through their string representations.
    """
    parsed = KhmerParser().get_parsed_tokens(text, lang)
    actual_strings = list(map(str, parsed))
    expected_strings = [str(ParsedToken(*spec)) for spec in expected]
    assert actual_strings == expected_strings


def test_end_of_sentence_stored_in_parsed_tokens(khmer):
    """
    Sentence terminators come back as non-word tokens with EOS=True.
    """
    first_sentence = [
        "ខ្ញុំ",
        "ចូលចិត្ត",
        "រៀន",
        "ភាសា",
        "ខ្មែរ",
        "ជាមួយ",
        "មិត្ត",
        "របស់",
        "ខ្ញុំ",
    ]
    second_sentence = ["ចុះ", "អ្នក", "វិញ"]

    # Words are (text, is_word); terminators are (text, is_word, is_eos).
    expected = (
        [(w, True) for w in first_sentence]
        + [("។", False, True)]
        + [(w, True) for w in second_sentence]
        + [("?", False, True)]
    )

    text = "ខ្ញុំចូលចិត្តរៀនភាសាខ្មែរជាមួយមិត្តរបស់ខ្ញុំ។ ចុះអ្នកវិញ?"
    assert_tokens_equals(text, khmer, expected)


def test_carriage_returns_treated_as_reverse_p_character(khmer):
    """
    A newline in the text is returned as a "¶" token (non-word, EOS=True),
    which is needed for rendering etc.
    """
    first_line = [
        "ខ្ញុំ",
        "ចូលចិត្ត",
        "រៀន",
        "ភាសា",
        "ខ្មែរ",
        "ជាមួយ",
        "មិត្ត",
        "របស់",
        "ខ្ញុំ",
    ]
    second_line = ["ចុះ", "អ្នក", "វិញ"]

    # Words are (text, is_word); markers are (text, is_word, is_eos).
    expected = (
        [(w, True) for w in first_line]
        + [("។", False, True), ("¶", False, True)]
        + [(w, True) for w in second_line]
        + [("?", False, True)]
    )

    text = "ខ្ញុំចូលចិត្តរៀនភាសាខ្មែរជាមួយមិត្តរបស់ខ្ញុំ។\nចុះអ្នកវិញ?"
    assert_tokens_equals(text, khmer, expected)
Loading