Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ES TN Fix for Issue #166 #224

Merged
merged 6 commits into from
Sep 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ pipeline {
AR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-24-24-0'
DE_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-03-24-0'
EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-22-24-0'
ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-14-24-0'
ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-14-24-0'
ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-30-24-0'
ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-30-24-0'
FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-04-24-0'
HU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-16-24-0'
PT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
Expand Down
Original file line number Diff line number Diff line change
@@ -1,23 +1,10 @@
di
Di
DI
mi
Mi
MI
vi
Vi
VI
I
i
V
v
X
x
L
l
C
c
D
d
M
m
82 changes: 55 additions & 27 deletions nemo_text_processing/text_normalization/es/graph_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
ES_PLUS = pynini.union("más", "Más", "MÁS").optimize()


def strip_accent(fst: 'pynini.FstLike') -> 'pynini.FstLike':
def strip_accent(fst: "pynini.FstLike") -> "pynini.FstLike":
"""
Converts all accented vowels to non-accented equivalents

Expand All @@ -54,7 +54,7 @@ def strip_accent(fst: 'pynini.FstLike') -> 'pynini.FstLike':
return fst @ pynini.cdrewrite(accents, "", "", NEMO_SIGMA)


def shift_cardinal_gender(fst: 'pynini.FstLike') -> 'pynini.FstLike':
def shift_cardinal_gender(fst: "pynini.FstLike") -> "pynini.FstLike":
"""
Applies gender conversion rules to a cardinal string. These include: rendering all masculine forms of "uno" (including apocopated forms) as "una" and
converting all gendered numbers in the hundreds series (200,300,400...) to their feminine equivalents (e.g. "doscientos" -> "doscientas"). Conversion only applies
Expand All @@ -76,23 +76,23 @@ def shift_cardinal_gender(fst: 'pynini.FstLike') -> 'pynini.FstLike':
+ (pynini.accep("mil") | pynini.accep("milésimo"))
+ pynini.closure(NEMO_SPACE + hundreds, 0, 1)
+ pynini.closure(NEMO_SPACE + one_to_one_hundred, 0, 1)
+ pynini.union(pynini.accep("[EOS]"), pynini.accep("\""), decimal_separator)
+ pynini.union(pynini.accep("[EOS]"), pynini.accep('"'), decimal_separator)
)
before_double_digits = pynini.closure(NEMO_SPACE + one_to_one_hundred, 0, 1) + pynini.union(
pynini.accep("[EOS]"), pynini.accep("\"")
pynini.accep("[EOS]"), pynini.accep('"')
)

fem_allign = pynini.cdrewrite(fem_hundreds, "", before_mil, NEMO_SIGMA) # doscientas mil dosciento
fem_allign @= pynini.cdrewrite(fem_hundreds, "", before_double_digits, NEMO_SIGMA) # doscientas mil doscienta

fem_allign @= pynini.cdrewrite(
fem_ones, "", pynini.union("[EOS]", "\"", decimal_separator), NEMO_SIGMA
fem_ones, "", pynini.union("[EOS]", '"', decimal_separator), NEMO_SIGMA
) # If before a quote or EOS, we know it's the end of a string

return fst @ fem_allign


def shift_number_gender(fst: 'pynini.FstLike') -> 'pynini.FstLike':
def shift_number_gender(fst: "pynini.FstLike") -> "pynini.FstLike":
"""
Performs gender conversion on all verbalized numbers in output. All values in the hundreds series (200,300,400) are changed to
feminine gender (e.g. "doscientos" -> "doscientas") and all forms of "uno" (including apocopated forms) are converted to "una".
Expand All @@ -107,13 +107,13 @@ def shift_number_gender(fst: 'pynini.FstLike') -> 'pynini.FstLike':
"""
fem_allign = pynini.cdrewrite(fem_hundreds, "", "", NEMO_SIGMA)
fem_allign @= pynini.cdrewrite(
fem_ones, "", pynini.union(NEMO_SPACE, pynini.accep("[EOS]"), pynini.accep("\"")), NEMO_SIGMA
fem_ones, "", pynini.union(NEMO_SPACE, pynini.accep("[EOS]"), pynini.accep('"')), NEMO_SIGMA,
) # If before a quote or EOS, we know it's the end of a string

return fst @ fem_allign


def strip_cardinal_apocope(fst: 'pynini.FstLike') -> 'pynini.FstLike':
def strip_cardinal_apocope(fst: "pynini.FstLike") -> "pynini.FstLike":
"""
Reverts apocope on cardinal strings in line with formation rules. e.g. "un" -> "uno". Due to cardinal formation rules, this in effect only
affects strings where the final value is a variation of "un".
Expand All @@ -126,11 +126,11 @@ def strip_cardinal_apocope(fst: 'pynini.FstLike') -> 'pynini.FstLike':
"""
# Since cardinals use apocope by default for large values (e.g. "millón"), this only needs to act on the last instance of one
strip = pynini.cross("un", "uno") | pynini.cross("ún", "uno")
strip = pynini.cdrewrite(strip, "", pynini.union("[EOS]", "\""), NEMO_SIGMA)
strip = pynini.cdrewrite(strip, "", pynini.union("[EOS]", '"'), NEMO_SIGMA)
return fst @ strip


def add_cardinal_apocope_fem(fst: 'pynini.FstLike') -> 'pynini.FstLike':
def add_cardinal_apocope_fem(fst: "pynini.FstLike") -> "pynini.FstLike":
"""
Adds apocope on cardinal strings in line with stressing rules, e.g. "una" -> "un". This only occurs when "una" precedes a stressed "a" sound in formal speech. This cannot be predicted
from the text string alone, so it is included for non-deterministic cases.
Expand All @@ -143,11 +143,11 @@ def add_cardinal_apocope_fem(fst: 'pynini.FstLike') -> 'pynini.FstLike':
"""
# Since the stress trigger follows the cardinal string and only affects the preceding sound, this only needs to act on the last instance of one
strip = pynini.cross("una", "un") | pynini.cross("veintiuna", "veintiún")
strip = pynini.cdrewrite(strip, "", pynini.union("[EOS]", "\""), NEMO_SIGMA)
strip = pynini.cdrewrite(strip, "", pynini.union("[EOS]", '"'), NEMO_SIGMA)
return fst @ strip


def roman_to_int(fst: 'pynini.FstLike') -> 'pynini.FstLike':
def roman_to_int(fst: "pynini.FstLike") -> "pynini.FstLike":
"""
Alters given fst to convert Roman integers (lower and upper cased) into Arabic numerals. Valid for values up to 1000.
e.g.
Expand All @@ -158,29 +158,57 @@ def roman_to_int(fst: 'pynini.FstLike') -> 'pynini.FstLike':
fst: Any fst. Composes fst onto Roman conversion outputs.
"""

def _load_roman(file: str):
def _load_roman(file: str, upper_casing: bool):
tbartley94 marked this conversation as resolved.
Show resolved Hide resolved
roman = load_labels(get_abs_path(file))
roman_numerals = [(x, y) for x, y in roman] + [(x.upper(), y) for x, y in roman]
if upper_casing:
roman_numerals = [(x.upper(), y) for x, y in roman]
else:
roman_numerals = [(x, y) for x, y in roman]
return pynini.string_map(roman_numerals)

digit = _load_roman("data/roman/digit.tsv")
ties = _load_roman("data/roman/ties.tsv")
hundreds = _load_roman("data/roman/hundreds.tsv")
thousands = _load_roman("data/roman/thousands.tsv")
# All-upper-case and all-lower-case Roman numerals are handled as separate paths in order to preserve orthographic accuracy
# and to prevent mixed-case proper nouns (e.g. Li, Xi, Yi) from being misread as Roman numerals and converted to numbers.

digit_lower = _load_roman("data/roman/digit.tsv", False)
digit_upper = _load_roman("data/roman/digit.tsv", True)
ties_lower = _load_roman("data/roman/ties.tsv", False)
ties_upper = _load_roman("data/roman/ties.tsv", True)
hundreds_lower = _load_roman("data/roman/hundreds.tsv", False)
hundreds_upper = _load_roman("data/roman/hundreds.tsv", True)
thousands_lower = _load_roman("data/roman/thousands.tsv", False)
thousands_upper = _load_roman("data/roman/thousands.tsv", True)

graph = (
digit
| ties + (digit | pynutil.add_weight(pynutil.insert("0"), 0.01))
(digit_upper | digit_lower)
| (
(ties_upper + (digit_upper | pynutil.add_weight(pynutil.insert("0"), 0.01)))
| (ties_lower + (digit_lower | pynutil.add_weight(pynutil.insert("0"), 0.01)))
)
| (
hundreds
+ (ties | pynutil.add_weight(pynutil.insert("0"), 0.01))
+ (digit | pynutil.add_weight(pynutil.insert("0"), 0.01))
(
hundreds_upper
+ (ties_upper | pynutil.add_weight(pynutil.insert("0"), 0.01))
+ (digit_upper | pynutil.add_weight(pynutil.insert("0"), 0.01))
)
| (
hundreds_lower
+ (ties_lower | pynutil.add_weight(pynutil.insert("0"), 0.01))
+ (digit_lower | pynutil.add_weight(pynutil.insert("0"), 0.01))
)
)
| (
thousands
+ (hundreds | pynutil.add_weight(pynutil.insert("0"), 0.01))
+ (ties | pynutil.add_weight(pynutil.insert("0"), 0.01))
+ (digit | pynutil.add_weight(pynutil.insert("0"), 0.01))
(
thousands_upper
+ (hundreds_upper | pynutil.add_weight(pynutil.insert("0"), 0.01))
+ (ties_upper | pynutil.add_weight(pynutil.insert("0"), 0.01))
+ (digit_upper | pynutil.add_weight(pynutil.insert("0"), 0.01))
)
| (
thousands_lower
+ (hundreds_lower | pynutil.add_weight(pynutil.insert("0"), 0.01))
+ (ties_lower | pynutil.add_weight(pynutil.insert("0"), 0.01))
+ (digit_lower | pynutil.add_weight(pynutil.insert("0"), 0.01))
)
)
).optimize()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,4 +114,7 @@
1ra~primera
maría vii~maría séptima~maría séptimo
todo mi reconocimiento~todo mi reconocimiento
V~V
V~V
El texto de Li Qin en este libro ahora está disponible en forma de libro electrónico.~El texto de Li Qin en este libro ahora está disponible en forma de libro electrónico.
Xi Jinping es el actual presidente de China.~Xi Jinping es el actual presidente de China.
Matías fue el XI apóstol.~Matías fue el undécimo apóstol.
Loading