diff --git a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py index cde940156502..a39131440cec 100644 --- a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py +++ b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py @@ -23,7 +23,7 @@ class InverseNormalizer(Normalizer): """ - Inverse normalizer that converts text from spoken to written form. Useful for ASR postprocessing. + Inverse normalizer that converts text from spoken to written form. Useful for ASR postprocessing. Input is expected to have no punctuation outside of apostrophe (') and dash (-) and be lower cased. Args: @@ -46,6 +46,12 @@ def __init__(self, lang: str = 'en', cache_dir: str = None, overwrite_cache: boo VerbalizeFinalFst, ) + elif lang == 'pt': + from nemo_text_processing.inverse_text_normalization.pt.taggers.tokenize_and_classify import ClassifyFst + from nemo_text_processing.inverse_text_normalization.pt.verbalizers.verbalize_final import ( + VerbalizeFinalFst, + ) + elif lang == 'ru': from nemo_text_processing.inverse_text_normalization.ru.taggers.tokenize_and_classify import ClassifyFst from nemo_text_processing.inverse_text_normalization.ru.verbalizers.verbalize_final import ( VerbalizeFinalFst, ) @@ -75,7 +81,7 @@ def __init__(self, lang: str = 'en', cache_dir: str = None, overwrite_cache: boo def inverse_normalize_list(self, texts: List[str], verbose=False) -> List[str]: """ - NeMo inverse text normalizer + NeMo inverse text normalizer Args: texts: list of input strings @@ -106,7 +112,7 @@ def parse_args(): input.add_argument("--input_file", dest="input_file", help="input file path", type=str) parser.add_argument('--output_file', dest="output_file", help="output file path", type=str) parser.add_argument( - "--language", help="language", choices=['en', 'de', 'es', 'ru', 'fr', 'vi'], default="en", type=str + "--language", help="language", choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'vi'], default="en", type=str ) parser.add_argument("--verbose", help="print info for debugging", action='store_true') parser.add_argument("--overwrite_cache", help="set to True to re-create .far grammar files", action="store_true") diff --git a/nemo_text_processing/inverse_text_normalization/pt/__init__.py b/nemo_text_processing/inverse_text_normalization/pt/__init__.py new file mode 100644 index 000000000000..c1586debd25f --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +from nemo_text_processing.inverse_text_normalization.pt.taggers.tokenize_and_classify import ClassifyFst +from nemo_text_processing.inverse_text_normalization.pt.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.inverse_text_normalization.pt.verbalizers.verbalize_final import VerbalizeFinalFst diff --git a/nemo_text_processing/inverse_text_normalization/pt/data/currency_plural.tsv b/nemo_text_processing/inverse_text_normalization/pt/data/currency_plural.tsv new file mode 100644 index 000000000000..a89a763093ea --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/data/currency_plural.tsv @@ -0,0 +1,5 @@ +€ euros +£ libras esterlinas +US$ dólares americanos +$ dólares +R$ reais \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/pt/data/currency_singular.tsv b/nemo_text_processing/inverse_text_normalization/pt/data/currency_singular.tsv new file mode 100644 index 000000000000..9ec77dc35654 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/data/currency_singular.tsv @@ -0,0 +1,5 @@ +€ euro +£ libra esterlina +US$ dólar americano +$ dólar +R$ real \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/pt/data/electronic/__init__.py b/nemo_text_processing/inverse_text_normalization/pt/data/electronic/__init__.py new file mode 100644 index 000000000000..a1cf281f0908 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/data/electronic/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/pt/data/electronic/domain.tsv b/nemo_text_processing/inverse_text_normalization/pt/data/electronic/domain.tsv new file mode 100644 index 000000000000..ea547b890119 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/data/electronic/domain.tsv @@ -0,0 +1,26 @@ +com +es +uk +fr +net +br +in +ru +de +it +edu +co +ar +bo +cl +co +ec +fk +gf +fy +pe +py +sr +ve +uy +pt \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/pt/data/electronic/server_name.tsv b/nemo_text_processing/inverse_text_normalization/pt/data/electronic/server_name.tsv new file mode 100644 index 000000000000..34ab709bb308 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/data/electronic/server_name.tsv @@ -0,0 +1,11 @@ +gmail g mail +gmail +nvidia n vidia +nvidia +outlook +hotmail +yahoo +aol +live +msn +live \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/pt/data/electronic/symbols.tsv b/nemo_text_processing/inverse_text_normalization/pt/data/electronic/symbols.tsv new file mode 100644 index 000000000000..690a9ca427f1 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/data/electronic/symbols.tsv @@ -0,0 +1,6 @@ +. 
ponto +- traço +- hífen +_ traço baixo +_ underscore +/ barra \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/pt/data/measurements_plural.tsv b/nemo_text_processing/inverse_text_normalization/pt/data/measurements_plural.tsv new file mode 100755 index 000000000000..6d2684afdf8b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/data/measurements_plural.tsv @@ -0,0 +1,56 @@ +h horas +min minutos +s segundos +ms milissegundos +ns nanossegundos +μs microssegundos +t toneladas +kg quilos +kg quilogramas +g gramas +mg miligramas +μm micrômetros +nm nanômetros +mm milímetros +cm centímetros +cm² centímetros quadrado +cm³ centímetros cúbico +m metros +m² metros quadrados +m³ metros cúbicos +km quilômetros +km² quilômetros quadrados +ha hectares +kph quilômetros por hora +mph milhas por hora +m/s metros por segundo +l litros +ml mililitros +kgf quilogramas forças +kgf quilogramas força +% por cento +°F fahrenheit +°C celsius +°F graus fahrenheit +°C graus celsius +Hz hertz +kHz quilo hertz +MHz mega hertz +GHz giga hertz +W watts +kW quilowatts +MW megawatts +GW gigawatts +Wh watts hora +kWh quilowatts hora +MWh megawatts hora +GWh gigawatts hora +kV quilovolts +V volts +mV milivolts +A amperes +mA miliamperes +rpm rotações por minuto +db decibéis +cal calorias +kcal quilocalorias diff --git a/nemo_text_processing/inverse_text_normalization/pt/data/measurements_singular.tsv b/nemo_text_processing/inverse_text_normalization/pt/data/measurements_singular.tsv new file mode 100755 index 000000000000..bf7320e6242c --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/data/measurements_singular.tsv @@ -0,0 +1,55 @@ +h hora +min minuto +s segundo +ms milissegundo +ns nanossegundo +μs microssegundo +t tonelada +kg quilo +kg quilograma +g grama +mg miligrama +μm micrômetro +nm nanômetro +mm milímetro +cm centímetro +cm² centímetro quadrado +cm³ centímetro cúbico +m metro +m² metro quadrado +m³ metro cúbico +km quilômetro +km² quilômetro quadrado +ha hectare +kph quilômetro por hora +mph milha por hora +m/s metro por segundo +l litro +ml mililitro +kgf quilograma força +% por cento +°F fahrenheit +°C celsius +°F grau fahrenheit +°C grau celsius +Hz hertz +kHz quilo hertz +MHz mega hertz +GHz giga hertz +W watt +kW quilowatt +MW megawatt +GW gigawatt +Wh watt hora +kWh quilowatt hora +MWh megawatt hora +GWh gigawatt hora +kV quilovolt +V volt +mV milivolt +A ampere +mA miliampere +rpm rotação por minuto +db decibel +cal caloria +kcal quilocaloria diff --git a/nemo_text_processing/inverse_text_normalization/pt/data/months.tsv b/nemo_text_processing/inverse_text_normalization/pt/data/months.tsv new file mode 100644 index 000000000000..ed1cf8d4f78c --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/data/months.tsv @@ -0,0 +1,12 @@ +janeiro +fevereiro +março +abril +maio +junho +julho +agosto +setembro +outubro +novembro +dezembro diff --git a/nemo_text_processing/inverse_text_normalization/pt/data/numbers/__init__.py b/nemo_text_processing/inverse_text_normalization/pt/data/numbers/__init__.py new file mode 100644 index 000000000000..a1cf281f0908 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/data/numbers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/pt/data/numbers/digit.tsv b/nemo_text_processing/inverse_text_normalization/pt/data/numbers/digit.tsv new file mode 100644 index 000000000000..fda1b633b2fb --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/data/numbers/digit.tsv @@ -0,0 +1,11 @@ +um 1 +uma 1 +dois 2 +duas 2 +três 3 +quatro 4 +cinco 5 +seis 6 +sete 7 +oito 8 +nove 9 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/pt/data/numbers/hundreds.tsv b/nemo_text_processing/inverse_text_normalization/pt/data/numbers/hundreds.tsv new file mode 100644 index 000000000000..ff06089d3e67 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/data/numbers/hundreds.tsv @@ -0,0 +1,17 @@ +cento 1 +duzentos 2 +duzentas 2 +trezentos 3 +trezentas 3 +quatrocentos 4 +quatrocentas 4 +quinhentos 5 +quinhentas 5 +seiscentos 6 +seiscentas 6 +setecentos 7 +setecentas 7 +oitocentos 8 +oitocentas 8 +novecentos 9 +novecentas 9 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/pt/data/numbers/onehundred.tsv b/nemo_text_processing/inverse_text_normalization/pt/data/numbers/onehundred.tsv new file mode 100644 index 000000000000..1b5f9fa05302 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/data/numbers/onehundred.tsv @@ -0,0 +1 @@ +cem 100 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/pt/data/numbers/teen.tsv b/nemo_text_processing/inverse_text_normalization/pt/data/numbers/teen.tsv new file mode 100644 index 000000000000..6bc21cccfc30 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/data/numbers/teen.tsv @@ -0,0 +1,11 @@ +dez 10 +onze 11 +doze 12 +treze 13 +catorze 14 +quatorze 14 +quinze 15 +dezesseis 16 +dezessete 17 +dezoito 18 +dezenove 19 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/pt/data/numbers/ties.tsv b/nemo_text_processing/inverse_text_normalization/pt/data/numbers/ties.tsv new file mode 100644 index 000000000000..63ff93c83220 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/data/numbers/ties.tsv @@ -0,0 +1,8 @@ +vinte 2 +trinta 3 +quarenta 4 +cinquenta 5 +sessenta 6 +setenta 7 +oitenta 8 +noventa 9 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/pt/data/numbers/twenties.tsv b/nemo_text_processing/inverse_text_normalization/pt/data/numbers/twenties.tsv new file mode 100644 index 000000000000..c72178c15ed1 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/data/numbers/twenties.tsv @@ -0,0 +1,9 @@ +vinte um 21 +vinte dois 22 +vinte três 23 +vinte quatro 24 +vinte cinco 25 +vinte seis 26 +vinte sete 27 +vinte oito 28 +vinte nove 29 diff --git a/nemo_text_processing/inverse_text_normalization/pt/data/numbers/zero.tsv b/nemo_text_processing/inverse_text_normalization/pt/data/numbers/zero.tsv new file mode 100644 index 000000000000..c479272d4039 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/data/numbers/zero.tsv @@ -0,0 +1 @@ +zero 0 
\ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/pt/data/ordinals/__init__.py b/nemo_text_processing/inverse_text_normalization/pt/data/ordinals/__init__.py new file mode 100644 index 000000000000..a1cf281f0908 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/data/ordinals/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/pt/data/ordinals/digit.tsv b/nemo_text_processing/inverse_text_normalization/pt/data/ordinals/digit.tsv new file mode 100644 index 000000000000..ad97cc411414 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/data/ordinals/digit.tsv @@ -0,0 +1,18 @@ +primeiro 1 +primeira 1 +segundo 2 +segunda 2 +terceiro 3 +terceira 3 +quarto 4 +quarta 4 +quinto 5 +quinta 5 +sexto 6 +sexta 6 +sétimo 7 +sétima 7 +oitavo 8 +oitava 8 +nono 9 +nona 9 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/pt/data/ordinals/hundreds.tsv b/nemo_text_processing/inverse_text_normalization/pt/data/ordinals/hundreds.tsv new file mode 100644 index 000000000000..b7b15ee92488 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/data/ordinals/hundreds.tsv @@ -0,0 +1,28 @@ +centésimo 1 +centésima 1 +ducentésimo 2 +ducentésima 2 +tricentésimo 3 +tricentésima 3 +trecentésimo 3 +trecentésima 3 +quadringentésimo 4 +quadringentésima 4 +quingentésimo 5 +quingentésima 5 +sexcentésimo 6 +sexcentésima 6 +seiscentésimo 6 +seiscentésima 6 +septingentésimo 7 +septingentésima 7 +setingentésimo 7 +setingentésima 7 +octingentésimo 8 +octingentésima 8 +octogentésimo 8 +octogentésima 8 +noningentésimo 9 +noningentésima 9 +nongentésimo 9 +nongentésima 9 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/pt/data/ordinals/ties.tsv b/nemo_text_processing/inverse_text_normalization/pt/data/ordinals/ties.tsv new file mode 100644 index 000000000000..55c4c4ee2fa3 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/data/ordinals/ties.tsv @@ -0,0 +1,20 @@ +décimo 1 +décima 1 +vigésimo 2 +vigésima 2 +trigésimo 3 +trigésima 3 +quadragésimo 4 +quadragésima 4 +quinquagésimo 5 +quinquagésima 5 +sexagésimo 6 +sexagésima 6 +septuagésimo 7 +septuagésima 7 +setuagésimo 7 +setuagésima 7 +octogésimo 8 +octogésima 8 +nonagésimo 9 +nonagésima 9 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/pt/data/time/__init__.py b/nemo_text_processing/inverse_text_normalization/pt/data/time/__init__.py new file mode 100644 index 000000000000..a1cf281f0908 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/data/time/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/pt/data/time/hour_to_am.tsv b/nemo_text_processing/inverse_text_normalization/pt/data/time/hour_to_am.tsv new file mode 100644 index 000000000000..c22366ba8665 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/data/time/hour_to_am.tsv @@ -0,0 +1 @@ +1 0 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/pt/data/time/hour_to_pm.tsv b/nemo_text_processing/inverse_text_normalization/pt/data/time/hour_to_pm.tsv new file mode 100644 index 000000000000..548045d94c60 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/data/time/hour_to_pm.tsv @@ -0,0 +1 @@ +1 12 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/pt/data/time/hours_to.tsv b/nemo_text_processing/inverse_text_normalization/pt/data/time/hours_to.tsv new file mode 100644 index 000000000000..5742d596b64d --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/data/time/hours_to.tsv @@ -0,0 +1,23 @@ +0 23 +2 1 +3 2 +4 3 +5 4 +6 5 +7 6 +8 7 +9 8 +10 9 +11 10 +12 11 +13 12 +14 13 +15 14 +16 15 +17 16 +18 17 +19 18 +20 19 +21 20 +22 21 +23 22 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/pt/data/time/minutes_to.tsv b/nemo_text_processing/inverse_text_normalization/pt/data/time/minutes_to.tsv new file mode 100644 index 000000000000..d8516a9f83ce --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/data/time/minutes_to.tsv @@ -0,0 +1,59 @@ +01 59 +02 58 +03 57 +04 56 +05 55 +06 54 +07 53 +08 52 +09 51 +10 50 +11 49 +12 48 +13 47 +14 46 +15 45 +16 44 +17 43 +18 42 +19 41 +20 40 +21 39 +22 38 +23 37 +24 36 +25 35 +26 34 +27 33 +28 32 +29 31 +30 30 +31 29 +32 28 +33 27 +34 26 +35 25 +36 24 +37 23 +38 22 +39 21 +40 20 +41 19 +42 18 +43 17 +44 16 +45 15 +46 14 +47 13 +48 12 +49 11 +50 10 +51 09 +52 08 +53 07 +54 06 +55 05 +56 04 +57 03 +58 02 +59 01 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/pt/data/time/time_suffix_am.tsv b/nemo_text_processing/inverse_text_normalization/pt/data/time/time_suffix_am.tsv new file mode 100644 index 000000000000..95394d7a6145 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/data/time/time_suffix_am.tsv @@ -0,0 +1,2 @@ +da madrugada da madrugada +da manhã da manhã \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/pt/data/time/time_suffix_pm.tsv b/nemo_text_processing/inverse_text_normalization/pt/data/time/time_suffix_pm.tsv new file mode 100644 index 000000000000..18c7c994b020 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/data/time/time_suffix_pm.tsv @@ -0,0 +1,2 @@ +da tarde da tarde +da noite da noite \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/pt/data/whitelist.tsv b/nemo_text_processing/inverse_text_normalization/pt/data/whitelist.tsv new file 
mode 100644 index 000000000000..bd82088f7990 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/data/whitelist.tsv @@ -0,0 +1,5 @@ +segunda-feira segunda feira +terça-feira terça feira +quarta-feira quarta feira +quinta-feira quinta feira +sexta-feira sexta feira \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/pt/taggers/__init__.py b/nemo_text_processing/inverse_text_normalization/pt/taggers/__init__.py new file mode 100644 index 000000000000..a1cf281f0908 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/taggers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/pt/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/pt/taggers/cardinal.py new file mode 100644 index 000000000000..115cea6cdd3d --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/taggers/cardinal.py @@ -0,0 +1,380 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pynini +from nemo_text_processing.inverse_text_normalization.pt.utils import get_abs_path +from nemo_text_processing.text_normalization.en.graph_utils import ( + NEMO_ALPHA, + NEMO_DIGIT, + NEMO_SIGMA, + NEMO_SPACE, + NEMO_WHITE_SPACE, + GraphFst, + delete_space, +) +from pynini.lib import pynutil + + +class CardinalFst(GraphFst): + """ + Finite state transducer for classifying cardinals + e.g. menos vinte e três -> cardinal { negative: "-" integer: "23"} + This class converts cardinals up to (but not including) "um septilhão", + i.e. up to "one septillion" in English (10^{24}). + Cardinals below ten are not converted (in order to avoid + "vivo em uma casa" --> "vivo em 1 casa" and any other odd conversions.) + + Although technically Portuguese grammar requires that "e" only comes after + "10s" numbers (i.e. "trinta", ..., "noventa"), these rules will convert + numbers even with "e" in an ungrammatical place (because "e" is ignored + inside cardinal numbers). + e.g. "mil e uma" -> cardinal { integer: "1001"} + e.g.
"cento e uma" -> cardinal { integer: "101"} + """ + + def __init__(self, use_strict_e=False): + """ + :param use_strict_e: When True forces to have the separator "e" in the right places + """ + super().__init__(name="cardinal", kind="classify") + graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv")) + graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) + graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv")) + graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv")) + graph_twenties = pynini.string_file(get_abs_path("data/numbers/twenties.tsv")) + graph_one_hundred = pynini.string_file(get_abs_path("data/numbers/onehundred.tsv")) + graph_hundreds = pynini.string_file(get_abs_path("data/numbers/hundreds.tsv")) + + graph = None + + if not use_strict_e: + graph_hundred_component = graph_hundreds | pynutil.insert("0") + graph_hundred_component += delete_space + graph_hundred_component += pynini.union( + graph_twenties | graph_teen | pynutil.insert("00"), + (graph_ties | pynutil.insert("0")) + delete_space + (graph_digit | pynutil.insert("0")), + ) + graph_hundred_component = pynini.union(graph_hundred_component, graph_one_hundred) + + graph_hundred_component_at_least_one_none_zero_digit = graph_hundred_component @ ( + pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT) + ) + + graph_thousands = pynini.union( + graph_hundred_component_at_least_one_none_zero_digit + delete_space + pynutil.delete("mil"), + pynutil.insert("001") + pynutil.delete("mil"), # because we say 'mil', not 'hum mil' + pynutil.insert("000", weight=0.01), + ) + + graph_milhoes = pynini.union( + graph_hundred_component_at_least_one_none_zero_digit + + delete_space + + (pynutil.delete("milhão") | pynutil.delete("milhões")), + pynutil.insert("000", weight=0.01), + ) + + graph_bilhoes = pynini.union( + graph_hundred_component_at_least_one_none_zero_digit + + delete_space + + (pynutil.delete("bilhão") | pynutil.delete("bilhões")), + pynutil.insert("000", weight=0.01), + ) + + graph_trilhoes = pynini.union( + graph_hundred_component_at_least_one_none_zero_digit + + delete_space + + (pynutil.delete("trilhão") | pynutil.delete("trilhões")), + pynutil.insert("000", weight=0.01), + ) + + graph_quatrilhoes = pynini.union( + graph_hundred_component_at_least_one_none_zero_digit + + delete_space + + (pynutil.delete("quatrilhão") | pynutil.delete("quatrilhões")), + pynutil.insert("000", weight=0.01), + ) + + graph_quintilhoes = pynini.union( + graph_hundred_component_at_least_one_none_zero_digit + + delete_space + + (pynutil.delete("quintilhão") | pynutil.delete("quintilhões")), + pynutil.insert("000", weight=0.01), + ) + + graph_sextilhoes = pynini.union( + graph_hundred_component_at_least_one_none_zero_digit + + delete_space + + (pynutil.delete("sextilhão") | pynutil.delete("sextilhões")), + pynutil.insert("000", weight=0.01), + ) + + graph = pynini.union( + graph_sextilhoes + + delete_space + + graph_quintilhoes + + delete_space + + graph_quatrilhoes + + delete_space + + graph_trilhoes + + delete_space + + graph_bilhoes + + delete_space + + graph_milhoes + + delete_space + + graph_thousands + + delete_space + + graph_hundred_component, + graph_zero, + ) + + graph = graph @ pynini.union( + pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), + "0", + ) + + graph = ( + pynini.cdrewrite(pynutil.delete("e"), NEMO_SPACE, NEMO_SPACE, NEMO_SIGMA) + @ (NEMO_ALPHA + NEMO_SIGMA) + @ graph + ) + + else: + 
graph_e = ( + pynutil.delete(NEMO_WHITE_SPACE.plus) + pynutil.delete("e") + pynutil.delete(NEMO_WHITE_SPACE.plus) + ) + + graph_ties_component = pynini.union( + graph_teen | graph_twenties, + graph_ties + ((graph_e + graph_digit) | pynutil.insert("0")), + pynutil.add_weight(pynutil.insert("0") + graph_digit, 0.1), + ) @ (pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT)) + + graph_hundreds_except_hundred = (pynini.project(graph_hundreds, "input") - "cento") @ graph_hundreds + + graph_hundred_component_prefix_e = pynini.union( + graph_one_hundred, + pynutil.add_weight(graph_hundreds_except_hundred + pynutil.insert("00"), 0.1), + pynutil.insert("0") + graph_ties_component, + ) @ (pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT)) + graph_hundred_component_prefix_e = graph_hundred_component_prefix_e.optimize() + + graph_hundred_component_no_prefix = pynini.union(graph_hundreds + graph_e + graph_ties_component,) @ ( + pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT) + ) + graph_hundred_component_no_prefix = graph_hundred_component_no_prefix.optimize() + + graph_mil_prefix_e = pynini.union( + # because we say 'mil', not 'hum mil' + ( + (graph_hundred_component_prefix_e + delete_space + pynutil.delete("mil")) + | (pynutil.insert("001", weight=0.1) + pynutil.delete("mil")) + ) + + ( + (graph_e + graph_hundred_component_prefix_e) + | (delete_space + graph_hundred_component_no_prefix) + | pynutil.insert("000", weight=0.1) + ) + ) + + graph_mil_no_prefix = pynini.union( + ( + (graph_hundred_component_no_prefix + delete_space + pynutil.delete("mil")) + | pynutil.insert("000", weight=0.1) + ) + + ( + (graph_e + graph_hundred_component_prefix_e) + | (delete_space + graph_hundred_component_no_prefix) + | pynutil.insert("000", weight=0.1) + ) + ) + + graph_milhao_prefix_e = pynini.union( + ( + graph_hundred_component_prefix_e + + delete_space + + (pynutil.delete("milhão") | pynutil.delete("milhões")) + ) + + ((graph_e + graph_mil_prefix_e) | (delete_space + graph_mil_no_prefix)) + ) + + graph_milhao_no_prefix = pynini.union( + ( + ( + graph_hundred_component_no_prefix + + delete_space + + (pynutil.delete("milhão") | pynutil.delete("milhões")) + ) + | pynutil.insert("000", weight=0.1) + ) + + ((graph_e + graph_mil_prefix_e) | (delete_space + graph_mil_no_prefix)) + ) + + graph_bilhao_prefix_e = pynini.union( + ( + graph_hundred_component_prefix_e + + delete_space + + (pynutil.delete("bilhão") | pynutil.delete("bilhões")) + ) + + ((graph_e + graph_milhao_prefix_e) | (delete_space + graph_milhao_no_prefix)) + ) + + graph_bilhao_no_prefix = pynini.union( + ( + ( + graph_hundred_component_no_prefix + + delete_space + + (pynutil.delete("bilhão") | pynutil.delete("bilhões")) + ) + | pynutil.insert("000", weight=0.1) + ) + + ((graph_e + graph_milhao_prefix_e) | (delete_space + graph_milhao_no_prefix)) + ) + + graph_trilhao_prefix_e = pynini.union( + ( + graph_hundred_component_prefix_e + + delete_space + + (pynutil.delete("trilhão") | pynutil.delete("trilhões")) + ) + + ((graph_e + graph_bilhao_prefix_e) | (delete_space + graph_bilhao_no_prefix)) + ) + + graph_trilhao_no_prefix = pynini.union( + ( + ( + graph_hundred_component_no_prefix + + delete_space + + (pynutil.delete("trilhão") | pynutil.delete("trilhões")) + ) + | pynutil.insert("000", weight=0.1) + ) + + ((graph_e + graph_bilhao_prefix_e) | (delete_space + graph_bilhao_no_prefix)) + ) + + graph_quatrilhao_prefix_e = pynini.union( + ( + graph_hundred_component_prefix_e + + 
delete_space + + (pynutil.delete("quatrilhão") | pynutil.delete("quatrilhões")) + ) + + ((graph_e + graph_trilhao_prefix_e) | (delete_space + graph_trilhao_no_prefix)) + ) + + graph_quatrilhao_no_prefix = pynini.union( + ( + ( + graph_hundred_component_no_prefix + + delete_space + + (pynutil.delete("quatrilhão") | pynutil.delete("quatrilhões")) + ) + | pynutil.insert("000", weight=0.1) + ) + + ((graph_e + graph_trilhao_prefix_e) | (delete_space + graph_trilhao_no_prefix)) + ) + + graph_quintilhao_prefix_e = pynini.union( + ( + graph_hundred_component_prefix_e + + delete_space + + (pynutil.delete("quintilhão") | pynutil.delete("quintilhões")) + ) + + ((graph_e + graph_quatrilhao_prefix_e) | (delete_space + graph_quatrilhao_no_prefix)) + ) + + graph_quintilhao_no_prefix = pynini.union( + ( + ( + graph_hundred_component_no_prefix + + delete_space + + (pynutil.delete("quintilhão") | pynutil.delete("quintilhões")) + ) + | pynutil.insert("000", weight=0.1) + ) + + ((graph_e + graph_quatrilhao_prefix_e) | (delete_space + graph_quatrilhao_no_prefix)) + ) + + graph_sextilhao_prefix_e = pynini.union( + ( + graph_hundred_component_prefix_e + + delete_space + + (pynutil.delete("sextilhão") | pynutil.delete("sextilhões")) + ) + + ((graph_e + graph_quintilhao_prefix_e) | (delete_space + graph_quintilhao_no_prefix)) + ) + + graph_sextilhao_no_prefix = pynini.union( + ( + ( + graph_hundred_component_no_prefix + + delete_space + + (pynutil.delete("sextilhão") | pynutil.delete("sextilhões")) + ) + | pynutil.insert("000", weight=0.1) + ) + + ((graph_e + graph_quintilhao_prefix_e) | (delete_space + graph_quintilhao_no_prefix)) + ) + + graph = pynini.union( + graph_sextilhao_no_prefix, + graph_sextilhao_prefix_e, + graph_quintilhao_prefix_e, + graph_quatrilhao_prefix_e, + graph_trilhao_prefix_e, + graph_bilhao_prefix_e, + graph_milhao_prefix_e, + graph_mil_prefix_e, + graph_hundred_component_prefix_e, + graph_ties_component, + graph_zero, + ).optimize() + + graph = graph @ pynini.union( + pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT), + "0", + ) + + graph = graph.optimize() + self.graph_no_exception = graph + + # save self.numbers_up_to_thousand for use in DecimalFst + digits_up_to_thousand = NEMO_DIGIT | (NEMO_DIGIT ** 2) | (NEMO_DIGIT ** 3) + numbers_up_to_thousand = pynini.compose(graph, digits_up_to_thousand).optimize() + self.numbers_up_to_thousand = numbers_up_to_thousand + + # save self.numbers_up_to_million for use in DecimalFst + digits_up_to_million = ( + NEMO_DIGIT + | (NEMO_DIGIT ** 2) + | (NEMO_DIGIT ** 3) + | (NEMO_DIGIT ** 4) + | (NEMO_DIGIT ** 5) + | (NEMO_DIGIT ** 6) + ) + numbers_up_to_million = pynini.compose(graph, digits_up_to_million).optimize() + self.numbers_up_to_million = numbers_up_to_million + + # don't convert cardinals from zero to nine inclusive + graph_exception = pynini.project(pynini.union(graph_digit, graph_zero), 'input') + + self.graph = (pynini.project(graph, "input") - graph_exception.arcsort()) @ graph + + optional_minus_graph = pynini.closure( + pynutil.insert("negative: ") + pynini.cross("menos", "\"-\"") + NEMO_SPACE, 0, 1 + ) + + final_graph = optional_minus_graph + pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"") + + final_graph = self.add_tokens(final_graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/pt/taggers/date.py b/nemo_text_processing/inverse_text_normalization/pt/taggers/date.py new file mode 100644 index 000000000000..f0cd2b94c8e1 
--- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/taggers/date.py @@ -0,0 +1,59 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from nemo_text_processing.inverse_text_normalization.pt.utils import get_abs_path +from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, delete_extra_space, delete_space +from pynini.lib import pynutil + + +class DateFst(GraphFst): + """ + Finite state transducer for classifying date, + e.g. primeiro de janeiro -> date { day: "1" month: "janeiro" } + e.g. um de janeiro -> date { day: "1" month: "janeiro" } + """ + + def __init__(self): + super().__init__(name="date", kind="classify") + + graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) + graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv")) + graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv")) + graph_twenties = pynini.string_file(get_abs_path("data/numbers/twenties.tsv")) + + graph_1_to_100 = pynini.union( + graph_digit, + graph_twenties, + graph_teen, + (graph_ties + pynutil.insert("0")), + (graph_ties + pynutil.delete(" e ") + graph_digit), + ) + + digits_1_to_31 = [str(digits) for digits in range(1, 32)] + graph_1_to_31 = graph_1_to_100 @ pynini.union(*digits_1_to_31) + # can use "primeiro" for 1st day of the month + graph_1_to_31 = pynini.union(graph_1_to_31, pynini.cross("primeiro", "1")) + + day_graph = pynutil.insert("day: \"") + graph_1_to_31 + pynutil.insert("\"") + + month_graph = pynini.string_file(get_abs_path("data/months.tsv")) + month_graph = pynutil.insert("month: \"") + month_graph + pynutil.insert("\"") + + graph_dm = day_graph + delete_space + pynutil.delete("de") + delete_extra_space + month_graph + + final_graph = graph_dm + final_graph += pynutil.insert(" preserve_order: true") + final_graph = self.add_tokens(final_graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/pt/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/pt/taggers/decimal.py new file mode 100644 index 000000000000..dab779965ed3 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/taggers/decimal.py @@ -0,0 +1,119 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pynini +from nemo_text_processing.inverse_text_normalization.pt.utils import get_abs_path +from nemo_text_processing.text_normalization.en.graph_utils import ( + NEMO_DIGIT, + GraphFst, + delete_extra_space, + delete_space, +) +from pynini.lib import pynutil + + +def get_quantity(decimal: 'pynini.FstLike', cardinal_up_to_million: 'pynini.FstLike') -> 'pynini.FstLike': + """ + Returns FST that transforms either a cardinal or decimal followed by a quantity into a numeral, + e.g. one million -> integer_part: "1" quantity: "million" + e.g. one point five million -> integer_part: "1" fractional_part: "5" quantity: "million" + + Args: + decimal: decimal FST + cardinal_up_to_million: cardinal FST + """ + numbers = cardinal_up_to_million @ ( + pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT) + ) + + suffix = pynini.union( + "milhão", + "milhões", + "bilhão", + "bilhões", + "trilhão", + "trilhões", + "quatrilhão", + "quatrilhões", + "quintilhão", + "quintilhões", + "sextilhão", + "sextilhões", + ) + res = ( + pynutil.insert("integer_part: \"") + + numbers + + pynutil.insert("\"") + + delete_extra_space + + pynutil.insert("quantity: \"") + + suffix + + pynutil.insert("\"") + ) + res |= decimal + delete_extra_space + pynutil.insert("quantity: \"") + suffix + pynutil.insert("\"") + return res + + +class DecimalFst(GraphFst): + """ + Finite state transducer for classifying decimal + Decimal point is either "." or ",", determined by whether "ponto" or "vírgula" is spoken. + e.g. menos um vírgula dois seis -> decimal { negative: "true" integer_part: "1" morphosyntactic_features: "," fractional_part: "26" } + e.g. menos um ponto dois seis -> decimal { negative: "true" integer_part: "1" morphosyntactic_features: "." fractional_part: "26" } + + This decimal rule assumes that decimals can be pronounced as: + (a cardinal) + ('vírgula' or 'ponto') plus (any sequence of cardinals <1000, including 'zero') + + Also writes large numbers in shortened form, e.g. + e.g. um vírgula dois seis milhões -> decimal { negative: "false" integer_part: "1" morphosyntactic_features: "," fractional_part: "26" quantity: "milhões" } + e.g. dois milhões -> decimal { negative: "false" integer_part: "2" quantity: "milhões" } + e.g. 
mil oitocentos e vinte e quatro milhões -> decimal { negative: "false" integer_part: "1824" quantity: "milhões" } + Args: + cardinal: CardinalFst + + """ + + def __init__(self, cardinal: GraphFst): + super().__init__(name="decimal", kind="classify") + + # number after decimal point can be any series of cardinals <1000, including 'zero' + graph_decimal = cardinal.numbers_up_to_thousand + graph_decimal = pynini.closure(graph_decimal + delete_space) + graph_decimal + self.graph = graph_decimal + + # decimal point can be denoted by 'vírgula' or 'ponto' + decimal_point = pynini.cross("vírgula", "morphosyntactic_features: \",\"") + decimal_point |= pynini.cross("ponto", "morphosyntactic_features: \".\"") + + optional_graph_negative = pynini.closure( + pynutil.insert("negative: ") + pynini.cross("menos", "\"true\"") + delete_extra_space, 0, 1 + ) + + graph_fractional = pynutil.insert("fractional_part: \"") + graph_decimal + pynutil.insert("\"") + + cardinal_graph = cardinal.graph_no_exception | pynini.string_file(get_abs_path("data/numbers/zero.tsv")) + graph_integer = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"") + final_graph_wo_sign = ( + pynini.closure(graph_integer + delete_extra_space, 0, 1) + + decimal_point + + delete_extra_space + + graph_fractional + ) + final_graph = optional_graph_negative + final_graph_wo_sign + + self.final_graph_wo_negative = final_graph_wo_sign | get_quantity( + final_graph_wo_sign, cardinal.numbers_up_to_million + ) + final_graph |= optional_graph_negative + get_quantity(final_graph_wo_sign, cardinal.numbers_up_to_million) + final_graph = self.add_tokens(final_graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/pt/taggers/electronic.py b/nemo_text_processing/inverse_text_normalization/pt/taggers/electronic.py new file mode 100644 index 000000000000..aa152b116a20 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/taggers/electronic.py @@ -0,0 +1,96 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from nemo_text_processing.inverse_text_normalization.pt.utils import get_abs_path +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_ALPHA, GraphFst, insert_space +from pynini.lib import pynutil + + +class ElectronicFst(GraphFst): + """ + Finite state transducer for classifying 'electronic' semiotic classes, i.e. + email addresses (which get converted to "username" and "domain" fields), + and URLs (which get converted to a "protocol" field). + e.g. c d f um arroba a b c ponto e d u -> tokens { electronic { username: "cdf1" domain: "abc.edu" } } + e.g.
dáblio dáblio dáblio a b c ponto e d u -> tokens { electronic { protocol: "www.abc.edu" } } + """ + + def __init__(self): + super().__init__(name="electronic", kind="classify") + + delete_extra_space = pynutil.delete(" ") + alpha_num = ( + NEMO_ALPHA + | pynini.string_file(get_abs_path("data/numbers/digit.tsv")) + | pynini.string_file(get_abs_path("data/numbers/zero.tsv")) + ) + + symbols = pynini.string_file(get_abs_path("data/electronic/symbols.tsv")).invert() + + accepted_username = alpha_num | symbols + process_dot = pynini.cross("ponto", ".") + username = ( + pynutil.insert("username: \"") + + alpha_num + + delete_extra_space + + pynini.closure(accepted_username + delete_extra_space) + + alpha_num + + pynutil.insert("\"") + ) + single_alphanum = pynini.closure(alpha_num + delete_extra_space) + alpha_num + server = single_alphanum | pynini.string_file(get_abs_path("data/electronic/server_name.tsv")).invert() + domain = single_alphanum | pynini.string_file(get_abs_path("data/electronic/domain.tsv")).invert() + domain_graph = ( + pynutil.insert("domain: \"") + + server + + delete_extra_space + + process_dot + + delete_extra_space + + domain + + pynutil.insert("\"") + ) + graph = ( + username + delete_extra_space + pynutil.delete("arroba") + insert_space + delete_extra_space + domain_graph + ) + + ############# url ### + protocol_end = pynini.cross(pynini.union("www", "w w w", "dáblio dáblio dáblio"), "www") + protocol_start = pynini.cross(pynini.union("http", "h t t p", "agá tê tê pê"), "http") + protocol_start |= pynini.cross(pynini.union("https", "h t t p s", "agá tê tê pê ésse"), "https") + protocol_start += pynini.cross(" dois pontos barra barra ", "://") + + # e.g. .com, .es + ending = ( + delete_extra_space + + symbols + + delete_extra_space + + (domain | pynini.closure(accepted_username + delete_extra_space) + accepted_username) + ) + + protocol = ( + pynini.closure(protocol_start, 0, 1) + + protocol_end + + delete_extra_space + + process_dot + + delete_extra_space + + (pynini.closure(delete_extra_space + accepted_username, 1) | server) + + pynini.closure(ending, 1) + ) + protocol = pynutil.insert("protocol: \"") + protocol + pynutil.insert("\"") + graph |= protocol + ######## + + final_graph = self.add_tokens(graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/pt/taggers/measure.py b/nemo_text_processing/inverse_text_normalization/pt/taggers/measure.py new file mode 100644 index 000000000000..7b6f1015ad79 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/taggers/measure.py @@ -0,0 +1,95 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pynini +from nemo_text_processing.inverse_text_normalization.pt.utils import get_abs_path +from nemo_text_processing.text_normalization.en.graph_utils import ( + NEMO_SIGMA, + GraphFst, + convert_space, + delete_extra_space, + delete_space, +) +from pynini.lib import pynutil + + +class MeasureFst(GraphFst): + """ + Finite state transducer for classifying measure + e.g. menos doze quilogramas -> measure { cardinal { negative: "true" integer: "12" } units: "kg" } + + Args: + cardinal: CardinalFst + decimal: DecimalFst + """ + + def __init__(self, cardinal: GraphFst, decimal: GraphFst): + super().__init__(name="measure", kind="classify") + + cardinal_graph = cardinal.graph_no_exception + + graph_unit_singular = pynini.string_file(get_abs_path("data/measurements_singular.tsv")).invert() + graph_unit_plural = pynini.string_file(get_abs_path("data/measurements_plural.tsv")).invert() + + optional_graph_negative = pynini.closure( + pynutil.insert("negative: ") + pynini.cross("menos", "\"true\"") + delete_extra_space, 0, 1 + ) + + unit_singular = convert_space(graph_unit_singular) + unit_plural = convert_space(graph_unit_plural) + unit_misc = pynutil.insert("/") + pynutil.delete("por") + delete_space + convert_space(graph_unit_singular) + + unit_singular = ( + pynutil.insert("units: \"") + + (unit_singular | unit_misc | pynutil.add_weight(unit_singular + delete_space + unit_misc, 0.01)) + + pynutil.insert("\"") + ) + unit_plural = ( + pynutil.insert("units: \"") + + (unit_plural | unit_misc | pynutil.add_weight(unit_plural + delete_space + unit_misc, 0.01)) + + pynutil.insert("\"") + ) + + subgraph_decimal = ( + pynutil.insert("decimal { ") + + optional_graph_negative + + decimal.final_graph_wo_negative + + pynutil.insert(" }") + + delete_extra_space + + unit_plural + ) + subgraph_cardinal = ( + pynutil.insert("cardinal { ") + + optional_graph_negative + + pynutil.insert("integer: \"") + + ((NEMO_SIGMA - "um" - "uma") @ cardinal_graph) + + pynutil.insert("\"") + + pynutil.insert(" }") + + delete_extra_space + + unit_plural + ) + subgraph_cardinal |= ( + pynutil.insert("cardinal { ") + + optional_graph_negative + + pynutil.insert("integer: \"") + + (pynini.cross("um", "1") | pynini.cross("uma", "1")) + + pynutil.insert("\"") + + pynutil.insert(" }") + + delete_extra_space + + unit_singular + ) + + final_graph = subgraph_decimal | subgraph_cardinal + final_graph = self.add_tokens(final_graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/pt/taggers/money.py b/nemo_text_processing/inverse_text_normalization/pt/taggers/money.py new file mode 100644 index 000000000000..cc3639438b83 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/taggers/money.py @@ -0,0 +1,127 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pynini +from nemo_text_processing.inverse_text_normalization.pt.utils import get_abs_path +from nemo_text_processing.text_normalization.en.graph_utils import ( + NEMO_DIGIT, + NEMO_SIGMA, + GraphFst, + convert_space, + delete_extra_space, + delete_space, + insert_space, +) +from pynini.lib import pynutil + + +class MoneyFst(GraphFst): + """ + Finite state transducer for classifying money + e.g. doze dólares e cinco centavos -> money { integer_part: "12" fractional_part: "05" currency: "$" } + + Args: + cardinal: CardinalFst + decimal: DecimalFst + """ + + def __init__(self, cardinal: GraphFst, decimal: GraphFst): + super().__init__(name="money", kind="classify") + # quantity, integer_part, fractional_part, currency + + cardinal_graph = cardinal.graph_no_exception + graph_decimal_final = decimal.final_graph_wo_negative + + unit_singular = pynini.string_file(get_abs_path("data/currency_singular.tsv")).invert() + unit_plural = pynini.string_file(get_abs_path("data/currency_plural.tsv")).invert() + + graph_unit_singular = pynutil.insert("currency: \"") + convert_space(unit_singular) + pynutil.insert("\"") + graph_unit_plural = pynutil.insert("currency: \"") + convert_space(unit_plural) + pynutil.insert("\"") + + add_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("0") + NEMO_DIGIT) + # twelve dollars (and) fifty cents, zero cents + cents_standalone = ( + pynutil.insert("morphosyntactic_features: \",\"") # always use a comma in the decimal + + insert_space + + pynutil.insert("fractional_part: \"") + + pynini.union( + pynutil.add_weight(((NEMO_SIGMA - "um" - "uma") @ cardinal_graph), -0.7) + @ add_leading_zero_to_double_digit + + delete_space + + pynutil.delete(pynini.union("centavos")), + pynini.cross("um", "01") + delete_space + pynutil.delete(pynini.union("centavo")), + ) + + pynutil.insert("\"") + ) + + optional_cents_standalone = pynini.closure( + delete_space + + pynini.closure((pynutil.delete("com") | pynutil.delete('e')) + delete_space, 0, 1) + + insert_space + + cents_standalone, + 0, + 1, + ) + + # twelve dollars fifty, only after integer + # setenta e cinco dólares com sessenta e três ~ $75,63 + optional_cents_suffix = pynini.closure( + delete_extra_space + + pynutil.insert("morphosyntactic_features: \",\"") # always use a comma in the decimal + + insert_space + + pynutil.insert("fractional_part: \"") + + pynini.closure((pynutil.delete("com") | pynutil.delete('e')) + delete_space, 0, 1) + + pynutil.add_weight(cardinal_graph @ add_leading_zero_to_double_digit, -0.7) + + pynutil.insert("\""), + 0, + 1, + ) + + graph_integer = ( + pynutil.insert("integer_part: \"") + + ((NEMO_SIGMA - "um" - "uma") @ cardinal_graph) + + pynutil.insert("\"") + + delete_extra_space + + graph_unit_plural + + (optional_cents_standalone | optional_cents_suffix) + ) + graph_integer |= ( + pynutil.insert("integer_part: \"") + + (pynini.cross("um", "1") | pynini.cross("uma", "1")) + + pynutil.insert("\"") + + delete_extra_space + + graph_unit_singular + + (optional_cents_standalone | optional_cents_suffix) + ) + + graph_cents_standalone = pynini.union( + pynutil.insert("currency: \"R$\" integer_part: \"0\" ") + cents_standalone, + pynutil.add_weight( + pynutil.insert("integer_part: \"0\" ") + + cents_standalone + + delete_extra_space + + pynutil.delete("de") + + delete_space + + graph_unit_singular, + -0.1, + ), + ) + + graph_decimal = ( + graph_decimal_final + delete_extra_space + (pynutil.delete("de") + delete_space).ques + graph_unit_plural + ) + graph_decimal |= 
graph_cents_standalone + final_graph = graph_integer | graph_decimal + final_graph = self.add_tokens(final_graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/pt/taggers/ordinal.py b/nemo_text_processing/inverse_text_normalization/pt/taggers/ordinal.py new file mode 100644 index 000000000000..ff7f3fbf02fa --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/taggers/ordinal.py @@ -0,0 +1,84 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from nemo_text_processing.inverse_text_normalization.pt.utils import get_abs_path +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, GraphFst, delete_space +from pynini.lib import pynutil + + +class OrdinalFst(GraphFst): + """ + Finite state transducer for classifying ordinal + vigésimo primeiro -> ordinal { integer: "21" morphosyntactic_features: "o" } + This class converts ordinals up to "milésimo" (one thousandth) exclusive. + + Ordinals below ten are not converted (in order to avoid + e.g. "primeiro fez ..." -> "1º fez...", "segunda guerra mundial" -> "2ª guerra mundial" + and any other odd conversions.) + + This FST also records the ending of the ordinal (called "morphosyntactic_features"): + either "o" or "a".
+ + Args: + cardinal: CardinalFst + """ + + def __init__(self): + super().__init__(name="ordinal", kind="classify") + + graph_digit = pynini.string_file(get_abs_path("data/ordinals/digit.tsv")) + graph_ties = pynini.string_file(get_abs_path("data/ordinals/ties.tsv")) + graph_hundreds = pynini.string_file(get_abs_path("data/ordinals/hundreds.tsv")) + + ordinal_graph_union = pynini.union( + pynutil.add_weight(graph_digit, 0.4), + pynutil.add_weight(graph_ties + ((delete_space + graph_digit) | pynutil.insert("0")), 0.2), + graph_hundreds + + ((delete_space + graph_ties) | pynutil.insert("0")) + + ((delete_space + graph_digit) | pynutil.insert("0")), + ) + + accept_o_endings = NEMO_SIGMA + pynini.accep("o") + accept_a_endings = NEMO_SIGMA + pynini.accep("a") + + ordinal_graph_o = accept_o_endings @ ordinal_graph_union + ordinal_graph_a = accept_a_endings @ ordinal_graph_union + + # 'optional_numbers_in_front' have negative weight so we always + # include them if they're there + optional_in_front = (pynutil.add_weight(ordinal_graph_union, -0.1) + delete_space.closure()).closure() + graph_o_suffix = optional_in_front + ordinal_graph_o + graph_a_suffix = optional_in_front + ordinal_graph_a + + # don't convert ordinals from one to nine inclusive + graph_exception = pynini.project(pynini.union(graph_digit), 'input') + graph_o_suffix = (pynini.project(graph_o_suffix, "input") - graph_exception.arcsort()) @ graph_o_suffix + graph_a_suffix = (pynini.project(graph_a_suffix, "input") - graph_exception.arcsort()) @ graph_a_suffix + + graph = ( + pynutil.insert("integer: \"") + + graph_o_suffix + + pynutil.insert("\"") + + pynutil.insert(" morphosyntactic_features: \"o\"") + ) + graph |= ( + pynutil.insert("integer: \"") + + graph_a_suffix + + pynutil.insert("\"") + + pynutil.insert(" morphosyntactic_features: \"a\"") + ) + + final_graph = self.add_tokens(graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/pt/taggers/punctuation.py b/nemo_text_processing/inverse_text_normalization/pt/taggers/punctuation.py new file mode 100644 index 000000000000..cb5285452954 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/taggers/punctuation.py @@ -0,0 +1,34 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from nemo_text_processing.text_normalization.en.graph_utils import GraphFst +from pynini.lib import pynutil + + +class PunctuationFst(GraphFst): + """ + Finite state transducer for classifying punctuation + e.g. 
a, -> tokens { name: "a" } tokens { name: "," } + """ + + def __init__(self): + super().__init__(name="punctuation", kind="classify") + + s = "!#$%&\'()*+,-./:;<=>?@^_`{|}~" + punct = pynini.union(*s) + + graph = pynutil.insert("name: \"") + punct + pynutil.insert("\"") + + self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/pt/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/pt/taggers/telephone.py new file mode 100755 index 000000000000..a1ad2d07585d --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/taggers/telephone.py @@ -0,0 +1,131 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from nemo_text_processing.inverse_text_normalization.pt.utils import get_abs_path +from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, delete_space, insert_space +from pynini.lib import pynutil + + +class TelephoneFst(GraphFst): + """ + Finite state transducer for classifying telephone numbers, e.g. + um dois um dois três quatro cinco seis sete oito nove -> { number_part: "(12) 12345-6789" }. + If 11 digits are spoken, they are grouped as 2+5+4 (eg. (12) 34567-8901). + If 10 digits are spoken, they are grouped as 2+4+4 (eg. (12) 3456-7890). + If 9 digits are spoken, they are grouped as 5+4 (eg. 12345-6789). + If 8 digits are spoken, they are grouped as 4+4 (eg. 1234-5678). + In portuguese, digits are generally spoken individually, or as 2-digit numbers, + eg. "trinta e quatro oitenta e dois" = "3482", + "meia sete vinte" = "6720". 
+ """ + + def __init__(self): + super().__init__(name="telephone", kind="classify") + + # create `single_digits` and `double_digits` graphs as these will be + # the building blocks of possible telephone numbers + graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) + graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv")) + graph_twenties = pynini.string_file(get_abs_path("data/numbers/twenties.tsv")) + graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv")) + graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv")) + graph_half = pynini.cross("meia", "6") + + graph_all_digits = pynini.union(graph_digit, graph_half, graph_zero) + + single_digits = pynini.invert(graph_all_digits).optimize() + + double_digits = ( + pynini.union( + graph_teen | graph_twenties, + (graph_ties + pynutil.insert("0")), + (graph_ties + delete_space + pynutil.delete("e") + delete_space + graph_digit), + (graph_all_digits + delete_space + graph_all_digits), + ) + .invert() + .optimize() + ) + + # define `eleven_digit_graph`, `ten_digit_graph`, `nine_digit_graph`, `eight_digit_graph` + # which accept telephone numbers spoken (1) only with single digits, + # or (2) spoken with double digits (and sometimes single digits) + + # 11-digit option (2): (2) + (1+2+2) + (2+2) digits + eleven_digit_graph = ( + pynutil.delete("(") + + double_digits + + insert_space + + pynutil.delete(") ") + + single_digits + + insert_space + + double_digits + + insert_space + + double_digits + + insert_space + + pynutil.delete("-") + + double_digits + + insert_space + + double_digits + ) + + # 10-digit option (2): (2) + (2+2) + (2+2) digits + ten_digit_graph = ( + pynutil.delete("(") + + double_digits + + insert_space + + pynutil.delete(") ") + + double_digits + + insert_space + + double_digits + + insert_space + + pynutil.delete("-") + + double_digits + + insert_space + + double_digits + ) + + # 9-digit option (2): (1+2+2) + (2+2) digits + nine_digit_graph = ( + single_digits + + insert_space + + double_digits + + insert_space + + double_digits + + insert_space + + pynutil.delete("-") + + double_digits + + insert_space + + double_digits + ) + + # 8-digit option (2): (2+2) + (2+2) digits + eight_digit_graph = ( + double_digits + + insert_space + + double_digits + + insert_space + + pynutil.delete("-") + + double_digits + + insert_space + + double_digits + ) + + number_part = pynini.union(eleven_digit_graph, ten_digit_graph, nine_digit_graph, eight_digit_graph) + + number_part = pynutil.insert("number_part: \"") + pynini.invert(number_part) + pynutil.insert("\"") + + graph = number_part + final_graph = self.add_tokens(graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/pt/taggers/time.py b/nemo_text_processing/inverse_text_normalization/pt/taggers/time.py new file mode 100755 index 000000000000..e669abd11836 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/taggers/time.py @@ -0,0 +1,182 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from nemo_text_processing.inverse_text_normalization.pt.utils import get_abs_path +from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, delete_space, insert_space +from pynini.lib import pynutil + + +class TimeFst(GraphFst): + """ + Finite state transducer for classifying time + e.g. quinze pro meio dia -> time { hours: "11" minutes: "45" } + e.g. quinze pra meia noite -> time { hours: "23" minutes: "45" } + e.g. quinze pra uma -> time { hours: "12" minutes: "45" } + e.g. dez pras duas -> time { hours: "1" minutes: "50" } + e.g. quinze pras duas -> time { hours: "1" minutes: "45" } + e.g. ao meio dia -> time { hours: "12" minutes: "00" morphosyntactic_features: "ao" } + e.g. ao meio dia e meia -> time { hours: "12" minutes: "30" morphosyntactic_features: "ao" } + e.g. ao meio dia e meio -> time { hours: "12" minutes: "30" morphosyntactic_features: "ao" } + e.g. à meia noite e quinze -> time { hours: "0" minutes: "15" morphosyntactic_features: "à" } + e.g. à meia noite e meia -> time { hours: "0" minutes: "30" morphosyntactic_features: "à" } + e.g. à uma e trinta -> time { hours: "1" minutes: "30" morphosyntactic_features: "à" } + e.g. às onze e trinta -> time { hours: "11" minutes: "30" morphosyntactic_features: "às" } + e.g. às três horas e trinta minutos -> time { hours: "3" minutes: "30" morphosyntactic_features: "às" } + """ + + def __init__(self): + super().__init__(name="time", kind="classify") + + # graph_hour_to_am = pynini.string_file(get_abs_path("data/time/hour_to_am.tsv")) + # graph_hour_to_pm = pynini.string_file(get_abs_path("data/time/hour_to_pm.tsv")) + graph_hours_to = pynini.string_file(get_abs_path("data/time/hours_to.tsv")) + graph_minutes_to = pynini.string_file(get_abs_path("data/time/minutes_to.tsv")) + graph_suffix_am = pynini.string_file(get_abs_path("data/time/time_suffix_am.tsv")) + graph_suffix_pm = pynini.string_file(get_abs_path("data/time/time_suffix_pm.tsv")) + + graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) + graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv")) + graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv")) + graph_twenties = pynini.string_file(get_abs_path("data/numbers/twenties.tsv")) + + graph_1_to_100 = pynini.union( + graph_digit, + graph_twenties, + graph_teen, + (graph_ties + pynutil.insert("0")), + (graph_ties + pynutil.delete(" e ") + graph_digit), + ) + + # note that graph_hour will start from 2 hours + # "1 o'clock" will be treated differently because it + # is singular + digits_2_to_23 = [str(digits) for digits in range(2, 24)] + digits_1_to_59 = [str(digits) for digits in range(1, 60)] + + graph_2_to_23 = graph_1_to_100 @ pynini.union(*digits_2_to_23) + graph_1_to_59 = graph_1_to_100 @ pynini.union(*digits_1_to_59) + graph_uma = pynini.cross("uma", "1") + + # Mapping 'horas' + graph_hour = pynutil.delete(pynini.accep("hora") + pynini.accep("s").ques) + graph_minute = pynutil.delete(pynini.accep("minuto") + pynini.accep("s").ques) + + # Mapping 'meio dia' and 'meia noite' + graph_meio_dia = pynini.cross("meio dia", "12") + graph_meia_noite = pynini.cross("meia noite", "0") + + # Mapping 'e meia' + graph_e = delete_space + pynutil.delete(" e ") + delete_space + graph_e_meia = graph_e + pynini.cross("meia", "30") + graph_e_meio = graph_e + pynini.cross("meio", "30") + + # à uma hora -> 1:00 + # às três e meia -> 3:30 + 
graph_hours_at_singular = ( + pynutil.insert("morphosyntactic_features: \"") + + (pynini.cross("à", "à") | pynini.cross("a", "à")) + + pynutil.insert("\" ") + + delete_space + ) + graph_hours_at_singular += ( + pynutil.insert("hours: \"") + graph_uma + pynutil.insert("\"") + (delete_space + graph_hour).ques + ) + graph_hours_at_plural = ( + pynutil.insert("morphosyntactic_features: \"") + + (pynini.cross("às", "às") | pynini.cross("as", "às")) + + pynutil.insert("\" ") + + delete_space + ) + graph_hours_at_plural += ( + pynutil.insert("hours: \"") + graph_2_to_23 + pynutil.insert("\"") + (delete_space + graph_hour).ques + ) + final_graph_hour_at = graph_hours_at_singular | graph_hours_at_plural + + graph_minutes_component_without_zero = graph_e + graph_1_to_59 + (delete_space + graph_minute).ques + graph_minutes_component_without_zero |= graph_e_meia + pynutil.delete(delete_space + pynini.accep("hora")).ques + graph_minutes_component = graph_minutes_component_without_zero | pynutil.insert("00", weight=0.1) + final_graph_minute = pynutil.insert(" minutes: \"") + graph_minutes_component + pynutil.insert("\"") + + graph_hm = final_graph_hour_at + final_graph_minute + + # meio dia e meia -> 12:30 + # meia noite e meia -> 0:30 + graph_minutes_without_zero = ( + pynutil.insert(" minutes: \"") + graph_minutes_component_without_zero + pynutil.insert("\"") + ) + graph_meio_min = ( + pynutil.insert("hours: \"") + + (graph_meio_dia | graph_meia_noite) + + pynutil.insert("\"") + + graph_minutes_without_zero + ) + graph_meio_min |= ( + pynutil.insert("hours: \"") + + graph_meio_dia + + pynutil.insert("\" minutes: \"") + + graph_e_meio + + pynutil.insert("\"") + ) + graph_hm |= graph_meio_min + + # às quinze para as quatro -> às 3:45 + # NOTE: case 'para à uma' ('to one') could be either 0:XX or 12:XX + # leading to wrong reading ('meio dia e ...' 
or 'meia noite e ...') + graph_para_a = ( + pynutil.delete("para") + | pynutil.delete("para a") + | pynutil.delete("para as") + | pynutil.delete("pra") + | pynutil.delete("pras") + ) + graph_para_o = pynutil.delete("para") | pynutil.delete("para o") | pynutil.delete("pro") + + graph_pra_min = ( + pynutil.insert("morphosyntactic_features: \"") + + (pynini.cross("à", "à") | pynini.cross("às", "às") | pynini.cross("a", "à") | pynini.cross("as", "às")) + + pynutil.insert("\" ") + + delete_space + ) + graph_pra_min += ( + pynutil.insert("minutes: \"") + + (graph_1_to_59 @ graph_minutes_to) + + pynutil.insert("\" ") + + (delete_space + graph_minute).ques + ) + graph_pra_hour = ( + pynutil.insert("hours: \"") + + (graph_2_to_23 @ graph_hours_to) + + pynutil.insert("\"") + + (delete_space + graph_hour).ques + ) + graph_pra_hour |= pynutil.insert("hours: \"") + (graph_meia_noite @ graph_hours_to) + pynutil.insert("\"") + + graph_pra = graph_pra_min + delete_space + graph_para_a + delete_space + graph_pra_hour + + # às quinze pro meio dia -> às 11:45 + graph_pro = graph_pra_min + delete_space + graph_para_o + delete_space + graph_pro += pynutil.insert(" hours: \"") + (graph_meio_dia @ graph_hours_to) + pynutil.insert("\"") + + graph_mh = graph_pra | graph_pro + + # optional suffix + final_suffix = pynutil.insert("suffix: \"") + (graph_suffix_am | graph_suffix_pm) + pynutil.insert("\"") + final_suffix_optional = pynini.closure(delete_space + insert_space + final_suffix, 0, 1) + + final_graph = pynini.union((graph_hm | graph_mh) + final_suffix_optional).optimize() + + final_graph = self.add_tokens(final_graph) + + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/pt/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/pt/taggers/tokenize_and_classify.py new file mode 100644 index 000000000000..f4396645828d --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/taggers/tokenize_and_classify.py @@ -0,0 +1,110 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
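A minimal sketch of how the "pra/pro" branches above resolve a time such as "quinze pras duas" to 1:45: the spoken minutes and hours are composed with the minutes-to / hours-to lookup maps. The in-line maps below are hypothetical stand-ins for a few rows of data/time/minutes_to.tsv and data/time/hours_to.tsv, which the real grammar loads with pynini.string_file; the snippet is an illustration only, not part of the diff.

import pynini

# Hypothetical stand-ins for a few rows of minutes_to.tsv / hours_to.tsv;
# the real tagger loads the full files with pynini.string_file.
spoken_minutes = pynini.string_map([("quinze", "15"), ("dez", "10")])
minutes_to = pynini.string_map([("15", "45"), ("10", "50")])      # 60 - m
spoken_hours = pynini.string_map([("duas", "2"), ("meia noite", "0")])
hours_to = pynini.string_map([("2", "1"), ("0", "23")])           # h - 1 (mod 24)

minutes_part = spoken_minutes @ minutes_to   # "quinze" -> "45"
hours_part = spoken_hours @ hours_to         # "duas"   -> "1"

out = pynini.shortestpath(pynini.accep("quinze") @ minutes_part)
print(pynini.project(out, "output").string())   # 45
out = pynini.shortestpath(pynini.accep("duas") @ hours_part)
print(pynini.project(out, "output").string())   # 1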
+ +import os + +import pynini +from nemo_text_processing.inverse_text_normalization.pt.taggers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.pt.taggers.date import DateFst +from nemo_text_processing.inverse_text_normalization.pt.taggers.decimal import DecimalFst +from nemo_text_processing.inverse_text_normalization.pt.taggers.electronic import ElectronicFst +from nemo_text_processing.inverse_text_normalization.pt.taggers.measure import MeasureFst +from nemo_text_processing.inverse_text_normalization.pt.taggers.money import MoneyFst +from nemo_text_processing.inverse_text_normalization.pt.taggers.ordinal import OrdinalFst +from nemo_text_processing.inverse_text_normalization.pt.taggers.punctuation import PunctuationFst +from nemo_text_processing.inverse_text_normalization.pt.taggers.telephone import TelephoneFst +from nemo_text_processing.inverse_text_normalization.pt.taggers.time import TimeFst +from nemo_text_processing.inverse_text_normalization.pt.taggers.whitelist import WhiteListFst +from nemo_text_processing.inverse_text_normalization.pt.taggers.word import WordFst +from nemo_text_processing.text_normalization.en.graph_utils import ( + GraphFst, + delete_extra_space, + delete_space, + generator_main, +) +from pynini.lib import pynutil + +from nemo.utils import logging + + +class ClassifyFst(GraphFst): + """ + Final class that composes all other classification grammars. This class can process an entire sentence, that is lower cased. + For deployment, this grammar will be compiled and exported to OpenFst Finate State Archiv (FAR) File. + More details to deployment at NeMo/tools/text_processing_deployment. + + Args: + cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache. + overwrite_cache: set to True to overwrite .far files + """ + + def __init__(self, cache_dir: str = None, overwrite_cache: bool = False): + super().__init__(name="tokenize_and_classify", kind="classify") + + far_file = None + if cache_dir is not None and cache_dir != "None": + os.makedirs(cache_dir, exist_ok=True) + far_file = os.path.join(cache_dir, "_pt_itn.far") + if not overwrite_cache and far_file and os.path.exists(far_file): + self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] + logging.info(f"ClassifyFst.fst was restored from {far_file}.") + else: + logging.info(f"Creating ClassifyFst grammars.") + + cardinal = CardinalFst(use_strict_e=True) + cardinal_graph = cardinal.fst + + ordinal_graph = OrdinalFst().fst + + decimal = DecimalFst(cardinal) + decimal_graph = decimal.fst + + measure_graph = MeasureFst(cardinal=cardinal, decimal=decimal).fst + date_graph = DateFst().fst + word_graph = WordFst().fst + time_graph = TimeFst().fst + money_graph = MoneyFst(cardinal=cardinal, decimal=decimal).fst + whitelist_graph = WhiteListFst().fst + punct_graph = PunctuationFst().fst + electronic_graph = ElectronicFst().fst + telephone_graph = TelephoneFst().fst + + classify = ( + pynutil.add_weight(whitelist_graph, 1.01) + | pynutil.add_weight(time_graph, 1.1) + | pynutil.add_weight(date_graph, 1.09) + | pynutil.add_weight(decimal_graph, 1.09) + | pynutil.add_weight(measure_graph, 1.1) + | pynutil.add_weight(cardinal_graph, 1.1) + | pynutil.add_weight(ordinal_graph, 1.1) + | pynutil.add_weight(money_graph, 1.1) + | pynutil.add_weight(telephone_graph, 1.1) + | pynutil.add_weight(electronic_graph, 1.1) + | pynutil.add_weight(word_graph, 100) + ) + + punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" 
}") + token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }") + token_plus_punct = ( + pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct) + ) + + graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct) + graph = delete_space + graph + delete_space + + self.fst = graph.optimize() + + if far_file: + generator_main(far_file, {"tokenize_and_classify": self.fst}) + logging.info(f"ClassifyFst grammars are saved to {far_file}.") diff --git a/nemo_text_processing/inverse_text_normalization/pt/taggers/whitelist.py b/nemo_text_processing/inverse_text_normalization/pt/taggers/whitelist.py new file mode 100644 index 000000000000..6965ccbb8e70 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/taggers/whitelist.py @@ -0,0 +1,33 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from nemo_text_processing.inverse_text_normalization.pt.utils import get_abs_path +from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, convert_space +from pynini.lib import pynutil + + +class WhiteListFst(GraphFst): + """ + Finite state transducer for classifying whitelisted tokens + e.g. usted -> tokens { name: "ud." } + This class has highest priority among all classifier grammars. Whitelisted tokens are defined and loaded from "data/whitelist.tsv". + """ + + def __init__(self): + super().__init__(name="whitelist", kind="classify") + + whitelist = pynini.string_file(get_abs_path("data/whitelist.tsv")).invert() + graph = pynutil.insert("name: \"") + convert_space(whitelist) + pynutil.insert("\"") + self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/pt/taggers/word.py b/nemo_text_processing/inverse_text_normalization/pt/taggers/word.py new file mode 100644 index 000000000000..7908397d52ad --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/taggers/word.py @@ -0,0 +1,29 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE, GraphFst +from pynini.lib import pynutil + + +class WordFst(GraphFst): + """ + Finite state transducer for classifying plain tokens, that do not belong to any special class. This can be considered as the default class. + e.g. 
sleep -> tokens { name: "sleep" } + """ + + def __init__(self): + super().__init__(name="word", kind="classify") + word = pynutil.insert("name: \"") + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert("\"") + self.fst = word.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/pt/utils.py b/nemo_text_processing/inverse_text_normalization/pt/utils.py new file mode 100644 index 000000000000..a73b7d9ddb39 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/utils.py @@ -0,0 +1,27 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + + +def get_abs_path(rel_path): + """ + Get absolute path + + Args: + rel_path: relative path to this file + + Returns absolute path + """ + return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path diff --git a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/__init__.py b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/__init__.py new file mode 100644 index 000000000000..a1cf281f0908 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/cardinal.py new file mode 100644 index 000000000000..928a259d3897 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/cardinal.py @@ -0,0 +1,48 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space +from pynini.lib import pynutil + + +class CardinalFst(GraphFst): + """ + Finite state transducer for verbalizing cardinal + e.g. 
cardinal { negative: "-" integer: "23" } -> -23 + """ + + def __init__(self): + super().__init__(name="cardinal", kind="verbalize") + optional_sign = pynini.closure( + pynutil.delete("negative:") + + delete_space + + pynutil.delete("\"") + + NEMO_NOT_QUOTE + + pynutil.delete("\"") + + delete_space, + 0, + 1, + ) + graph = ( + pynutil.delete("integer:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + ) + self.numbers = graph + graph = optional_sign + graph + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/date.py b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/date.py new file mode 100644 index 000000000000..ee9db8fa7e1e --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/date.py @@ -0,0 +1,65 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from nemo_text_processing.text_normalization.en.graph_utils import ( + NEMO_NOT_QUOTE, + GraphFst, + delete_extra_space, + delete_space, + insert_space, +) +from pynini.lib import pynutil + + +class DateFst(GraphFst): + """ + Finite state transducer for verbalizing date, e.g. + date { day: "1" month: "enero" preserve_order: true } -> 1 de enero + """ + + def __init__(self): + super().__init__(name="date", kind="verbalize") + month = ( + pynutil.delete("month:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + ) + day = ( + pynutil.delete("day:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + ) + + # day month + graph_dm = day + delete_extra_space + pynutil.insert("de") + insert_space + month + + optional_preserve_order = pynini.closure( + pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true") + delete_space + | pynutil.delete("field_order:") + + delete_space + + pynutil.delete("\"") + + NEMO_NOT_QUOTE + + pynutil.delete("\"") + + delete_space + ) + + final_graph = graph_dm + delete_space + optional_preserve_order + + delete_tokens = self.delete_tokens(final_graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/decimal.py b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/decimal.py new file mode 100644 index 000000000000..58fc76ea63e6 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/decimal.py @@ -0,0 +1,66 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space +from pynini.lib import pynutil + + +class DecimalFst(GraphFst): + """ + Finite state transducer for verbalizing decimal, + e.g. decimal { negative: "true" integer_part: "1" morphosyntactic_features: "," fractional_part: "26" } -> -1,26 + e.g. decimal { negative: "true" integer_part: "1" morphosyntactic_features: "." fractional_part: "26" } -> -1.26 + e.g. decimal { negative: "false" integer_part: "1" morphosyntactic_features: "," fractional_part: "26" quantity: "millón" } -> 1,26 millón + e.g. decimal { negative: "false" integer_part: "2" quantity: "millones" } -> 2 millones + """ + + def __init__(self): + super().__init__(name="decimal", kind="verbalize") + optionl_sign = pynini.closure(pynini.cross("negative: \"true\"", "-") + delete_space, 0, 1) + integer = ( + pynutil.delete("integer_part:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + ) + optional_integer = pynini.closure(integer + delete_space, 0, 1) + + decimal_point = pynini.cross("morphosyntactic_features: \",\"", ",") + decimal_point |= pynini.cross("morphosyntactic_features: \".\"", ".") + + fractional = ( + decimal_point + + delete_space + + pynutil.delete("fractional_part:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + ) + optional_fractional = pynini.closure(fractional + delete_space, 0, 1) + quantity = ( + pynutil.delete("quantity:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + ) + optional_quantity = pynini.closure(pynutil.insert(" ") + quantity + delete_space, 0, 1) + graph = optional_integer + optional_fractional + optional_quantity + self.numbers = graph + graph = optionl_sign + graph + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/electronic.py b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/electronic.py new file mode 100644 index 000000000000..11b2706a3562 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/electronic.py @@ -0,0 +1,55 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
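Every verbalizer in this PR strips a serialized field the same way: delete the field name, delete the surrounding quotes, and keep only the value between them. A self-contained toy sketch of that idiom (not part of the diff; not_quote and maybe_space below are local stand-ins for NEMO_NOT_QUOTE and delete_space from graph_utils):

import pynini
from pynini.lib import pynutil

# Local stand-ins, for illustration only.
not_quote = pynini.closure(pynini.union(*"0123456789,"), 1)
maybe_space = pynini.closure(pynutil.delete(" "), 0, 1)

integer = (
    pynutil.delete("integer_part:") + maybe_space + pynutil.delete("\"") + not_quote + pynutil.delete("\"")
)

out = pynini.shortestpath(pynini.accep("integer_part: \"1\"") @ integer)
print(pynini.project(out, "output").string())   # 1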
+ +import pynini +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space +from pynini.lib import pynutil + + +class ElectronicFst(GraphFst): + """ + Finite state transducer for verbalizing electronic + e.g. tokens { electronic { username: "cdf1" domain: "abc.edu" } } -> cdf1@abc.edu + e.g. tokens { electronic { protocol: "www.abc.edu" } } -> www.abc.edu + """ + + def __init__(self): + super().__init__(name="electronic", kind="verbalize") + user_name = ( + pynutil.delete("username:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + ) + domain = ( + pynutil.delete("domain:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + ) + + protocol = ( + pynutil.delete("protocol:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + ) + + graph = user_name + delete_space + pynutil.insert("@") + domain + graph |= protocol + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/measure.py b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/measure.py new file mode 100644 index 000000000000..057ade696d11 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/measure.py @@ -0,0 +1,61 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, GraphFst, delete_space +from pynini.lib import pynutil + + +class MeasureFst(GraphFst): + """ + Finite state transducer for verbalizing measure, e.g. 
+ measure { cardinal { negative: "true" integer: "12" } units: "kg" } -> -12 kg + + Args: + decimal: DecimalFst + cardinal: CardinalFst + """ + + def __init__(self, decimal: GraphFst, cardinal: GraphFst): + super().__init__(name="measure", kind="verbalize") + optional_sign = pynini.closure(pynini.cross("negative: \"true\"", "-"), 0, 1) + unit = ( + pynutil.delete("units:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_CHAR - " ", 1) + + pynutil.delete("\"") + + delete_space + ) + graph_decimal = ( + pynutil.delete("decimal {") + + delete_space + + optional_sign + + delete_space + + decimal.numbers + + delete_space + + pynutil.delete("}") + ) + graph_cardinal = ( + pynutil.delete("cardinal {") + + delete_space + + optional_sign + + delete_space + + cardinal.numbers + + delete_space + + pynutil.delete("}") + ) + graph = (graph_cardinal | graph_decimal) + delete_space + pynutil.insert(" ") + unit + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/money.py b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/money.py new file mode 100644 index 000000000000..54a9b1038337 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/money.py @@ -0,0 +1,40 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, GraphFst, delete_space, insert_space +from pynini.lib import pynutil + + +class MoneyFst(GraphFst): + """ + Finite state transducer for verbalizing money, e.g. + money { integer_part: "12" morphosyntactic_features: "," fractional_part: "05" currency: "$" } -> $12,05 + + Args: + decimal: DecimalFst + """ + + def __init__(self, decimal: GraphFst): + super().__init__(name="money", kind="verbalize") + unit = ( + pynutil.delete("currency:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_CHAR - " ", 1) + + pynutil.delete("\"") + ) + graph = unit + delete_space + insert_space + decimal.numbers + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/ordinal.py b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/ordinal.py new file mode 100644 index 000000000000..fe3454e15e71 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/ordinal.py @@ -0,0 +1,44 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space +from pynini.lib import pynutil + + +class OrdinalFst(GraphFst): + """ + Finite state transducer for verbalizing ordinal, e.g. + ordinal { integer: "13" morphosyntactic_features: "o" } -> 13º + """ + + def __init__(self): + super().__init__(name="ordinal", kind="verbalize") + graph = ( + pynutil.delete("integer:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + ) + + replace_suffix = pynini.union( + pynini.cross(" morphosyntactic_features: \"o\"", "º"), + pynini.cross(" morphosyntactic_features: \"a\"", "ª"), + ) + + graph = graph + replace_suffix + + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/telephone.py b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/telephone.py new file mode 100644 index 000000000000..4dd0d7079889 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/telephone.py @@ -0,0 +1,32 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst +from pynini.lib import pynutil + + +class TelephoneFst(GraphFst): + """ + Finite state transducer for verbalizing telephone, e.g. + telephone { number_part: "123-123-5678" } + -> 123-123-5678 + """ + + def __init__(self): + super().__init__(name="telephone", kind="verbalize") + + number_part = pynutil.delete("number_part: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + delete_tokens = self.delete_tokens(number_part) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/time.py new file mode 100755 index 000000000000..b1a04c673752 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/time.py @@ -0,0 +1,82 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from nemo_text_processing.text_normalization.en.graph_utils import ( + NEMO_DIGIT, + NEMO_NOT_QUOTE, + GraphFst, + delete_space, + insert_space, +) +from pynini.lib import pynutil + + +class TimeFst(GraphFst): + """ + Finite state transducer for verbalizing time, + e.g. time { hours: "à 1" minutes: "10" } -> à 1:10 + e.g. time { hours: "às 2" minutes: "45" } -> às 2:45 + """ + + def __init__(self): + super().__init__(name="time", kind="verbalize") + add_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("0") + NEMO_DIGIT) + + prefix = ( + pynutil.delete("morphosyntactic_features:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + + delete_space + + insert_space + ) + optional_prefix = pynini.closure(prefix, 0, 1) + + hour = ( + pynutil.delete("hours:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_DIGIT, 1) + + pynutil.delete("\"") + ) + minute = ( + pynutil.delete("minutes:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_DIGIT, 1) + + pynutil.delete("\"") + ) + suffix = ( + delete_space + + insert_space + + pynutil.delete("suffix:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + ) + optional_suffix = pynini.closure(suffix, 0, 1) + + graph = ( + optional_prefix + + hour + + delete_space + + pynutil.insert(":") + + (minute @ add_leading_zero_to_double_digit) + + optional_suffix + ) + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/verbalize.py new file mode 100644 index 000000000000..88c04991b5f4 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/verbalize.py @@ -0,0 +1,62 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
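The time verbalizer above pads single-digit minutes through the (NEMO_DIGIT + NEMO_DIGIT) | (pynutil.insert("0") + NEMO_DIGIT) composition, so a minutes value of "5" is written as "05" while "45" passes through unchanged. A self-contained sketch of just that padding step (not part of the diff; digit below is a local stand-in for NEMO_DIGIT):

import pynini
from pynini.lib import pynutil

digit = pynini.union(*"0123456789")   # local stand-in for NEMO_DIGIT
add_leading_zero_to_double_digit = (digit + digit) | (pynutil.insert("0") + digit)

for minutes in ("5", "45"):
    out = pynini.shortestpath(pynini.accep(minutes) @ add_leading_zero_to_double_digit)
    print(pynini.project(out, "output").string())   # 05, then 45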
+ +from nemo_text_processing.inverse_text_normalization.pt.verbalizers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.pt.verbalizers.date import DateFst +from nemo_text_processing.inverse_text_normalization.pt.verbalizers.decimal import DecimalFst +from nemo_text_processing.inverse_text_normalization.pt.verbalizers.electronic import ElectronicFst +from nemo_text_processing.inverse_text_normalization.pt.verbalizers.measure import MeasureFst +from nemo_text_processing.inverse_text_normalization.pt.verbalizers.money import MoneyFst +from nemo_text_processing.inverse_text_normalization.pt.verbalizers.ordinal import OrdinalFst +from nemo_text_processing.inverse_text_normalization.pt.verbalizers.telephone import TelephoneFst +from nemo_text_processing.inverse_text_normalization.pt.verbalizers.time import TimeFst +from nemo_text_processing.inverse_text_normalization.pt.verbalizers.whitelist import WhiteListFst +from nemo_text_processing.text_normalization.en.graph_utils import GraphFst + + +class VerbalizeFst(GraphFst): + """ + Composes other verbalizer grammars. + For deployment, this grammar will be compiled and exported to OpenFst Finate State Archiv (FAR) File. + More details to deployment at NeMo/tools/text_processing_deployment. + """ + + def __init__(self): + super().__init__(name="verbalize", kind="verbalize") + cardinal = CardinalFst() + cardinal_graph = cardinal.fst + ordinal_graph = OrdinalFst().fst + decimal = DecimalFst() + decimal_graph = decimal.fst + measure_graph = MeasureFst(decimal=decimal, cardinal=cardinal).fst + money_graph = MoneyFst(decimal=decimal).fst + time_graph = TimeFst().fst + date_graph = DateFst().fst + whitelist_graph = WhiteListFst().fst + telephone_graph = TelephoneFst().fst + electronic_graph = ElectronicFst().fst + + graph = ( + time_graph + | date_graph + | money_graph + | measure_graph + | ordinal_graph + | decimal_graph + | cardinal_graph + | whitelist_graph + | telephone_graph + | electronic_graph + ) + self.fst = graph diff --git a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/verbalize_final.py new file mode 100644 index 000000000000..cc2e65aed46d --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/verbalize_final.py @@ -0,0 +1,43 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from nemo_text_processing.inverse_text_normalization.pt.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.inverse_text_normalization.pt.verbalizers.word import WordFst +from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, delete_extra_space, delete_space +from pynini.lib import pynutil + + +class VerbalizeFinalFst(GraphFst): + """ + Finite state transducer that verbalizes an entire sentence, e.g. 
+ tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now + """ + + def __init__(self): + super().__init__(name="verbalize_final", kind="verbalize") + verbalize = VerbalizeFst().fst + word = WordFst().fst + types = verbalize | word + graph = ( + pynutil.delete("tokens") + + delete_space + + pynutil.delete("{") + + delete_space + + types + + delete_space + + pynutil.delete("}") + ) + graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space + self.fst = graph diff --git a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/whitelist.py b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/whitelist.py new file mode 100644 index 000000000000..f54aaea65b0a --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/whitelist.py @@ -0,0 +1,37 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pynini +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space +from pynini.lib import pynutil + + +class WhiteListFst(GraphFst): + """ + Finite state transducer for verbalizing whitelist + e.g. tokens { name: "sexta feira" } -> "sexta-feira" + """ + + def __init__(self): + super().__init__(name="whitelist", kind="verbalize") + graph = ( + pynutil.delete("name:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_CHAR - " ", 1) + + pynutil.delete("\"") + ) + graph = graph @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/pt/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/word.py new file mode 100644 index 000000000000..4417d8f0020c --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/pt/verbalizers/word.py @@ -0,0 +1,32 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space +from pynini.lib import pynutil + + +class WordFst(GraphFst): + """ + Finite state transducer for verbalizing plain tokens + e.g. 
tokens { name: "sleep" } -> sleep + """ + + def __init__(self): + super().__init__(name="word", kind="verbalize") + chars = pynini.closure(NEMO_CHAR - " ", 1) + char = pynutil.delete("name:") + delete_space + pynutil.delete("\"") + chars + pynutil.delete("\"") + graph = char @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA) + + self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/run_evaluate.py b/nemo_text_processing/inverse_text_normalization/run_evaluate.py index c82a4541ad98..d2b3b63305ad 100644 --- a/nemo_text_processing/inverse_text_normalization/run_evaluate.py +++ b/nemo_text_processing/inverse_text_normalization/run_evaluate.py @@ -34,7 +34,7 @@ def parse_args(): parser = ArgumentParser() parser.add_argument("--input", help="input file path", type=str) parser.add_argument( - "--lang", help="language", choices=['en', 'de', 'es', 'ru', 'fr', 'vi'], default="en", type=str + "--lang", help="language", choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'vi'], default="en", type=str ) parser.add_argument( "--cat", diff --git a/tests/nemo_text_processing/pt/__init__.py b/tests/nemo_text_processing/pt/__init__.py new file mode 100644 index 000000000000..2db92b257416 --- /dev/null +++ b/tests/nemo_text_processing/pt/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/tests/nemo_text_processing/pt/data_inverse_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/pt/data_inverse_text_normalization/test_cases_cardinal.txt new file mode 100755 index 000000000000..72a9c7ecc156 --- /dev/null +++ b/tests/nemo_text_processing/pt/data_inverse_text_normalization/test_cases_cardinal.txt @@ -0,0 +1,69 @@ +(non-ideal case) entre dezesseis mil e dezoito e mil~(non-ideal case) entre 16018 e 1000 +duzentos e cinquenta e um~251 +novecentos e noventa e nove milhões novecentos e noventa e nove mil novecentos e noventa e nove~999999999 +zero~zero +um~um +uma~uma +dois~dois +nove~nove +dez~10 +onze~11 +, um~, um +, uma~, uma +, dez~, 10 +, onze~, 11 +vinte três~23 +vinte e três~23 +trinta e três~33 +menos vinte e três~-23 +cem~100 +cento e um~101 +cento e uma~101 +cento e dez~110 +cento e onze~111 +cento e vinte três~123 +cento e vinte e três~123 +cento e cinquenta~150 +duzentos~200 +duzentos e um~201 +mil e um~1001 +mil e uma~1001 +nove trilhões setecentos e oitenta e nove bilhões trezentos e oitenta e dois milhões quinhentos e trinta e seis mil cento e trinta~9789382536130 +duzentos e cinquenta e quatro~254 +cento e quarenta e sete mil quatrocentos e cinquenta e um~147451 +cento e quarenta e sete mil quatrocentos e cinquenta e uma~147451 +um milhão cento e cinquenta e seis mil cento e setenta e três~1156173 +um bilhão quinhentos e noventa e três milhões e setenta e dois mil novecentos e sessenta e um~1593072961 +noventa e sete quatrilhões oitocentos e oito trilhões duzentos e sessenta e quatro bilhões setecentos e setenta e dois milhões setecentos e noventa e dois mil e cinco~97808264772792005 +dezessete sextilhões oitocentos e cinquenta e cinco quintilhões e trinta e seis quatrilhões seiscentos e cinquenta e sete trilhões e sete bilhões quinhentos e noventa e seis milhões cento e dez mil novecentos e quarenta e nove~17855036657007596110949 +dez quatrilhões e dez trilhões e dez milhões e cem mil e dez~10010000010100010 +menos vinte e cinco mil e trinta e sete~-25037 +um quatrilhão duzentos e sessenta e quatro trilhões trezentos e um bilhões novecentos e trinta e oito milhões cento e quatro~1264301938000104 +menos sessenta~-60 +quarenta e seis mil seiscentos e sessenta e quatro~46664 +sessenta~60 +dois milhões e três~2000003 +mil e treze~1013 +mil e cem~1100 +mil e vinte e seis~1026 +mil cento e vinte e seis~1126 +dezoito milhões quatrocentos e cinquenta mil novecentos e noventa~18450990 +dezoito milhões novecentos e quarenta mil setecentos e vinte e dois~18940722 +dezoito milhões seiscentos e noventa mil novecentos e dezesseis~18690916 +dezoito mil oitocentos e oitenta~18880 +um bilhão e um~1000000001 +um bilhão e uma~1000000001 +um bilhão cento e um~1000000101 +um bilhão cento e uma~1000000101 +um bilhão e mil cento e um~1000001101 +um bilhão e mil cento e uma~1000001101 +um bilhão e dez mil cento e um~1000010101 +um bilhão e dez mil cento e uma~1000010101 +um bilhão e um milhão e dez mil cento e um~1001010101 +um bilhão e um milhão e dez mil cento e uma~1001010101 +dois bilhões e cinquenta e dois~2000000052 +muitos milhões~muitos milhões +um quatrilhão e um~1000000000000001 +um quatrilhão e uma~1000000000000001 +um sextilhão e um~1000000000000000000001 +um sextilhão e uma~1000000000000000000001 \ No newline at end of file diff --git a/tests/nemo_text_processing/pt/data_inverse_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/pt/data_inverse_text_normalization/test_cases_date.txt new file mode 100644 index 
000000000000..94c74b2475da --- /dev/null +++ b/tests/nemo_text_processing/pt/data_inverse_text_normalization/test_cases_date.txt @@ -0,0 +1,6 @@ +primeiro de janeiro~1 de janeiro +um de janeiro~1 de janeiro +em primeiro de dezembro~em 1 de dezembro +domingo vinte e seis de outubro~domingo 26 de outubro +trinta e um de dezembro de mil novecentos e oitenta e oito~31 de dezembro de 1988 +vinte e sete de agosto de dois mil e quinze~27 de agosto de 2015 \ No newline at end of file diff --git a/tests/nemo_text_processing/pt/data_inverse_text_normalization/test_cases_decimal.txt b/tests/nemo_text_processing/pt/data_inverse_text_normalization/test_cases_decimal.txt new file mode 100755 index 000000000000..bdafdc642442 --- /dev/null +++ b/tests/nemo_text_processing/pt/data_inverse_text_normalization/test_cases_decimal.txt @@ -0,0 +1,26 @@ +um vírgula dois seis~1,26 +menos um vírgula dois seis~-1,26 +um vírgula vinte e seis~1,26 +zero vírgula vinte e seis~0,26 +zero vírgula vinte seis~0,26 +três vírgula cento e quarenta e um~3,141 +três vírgula zero cento e quarenta e um~3,0141 +três vírgula cento e quarenta e um cinquenta e nove~3,14159 +três vírgula quatorze cento e cinquenta e nove~3,14159 +três vírgula quatorze quinze noventa e dois sessenta e cinco trinta e cinco~3,1415926535 +três vírgula quatorze quinze zero noventa e dois sessenta e cinco trinta e cinco~3,14150926535 +três vírgula quatorze quinze zero novecentos e vinte e seis zero quinhentos e trinta e cinco~3,141509260535 +quatrocentos milhões~400 milhões +um ponto trinta e três~1.33 +um ponto trinta e três milhões~1.33 milhões +zero vírgula seis milhões~0,6 milhões +mil oitocentos e vinte e quatro milhões~1824 milhões +ponto dois seis~.26 +um milhão~1 milhão +dois milhões~2 milhões +um bilhão~1 bilhão +dois bilhões~2 bilhões +um trilhão~1 trilhão +dois trilhões~2 trilhões +um quatrilhão~1 quatrilhão +dois quatrilhões~2 quatrilhões diff --git a/tests/nemo_text_processing/pt/data_inverse_text_normalization/test_cases_electronic.txt b/tests/nemo_text_processing/pt/data_inverse_text_normalization/test_cases_electronic.txt new file mode 100644 index 000000000000..70a5319756c0 --- /dev/null +++ b/tests/nemo_text_processing/pt/data_inverse_text_normalization/test_cases_electronic.txt @@ -0,0 +1,13 @@ +a ponto b c arroba g mail ponto com~a.bc@gmail.com +c d f arroba a b c ponto e d u~cdf@abc.edu +a b c arroba g mail ponto a b c~abc@gmail.abc +a b c arroba a b c ponto com~abc@abc.com +a s d f um dois três arroba a b c ponto com~asdf123@abc.com +a um b dois arroba a b c ponto com~a1b2@abc.com +a b três ponto s d d ponto três arroba g mail ponto com~ab3.sdd.3@gmail.com +agá tê tê pê ésse dois pontos barra barra dáblio dáblio dáblio ponto n vidia ponto com~https://www.nvidia.com +dáblio dáblio dáblio ponto n vidia ponto com~www.nvidia.com +dáblio dáblio dáblio ponto nvidia ponto com~www.nvidia.com +w w w ponto nvidia ponto com~www.nvidia.com +dáblio dáblio dáblio ponto a b c ponto es barra e f g~www.abc.es/efg +dáblio dáblio dáblio ponto a b c ponto br~www.abc.br \ No newline at end of file diff --git a/tests/nemo_text_processing/pt/data_inverse_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/pt/data_inverse_text_normalization/test_cases_measure.txt new file mode 100755 index 000000000000..7e3c8ca8b90c --- /dev/null +++ b/tests/nemo_text_processing/pt/data_inverse_text_normalization/test_cases_measure.txt @@ -0,0 +1,12 @@ +duzentos metros~200 m +duzentos e quarenta e cinco milhas por hora~245 mph +duzentos e quarenta e cinco 
quilômetros por hora~245 kph +duzentos e quarenta e cinco metros por segundo~245 m/s +dois quilos~2 kg +sessenta vírgula dois quatro zero zero quilogramas~60,2400 kg +menos sessenta vírgula dois quatro zero zero quilogramas~-60,2400 kg +oito vírgula cinco dois por cento~8,52 % +menos oito vírgula cinco dois por cento~-8,52 % +um por cento~1 % +três centímetros~3 cm +cinco litros~5 l \ No newline at end of file diff --git a/tests/nemo_text_processing/pt/data_inverse_text_normalization/test_cases_money.txt b/tests/nemo_text_processing/pt/data_inverse_text_normalization/test_cases_money.txt new file mode 100755 index 000000000000..88d7e0e7db07 --- /dev/null +++ b/tests/nemo_text_processing/pt/data_inverse_text_normalization/test_cases_money.txt @@ -0,0 +1,25 @@ +doze dólares e cinco centavos~$ 12,05 +doze dólares~$ 12 +setenta e cinco dólares e sessenta e três~$ 75,63 +setenta e cinco dólares e sessenta e três centavos~$ 75,63 +setenta e cinco dólares com sessenta e três centavos~$ 75,63 +setenta e cinco dólares com sessenta e três~$ 75,63 +vinte e nove dólares e cinquenta centavos~$ 29,50 +um dólar~$ 1 +um real~R$ 1 +cem reais~R$ 100 +duzentos reais~R$ 200 +cento e noventa e nove reais e noventa e nove centavos~R$ 199,99 +um real e um centavo~R$ 1,01 +vinte centavos~R$ 0,20 +vinte e cinco centavos~R$ 0,25 +doze euros e cinco centavos~€ 12,05 +doze dólares americanos e cinco centavos~US$ 12,05 +duas libras esterlinas~£ 2 +doze dólares e cinco centavos~$ 12,05 +pagamos cento e quinze reais por uma bala~pagamos R$ 115 por uma bala +quinze mil reais~R$ 15000 +dois bilhões de reais~R$ 2 bilhões +dois milhões de reais~R$ 2 milhões +três vírgula sete milhões de reais~R$ 3,7 milhões +quatro ponto oito milhões de dólares~$ 4.8 milhões diff --git a/tests/nemo_text_processing/pt/data_inverse_text_normalization/test_cases_ordinal.txt b/tests/nemo_text_processing/pt/data_inverse_text_normalization/test_cases_ordinal.txt new file mode 100755 index 000000000000..0ae43e978c8b --- /dev/null +++ b/tests/nemo_text_processing/pt/data_inverse_text_normalization/test_cases_ordinal.txt @@ -0,0 +1,19 @@ +primeiro~primeiro +terceira~terceira +nono~nono +nona~nona +décimo~10º +décima~10ª +décimo primeiro~11º +décima primeira~11ª +(technically ungrammatical) décima primeira~(technically ungrammatical) 11ª +(technically ungrammatical) décima primeira casa~(technically ungrammatical) 11ª casa +décimo terceiro~13º +vigésimo primeiro~21º +vigésima primeira~21ª +(technically ungrammatical) vigésimo primeira~(technically ungrammatical) 21ª +vigésimo segundo~22º +vigésima segunda~22ª +vigésimo terceiro~23º +centésimo décimo primeiro~111º +centésimo trigésimo quarto~134º diff --git a/tests/nemo_text_processing/pt/data_inverse_text_normalization/test_cases_telephone.txt b/tests/nemo_text_processing/pt/data_inverse_text_normalization/test_cases_telephone.txt new file mode 100755 index 000000000000..04e2875b5290 --- /dev/null +++ b/tests/nemo_text_processing/pt/data_inverse_text_normalization/test_cases_telephone.txt @@ -0,0 +1,27 @@ +um dois três quatro cinco seis sete oito~1234-5678 +um dois três quatro cinco seis sete oito telefone~1234-5678 telefone +um dois três quatro cinco meia sete oito~1234-5678 +um dois três quatro cinco meia sete oito telefone~1234-5678 telefone +um dois três quatro cinco seis sete oito nove~12345-6789 +um dois três quatro cinco seis sete oito nove telefone~12345-6789 telefone +quatro cinco um dois três quatro cinco seis sete oito~(45) 1234-5678 +quatro cinco um dois três quatro cinco seis sete oito 
telefone~(45) 1234-5678 telefone +quatro cinco um dois três quatro cinco seis sete oito nove~(45) 12345-6789 +quatro cinco um dois três quatro cinco seis sete oito nove telefone~(45) 12345-6789 telefone +vinte e sete vinte e oito trinta e sete trinta e oito~2728-3738 +vinte e sete vinte e oito trinta e sete trinta e oito telefone~2728-3738 telefone +um vinte e sete vinte e oito trinta e sete trinta e oito~12728-3738 +um vinte e sete vinte e oito trinta e sete trinta e oito telefone~12728-3738 telefone +nove oito sete seis cinquenta e quatro zero zero~9876-5400 +nove oito sete seis cinquenta e quatro zero um~9876-5401 +noventa e oito setenta e seis zero zero trinta e cinco~9876-0035 +noventa e oito setenta e seis zero zero trinta~9876-0030 +nove noventa e oito setenta e seis zero zero trinta e um~99876-0031 +nove noventa e oito setenta e seis zero zero trinta~99876-0030 +dois três nove noventa e oito setenta e seis zero zero trinta e um~(23) 99876-0031 +dois três nove noventa e oito setenta e seis zero zero trinta~(23) 99876-0030 +vinte e três nove noventa e oito setenta e seis zero zero trinta e um~(23) 99876-0031 +vinte e três nove noventa e oito setenta e seis zero zero trinta~(23) 99876-0030 +vinte e três nove noventa e oito setenta e seis zero meia trinta e um~(23) 99876-0631 +vinte e três nove noventa e oito setenta e seis zero meia trinta~(23) 99876-0630 +vinte três nove noventa e oito setenta e seis zero meia trinta~(23) 99876-0630 \ No newline at end of file diff --git a/tests/nemo_text_processing/pt/data_inverse_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/pt/data_inverse_text_normalization/test_cases_time.txt new file mode 100755 index 000000000000..7aef50d2d489 --- /dev/null +++ b/tests/nemo_text_processing/pt/data_inverse_text_normalization/test_cases_time.txt @@ -0,0 +1,19 @@ +às quinze pro meio dia~às 11:45 +às quinze pra meia noite~às 23:45 +às quinze para a meia noite~às 23:45 +às quinze pras duas da tarde~às 1:45 da tarde +às dez pras duas da madrugada~às 1:50 da madrugada +às quinze pras duas da tarde~às 1:45 da tarde +chegaram às quinze pras duas da tarde~chegaram às 1:45 da tarde +ao meio dia~ao meio dia +ao meio dia e meia hora~ao 12:30 +ao meio dia e meia~ao 12:30 +ao meio dia e meio~ao 12:30 +meia noite~meia noite +à meia noite~à meia noite +à meia noite e quinze~à 0:15 +meia noite e meia~0:30 +à uma e trinta~à 1:30 +às onze e trinta~às 11:30 +às três horas e trinta minutos~às 3:30 +às quinze horas e quarenta minutos~às 15:40 diff --git a/tests/nemo_text_processing/pt/data_inverse_text_normalization/test_cases_whitelist.txt b/tests/nemo_text_processing/pt/data_inverse_text_normalization/test_cases_whitelist.txt new file mode 100755 index 000000000000..798df8b9bdfd --- /dev/null +++ b/tests/nemo_text_processing/pt/data_inverse_text_normalization/test_cases_whitelist.txt @@ -0,0 +1,6 @@ +primeira segunda feira~primeira segunda-feira +primeira segunda-feira~primeira segunda-feira +terça feira~terça-feira +quarta feira~quarta-feira +quinta feira~quinta-feira +sexta feira~sexta-feira \ No newline at end of file diff --git a/tests/nemo_text_processing/pt/data_inverse_text_normalization/test_cases_word.txt b/tests/nemo_text_processing/pt/data_inverse_text_normalization/test_cases_word.txt new file mode 100755 index 000000000000..fdd32f2ea4f2 --- /dev/null +++ b/tests/nemo_text_processing/pt/data_inverse_text_normalization/test_cases_word.txt @@ -0,0 +1,49 @@ +~ +yahoo!~yahoo! +vinte!~20 ! 
+x ~x +—~— +aaa~aaa +aabach~aabach +aabenraa~aabenraa +aabye~aabye +aaccessed~aaccessed +aach~aach +aachen's~aachen's +aadri~aadri +aafia~aafia +aagaard~aagaard +aagadu~aagadu +aagard~aagard +aagathadi~aagathadi +aaghart's~aaghart's +aagnes~aagnes +aagomoni~aagomoni +aagon~aagon +aagoo~aagoo +aagot~aagot +aahar~aahar +aahh~aahh +aahperd~aahperd +aaibinterstate~aaibinterstate +aajab~aajab +aakasa~aakasa +aakervik~aakervik +aakirkeby~aakirkeby +aalam~aalam +aalbaek~aalbaek +aaldiu~aaldiu +aalem~aalem +a'ali~a'ali +aalilaassamthey~aalilaassamthey +aalin~aalin +aaliyan~aaliyan +aaliyan's~aaliyan's +aamadu~aamadu +aamara~aamara +aambala~aambala +aamera~aamera +aamer's~aamer's +aamina~aamina +aaminah~aaminah +aamjiwnaang~aamjiwnaang diff --git a/tests/nemo_text_processing/pt/test_cardinal.py b/tests/nemo_text_processing/pt/test_cardinal.py new file mode 100644 index 000000000000..bfe7d82d0db2 --- /dev/null +++ b/tests/nemo_text_processing/pt/test_cardinal.py @@ -0,0 +1,31 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from parameterized import parameterized + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestCardinal: + + inverse_normalizer = InverseNormalizer(lang='pt', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('pt/data_inverse_text_normalization/test_cases_cardinal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/pt/test_date.py b/tests/nemo_text_processing/pt/test_date.py new file mode 100644 index 000000000000..88b5a50ebe5c --- /dev/null +++ b/tests/nemo_text_processing/pt/test_date.py @@ -0,0 +1,30 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from parameterized import parameterized + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestDate: + inverse_normalizer = InverseNormalizer(lang='pt', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('pt/data_inverse_text_normalization/test_cases_date.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/pt/test_decimal.py b/tests/nemo_text_processing/pt/test_decimal.py new file mode 100644 index 000000000000..4fd77295e4a3 --- /dev/null +++ b/tests/nemo_text_processing/pt/test_decimal.py @@ -0,0 +1,30 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from parameterized import parameterized + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestDecimal: + inverse_normalizer = InverseNormalizer(lang='pt', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('pt/data_inverse_text_normalization/test_cases_decimal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/pt/test_electronic.py b/tests/nemo_text_processing/pt/test_electronic.py new file mode 100644 index 000000000000..9e340471f299 --- /dev/null +++ b/tests/nemo_text_processing/pt/test_electronic.py @@ -0,0 +1,30 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from parameterized import parameterized + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestElectronic: + inverse_normalizer = InverseNormalizer(lang='pt', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('pt/data_inverse_text_normalization/test_cases_electronic.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/pt/test_measure.py b/tests/nemo_text_processing/pt/test_measure.py new file mode 100644 index 000000000000..892b45962699 --- /dev/null +++ b/tests/nemo_text_processing/pt/test_measure.py @@ -0,0 +1,31 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pytest +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from parameterized import parameterized + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestMeasure: + inverse_normalizer = InverseNormalizer(lang='pt', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('pt/data_inverse_text_normalization/test_cases_measure.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/pt/test_money.py b/tests/nemo_text_processing/pt/test_money.py new file mode 100644 index 000000000000..40c682fe99cd --- /dev/null +++ b/tests/nemo_text_processing/pt/test_money.py @@ -0,0 +1,31 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import pytest +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from parameterized import parameterized + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestMoney: + inverse_normalizer = InverseNormalizer(lang='pt', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('pt/data_inverse_text_normalization/test_cases_money.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/pt/test_ordinal.py b/tests/nemo_text_processing/pt/test_ordinal.py new file mode 100644 index 000000000000..19acfbaee131 --- /dev/null +++ b/tests/nemo_text_processing/pt/test_ordinal.py @@ -0,0 +1,31 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pytest +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from parameterized import parameterized + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestOrdinal: + inverse_normalizer = InverseNormalizer(lang='pt', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('pt/data_inverse_text_normalization/test_cases_ordinal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/pt/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/pt/test_sparrowhawk_inverse_text_normalization.sh new file mode 100755 index 000000000000..74d8ddafdfc6 --- /dev/null +++ b/tests/nemo_text_processing/pt/test_sparrowhawk_inverse_text_normalization.sh @@ -0,0 +1,84 @@ +#! 
/bin/sh + +PROJECT_DIR=/workspace/tests + +runtest () { + input=$1 + cd /workspace/sparrowhawk/documentation/grammars + + # read test file + while read testcase; do + IFS='~' read spoken written <<< $testcase + denorm_pred=$(echo $spoken | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1) + + # trim white space + written="$(echo -e "${written}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + + # input expected actual + assertEquals "$spoken" "$written" "$denorm_pred" + done < "$input" +} + +testITNCardinal() { + input=$PROJECT_DIR/pt/data_inverse_text_normalization/test_cases_cardinal.txt + runtest $input +} + +testITNDate() { + input=$PROJECT_DIR/pt/data_inverse_text_normalization/test_cases_date.txt + runtest $input +} + +testITNDecimal() { + input=$PROJECT_DIR/pt/data_inverse_text_normalization/test_cases_decimal.txt + runtest $input +} + +testITNOrdinal() { + input=$PROJECT_DIR/pt/data_inverse_text_normalization/test_cases_ordinal.txt + runtest $input +} + +#testITNFraction() { +# input=$PROJECT_DIR/pt/data_inverse_text_normalization/test_cases_fraction.txt +# runtest $input +#} + +testITNTime() { + input=$PROJECT_DIR/pt/data_inverse_text_normalization/test_cases_time.txt + runtest $input +} + +testITNMeasure() { + input=$PROJECT_DIR/pt/data_inverse_text_normalization/test_cases_measure.txt + runtest $input +} + +testITNMoney() { + input=$PROJECT_DIR/pt/data_inverse_text_normalization/test_cases_money.txt + runtest $input +} + +testITNWhitelist() { + input=$PROJECT_DIR/pt/data_inverse_text_normalization/test_cases_whitelist.txt + runtest $input +} + +testITNTelephone() { + input=$PROJECT_DIR/pt/data_inverse_text_normalization/test_cases_telephone.txt + runtest $input +} + +testITNElectronic() { + input=$PROJECT_DIR/pt/data_inverse_text_normalization/test_cases_electronic.txt + runtest $input +} + +testITNWord() { + input=$PROJECT_DIR/pt/data_inverse_text_normalization/test_cases_word.txt + runtest $input +} + +# Load shUnit2 +. $PROJECT_DIR/../shunit2/shunit2 diff --git a/tests/nemo_text_processing/pt/test_telephone.py b/tests/nemo_text_processing/pt/test_telephone.py new file mode 100644 index 000000000000..6d36e9db2bfb --- /dev/null +++ b/tests/nemo_text_processing/pt/test_telephone.py @@ -0,0 +1,31 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import pytest +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from parameterized import parameterized + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestTelephone: + inverse_normalizer = InverseNormalizer(lang='pt', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('pt/data_inverse_text_normalization/test_cases_telephone.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/pt/test_time.py b/tests/nemo_text_processing/pt/test_time.py new file mode 100644 index 000000000000..7a556b36bf4b --- /dev/null +++ b/tests/nemo_text_processing/pt/test_time.py @@ -0,0 +1,30 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from parameterized import parameterized + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestTime: + inverse_normalizer = InverseNormalizer(lang='pt', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('pt/data_inverse_text_normalization/test_cases_time.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/pt/test_whitelist.py b/tests/nemo_text_processing/pt/test_whitelist.py new file mode 100644 index 000000000000..0f8884b53293 --- /dev/null +++ b/tests/nemo_text_processing/pt/test_whitelist.py @@ -0,0 +1,31 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import pytest +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from parameterized import parameterized + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestWhitelist: + inverse_normalizer = InverseNormalizer(lang='pt', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('pt/data_inverse_text_normalization/test_cases_whitelist.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/pt/test_word.py b/tests/nemo_text_processing/pt/test_word.py new file mode 100644 index 000000000000..2ad54b15ef18 --- /dev/null +++ b/tests/nemo_text_processing/pt/test_word.py @@ -0,0 +1,31 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pytest +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from parameterized import parameterized + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestWord: + inverse_normalizer = InverseNormalizer(lang='pt', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('pt/data_inverse_text_normalization/test_cases_word.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tools/text_processing_deployment/export_grammars.sh b/tools/text_processing_deployment/export_grammars.sh index 31fb4cecf822..379a06977014 100644 --- a/tools/text_processing_deployment/export_grammars.sh +++ b/tools/text_processing_deployment/export_grammars.sh @@ -32,7 +32,7 @@ GRAMMARS="itn_grammars" # tn_grammars INPUT_CASE="cased" # lower_cased, only for tn_grammars -LANGUAGE="en" # language, {'en', 'es', 'de'} supports both TN and ITN, {'ru', 'fr'} supports ITN only +LANGUAGE="en" # language, {'en', 'es', 'de'} supports both TN and ITN, {'pt', 'ru', 'fr'} supports ITN only MODE="export" OVERWRITE_CACHE="True" # Set to False to re-use .far files FORCE_REBUILD="False" # Set to True to re-build docker file diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index 3ae8e926bffa..52be2a6dd8c9 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -73,7 +73,7 @@ def parse_args(): parser = ArgumentParser() parser.add_argument("--output_dir", help="output directory for grammars", required=True, type=str) parser.add_argument( - "--language", help="language", choices=["en", "de", "es", "ru", 'fr', 'vi'], type=str, default='en' + "--language", help="language", choices=["en", "de", "es", "pt", "ru", 'fr', 'vi'], type=str, default='en' ) 
parser.add_argument( "--grammars", help="grammars to be exported", choices=["tn_grammars", "itn_grammars"], type=str, required=True @@ -94,7 +94,7 @@ def parse_args(): if __name__ == '__main__': args = parse_args() - if args.language in ['ru', 'fr', 'vi'] and args.grammars == 'tn_grammars': + if args.language in ['pt', 'ru', 'fr', 'vi'] and args.grammars == 'tn_grammars': raise ValueError('Only ITN grammars could be deployed in Sparrowhawk for the selected languages.') if args.language == 'en': @@ -137,6 +137,13 @@ def parse_args(): ClassifyFst as TNClassifyFst, ) from nemo_text_processing.text_normalization.es.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst + elif args.language == 'pt': + from nemo_text_processing.inverse_text_normalization.pt.taggers.tokenize_and_classify import ( + ClassifyFst as ITNClassifyFst, + ) + from nemo_text_processing.inverse_text_normalization.pt.verbalizers.verbalize import ( + VerbalizeFst as ITNVerbalizeFst, + ) elif args.language == 'fr': from nemo_text_processing.inverse_text_normalization.fr.taggers.tokenize_and_classify import ( ClassifyFst as ITNClassifyFst,