diff --git a/nemo_text_processing/text_normalization/zh/README.md b/nemo_text_processing/text_normalization/zh/README.md index fbbf1dc6bc47..0eedd574d839 100644 --- a/nemo_text_processing/text_normalization/zh/README.md +++ b/nemo_text_processing/text_normalization/zh/README.md @@ -20,12 +20,12 @@ There are 3 components in TN pipeline: * covers English letters, digits, punctuations and some symbols * the complete mapping table `data/char/fullwidth_to_halfwidth.tsv` -#### Blacklist (Removal) +#### Denylist (Removal) Sometime you may want to remove certain things like interjections/fillers "啊", "呃" etc ``` 呃这个呃啊我不知道 -> 这个我不知道 ``` -* customizable via `data/blacklist/interjections.tsv` +* customizable via `data/denylist/denylist.tsv` ### 2.2 Non-Standard-Words(NSW) normalization diff --git a/nemo_text_processing/text_normalization/zh/data/char/__init__.py b/nemo_text_processing/text_normalization/zh/data/char/__init__.py new file mode 100644 index 000000000000..a1cf281f0908 --- /dev/null +++ b/nemo_text_processing/text_normalization/zh/data/char/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/zh/data/date/__init__.py b/nemo_text_processing/text_normalization/zh/data/date/__init__.py new file mode 100644 index 000000000000..a1cf281f0908 --- /dev/null +++ b/nemo_text_processing/text_normalization/zh/data/date/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/zh/data/denylist/__init__.py b/nemo_text_processing/text_normalization/zh/data/denylist/__init__.py new file mode 100644 index 000000000000..a1cf281f0908 --- /dev/null +++ b/nemo_text_processing/text_normalization/zh/data/denylist/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/zh/data/blacklist/denylist.tsv b/nemo_text_processing/text_normalization/zh/data/denylist/denylist.tsv similarity index 100% rename from nemo_text_processing/text_normalization/zh/data/blacklist/denylist.tsv rename to nemo_text_processing/text_normalization/zh/data/denylist/denylist.tsv diff --git a/nemo_text_processing/text_normalization/zh/data/erhua/__init__.py b/nemo_text_processing/text_normalization/zh/data/erhua/__init__.py new file mode 100644 index 000000000000..a1cf281f0908 --- /dev/null +++ b/nemo_text_processing/text_normalization/zh/data/erhua/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/zh/data/math/__init__.py b/nemo_text_processing/text_normalization/zh/data/math/__init__.py new file mode 100644 index 000000000000..a1cf281f0908 --- /dev/null +++ b/nemo_text_processing/text_normalization/zh/data/math/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/zh/data/measure/__init__.py b/nemo_text_processing/text_normalization/zh/data/measure/__init__.py new file mode 100644 index 000000000000..a1cf281f0908 --- /dev/null +++ b/nemo_text_processing/text_normalization/zh/data/measure/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/zh/data/money/__init__.py b/nemo_text_processing/text_normalization/zh/data/money/__init__.py new file mode 100644 index 000000000000..a1cf281f0908 --- /dev/null +++ b/nemo_text_processing/text_normalization/zh/data/money/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/zh/data/number/__init__.py b/nemo_text_processing/text_normalization/zh/data/number/__init__.py new file mode 100644 index 000000000000..a1cf281f0908 --- /dev/null +++ b/nemo_text_processing/text_normalization/zh/data/number/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/zh/data/time/__init__.py b/nemo_text_processing/text_normalization/zh/data/time/__init__.py new file mode 100644 index 000000000000..a1cf281f0908 --- /dev/null +++ b/nemo_text_processing/text_normalization/zh/data/time/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/zh/data/whitelist/__init__.py b/nemo_text_processing/text_normalization/zh/data/whitelist/__init__.py new file mode 100644 index 000000000000..a1cf281f0908 --- /dev/null +++ b/nemo_text_processing/text_normalization/zh/data/whitelist/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/zh/taggers/preprocessor.py b/nemo_text_processing/text_normalization/zh/taggers/preprocessor.py index 0f0661ffce7b..57d672dc62e1 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/preprocessor.py +++ b/nemo_text_processing/text_normalization/zh/taggers/preprocessor.py @@ -36,9 +36,7 @@ def __init__( graph = pynini.cdrewrite('', '', '', NEMO_SIGMA) if remove_interjections: - remove_interjections_graph = pynutil.delete( - pynini.string_file(get_abs_path('data/blacklist/denylist.tsv')) - ) + remove_interjections_graph = pynutil.delete(pynini.string_file(get_abs_path('data/denylist/denylist.tsv'))) graph @= pynini.cdrewrite(remove_interjections_graph, '', '', NEMO_SIGMA) if fullwidth_to_halfwidth: diff --git a/setup.py b/setup.py index 3ada70d21d64..02e471e0499a 100644 --- a/setup.py +++ b/setup.py @@ -95,6 +95,7 @@ def req_file(filename, folder="requirements"): extras_require['all'] = list(chain(extras_require.values())) # Add lightning requirements as needed +extras_require['nemo_text_processing'] = list(chain([extras_require['nemo_text_processing'], extras_require['core']])) extras_require['common'] = list(chain([extras_require['common'], extras_require['core']])) extras_require['test'] = list( chain(