Introduce pre-commit with pre-commit suggestions

Signed-off-by: SeanNaren <snarenthiran@nvidia.com>
NVIDIA · Sep 21, 2022 · 81f2d2e · 81f2d2e
1 parent 5d5c0fb
commit 81f2d2e
Show file tree

Hide file tree

Showing 620 changed files with 4,794 additions and 2,714 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,44 @@
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+default_language_version:
+  python: python3
+
+ci:
+  autofix_prs: true
+  autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
+  autoupdate_schedule: quarterly
+
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.3.0
+    hooks:
+      - id: check-yaml
+      - id: check-case-conflict
+      - id: detect-private-key
+
+
+  - repo: https://github.com/PyCQA/isort
+    rev: 4.3.21
+    hooks:
+      - id: isort
+        name: Format imports
+        args: [--multi-line=3, --trailing-comma, --force-grid-wrap=0, --use-parentheses, --line-width=119, -rc, -ws]
+
+  - repo: https://github.com/psf/black
+    rev: 22.3.0
+    hooks:
+      - id: black
+        name: Format code
+        args: [--skip-string-normalization, --line-length=119]
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -15,12 +15,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import glob
 import os
 import re
 import sys
-import glob
 
 import sphinx_book_theme
+from package_info import __version__
 
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
@@ -30,7 +31,6 @@
 sys.path.insert(0, os.path.abspath("../../nemo"))
 sys.path.insert(0, os.path.abspath("../../nemo_text_processing"))
 
-from package_info import __version__
 
 templates_path = ["_templates"]
 

diff --git a/examples/asr/asr_chunked_inference/ctc/speech_to_text_buffered_infer_ctc.py b/examples/asr/asr_chunked_inference/ctc/speech_to_text_buffered_infer_ctc.py
@@ -74,7 +74,10 @@ def get_wer_feat(mfst, asr, frame_len, tokens_per_chunk, delay, preprocessor_cfg
 def main():
     parser = ArgumentParser()
     parser.add_argument(
-        "--asr_model", type=str, required=True, help="Path to asr model .nemo file",
+        "--asr_model",
+        type=str,
+        required=True,
+        help="Path to asr model .nemo file",
     )
     parser.add_argument("--test_manifest", type=str, required=True, help="path to evaluation data")
     parser.add_argument("--batch_size", type=int, default=32)
@@ -128,7 +131,10 @@ def main():
     print(tokens_per_chunk, mid_delay)
 
     frame_asr = FrameBatchASR(
-        asr_model=asr_model, frame_len=chunk_len, total_buffer=args.total_buffer_in_secs, batch_size=args.batch_size,
+        asr_model=asr_model,
+        frame_len=chunk_len,
+        total_buffer=args.total_buffer_in_secs,
+        batch_size=args.batch_size,
     )
 
     hyps, refs, wer = get_wer_feat(

diff --git a/examples/asr/asr_chunked_inference/rnnt/speech_to_text_buffered_infer_rnnt.py b/examples/asr/asr_chunked_inference/rnnt/speech_to_text_buffered_infer_rnnt.py
@@ -79,7 +79,10 @@
 # Common Arguments
 parser = ArgumentParser()
 parser.add_argument(
-    "--asr_model", type=str, required=True, help="Path to asr model .nemo file",
+    "--asr_model",
+    type=str,
+    required=True,
+    help="Path to asr model .nemo file",
 )
 parser.add_argument("--test_manifest", type=str, required=True, help="path to evaluation data")
 parser.add_argument("--batch_size", type=int, default=32)

diff --git a/examples/asr/asr_streaming/speech_to_text_streaming_infer.py b/examples/asr/asr_streaming/speech_to_text_streaming_infer.py
@@ -88,8 +88,8 @@
 
 def extract_transcriptions(hyps):
     """
-        The transcribed_texts returned by CTC and RNNT models are different.
-        This method would extract and return the text section of the hypothesis.
+    The transcribed_texts returned by CTC and RNNT models are different.
+    This method would extract and return the text section of the hypothesis.
     """
     if isinstance(hyps[0], Hypothesis):
         transcriptions = []
@@ -190,7 +190,10 @@ def perform_streaming(asr_model, streaming_buffer, compare_vs_offline=False, deb
 def main():
     parser = ArgumentParser()
     parser.add_argument(
-        "--asr_model", type=str, required=True, help="Path to an ASR model .nemo file or name of a pretrained model.",
+        "--asr_model",
+        type=str,
+        required=True,
+        help="Path to an ASR model .nemo file or name of a pretrained model.",
     )
     parser.add_argument(
         "--device", type=str, help="The device to load the model onto and perform the streaming", default="cuda"

diff --git a/examples/asr/experimental/k2/make_token_lm.py b/examples/asr/experimental/k2/make_token_lm.py
@@ -29,7 +29,10 @@ def main():
         description="""Create token LM for input manifest and tokenizer.""",
     )
     parser.add_argument(
-        "--manifest", required=True, type=str, help="Comma separated list of manifest files",
+        "--manifest",
+        required=True,
+        type=str,
+        help="Comma separated list of manifest files",
     )
     parser.add_argument(
         "--tokenizer_dir",
@@ -54,13 +57,22 @@ def main():
         ),
     )
     parser.add_argument(
-        "--ngram_order", type=int, default=2, choices=[2, 3, 4, 5], help="Order of n-gram to use",
+        "--ngram_order",
+        type=int,
+        default=2,
+        choices=[2, 3, 4, 5],
+        help="Order of n-gram to use",
     )
     parser.add_argument(
-        "--output_file", required=True, type=str, help="The path to store the token LM",
+        "--output_file",
+        required=True,
+        type=str,
+        help="The path to store the token LM",
     )
     parser.add_argument(
-        "--do_lowercase", action="store_true", help="Whether to apply lower case conversion on the text",
+        "--do_lowercase",
+        action="store_true",
+        help="Whether to apply lower case conversion on the text",
     )
     args = parser.parse_args()
 

diff --git a/examples/asr/experimental/sclite/speech_to_text_sclite.py b/examples/asr/experimental/sclite/speech_to_text_sclite.py
@@ -101,7 +101,11 @@ def get_utt_info(manifest_path):
 def main():
     parser = ArgumentParser()
     parser.add_argument(
-        "--asr_model", type=str, default="QuartzNet15x5Base-En", required=False, help="Pass: 'QuartzNet15x5Base-En'",
+        "--asr_model",
+        type=str,
+        default="QuartzNet15x5Base-En",
+        required=False,
+        help="Pass: 'QuartzNet15x5Base-En'",
     )
     parser.add_argument("--dataset", type=str, required=True, help="path to evaluation data")
     parser.add_argument("--batch_size", type=int, default=4)

diff --git a/examples/asr/experimental/structured/speech_to_text_structured.py b/examples/asr/experimental/structured/speech_to_text_structured.py
@@ -64,7 +64,13 @@
     ),
     # ... repeat 14 more times
     nemo_asr.modules.conv_asr.JasperEncoderConfig(
-        filters=1024, repeat=1, kernel=[1], stride=[1], dilation=[1], dropout=cfg.model.dropout, residual=False,
+        filters=1024,
+        repeat=1,
+        kernel=[1],
+        stride=[1],
+        dilation=[1],
+        dropout=cfg.model.dropout,
+        residual=False,
     ),
 ]
 

diff --git a/examples/asr/export/transducer/infer_transducer_onnx.py b/examples/asr/export/transducer/infer_transducer_onnx.py
@@ -56,7 +56,11 @@
 def parse_arguments():
     parser = ArgumentParser()
     parser.add_argument(
-        "--nemo_model", type=str, default=None, required=True, help="Path to .nemo file",
+        "--nemo_model",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to .nemo file",
     )
     parser.add_argument('--onnx_encoder', type=str, default=None, required=False, help="Path to onnx encoder model")
     parser.add_argument(

diff --git a/examples/asr/quantization/speech_to_text_calibrate.py b/examples/asr/quantization/speech_to_text_calibrate.py
@@ -51,7 +51,11 @@ def autocast(enabled=None):
 def main():
     parser = ArgumentParser()
     parser.add_argument(
-        "--asr_model", type=str, default="QuartzNet15x5Base-En", required=True, help="Pass: 'QuartzNet15x5Base-En'",
+        "--asr_model",
+        type=str,
+        default="QuartzNet15x5Base-En",
+        required=True,
+        help="Pass: 'QuartzNet15x5Base-En'",
     )
     parser.add_argument("--dataset", type=str, required=True, help="path to evaluation data")
     parser.add_argument("--batch_size", type=int, default=256)

diff --git a/examples/asr/quantization/speech_to_text_quant_infer.py b/examples/asr/quantization/speech_to_text_quant_infer.py
@@ -53,7 +53,11 @@ def autocast(enabled=None):
 def main():
     parser = ArgumentParser()
     parser.add_argument(
-        "--asr_model", type=str, default="QuartzNet15x5Base-En", required=True, help="Pass: 'QuartzNet15x5Base-En'",
+        "--asr_model",
+        type=str,
+        default="QuartzNet15x5Base-En",
+        required=True,
+        help="Pass: 'QuartzNet15x5Base-En'",
     )
     parser.add_argument("--dataset", type=str, required=True, help="path to evaluation data")
     parser.add_argument("--wer_target", type=float, default=None, help="used by test")

diff --git a/examples/asr/quantization/speech_to_text_quant_infer_trt.py b/examples/asr/quantization/speech_to_text_quant_infer_trt.py
@@ -47,7 +47,11 @@ def autocast(enabled=None):
 def main():
     parser = ArgumentParser()
     parser.add_argument(
-        "--asr_model", type=str, default="QuartzNet15x5Base-En", required=True, help="Pass: 'QuartzNet15x5Base-En'",
+        "--asr_model",
+        type=str,
+        default="QuartzNet15x5Base-En",
+        required=True,
+        help="Pass: 'QuartzNet15x5Base-En'",
     )
     parser.add_argument(
         "--asr_onnx",

diff --git a/examples/asr/speech_classification/vad_infer.py b/examples/asr/speech_classification/vad_infer.py
@@ -91,7 +91,9 @@ def main(cfg):
             'vad_stream': True,
             'sample_rate': 16000,
             'manifest_filepath': manifest_vad_input,
-            'labels': ['infer',],
+            'labels': [
+                'infer',
+            ],
             'num_workers': cfg.num_workers,
             'shuffle': False,
             'window_length_in_sec': cfg.vad.parameters.window_length_in_sec,

diff --git a/examples/asr/transcribe_speech.py b/examples/asr/transcribe_speech.py
@@ -227,11 +227,15 @@ def autocast():
                         "RNNT models do not support transcribe partial audio for now. Transcribing full audio."
                     )
                     transcriptions = asr_model.transcribe(
-                        paths2audio_files=filepaths, batch_size=cfg.batch_size, num_workers=cfg.num_workers,
+                        paths2audio_files=filepaths,
+                        batch_size=cfg.batch_size,
+                        num_workers=cfg.num_workers,
                     )
             else:
                 transcriptions = asr_model.transcribe(
-                    paths2audio_files=filepaths, batch_size=cfg.batch_size, num_workers=cfg.num_workers,
+                    paths2audio_files=filepaths,
+                    batch_size=cfg.batch_size,
+                    num_workers=cfg.num_workers,
                 )
 
     logging.info(f"Finished transcribing {len(filepaths)} files !")

diff --git a/examples/nlp/duplex_text_normalization/analyze_errors.py b/examples/nlp/duplex_text_normalization/analyze_errors.py
@@ -33,7 +33,7 @@
 
 # Longest Common Subsequence
 def lcs(X, Y):
-    """ Function for finding the longest common subsequence between two lists.
+    """Function for finding the longest common subsequence between two lists.
     In this script, this function is particular used for aligning between the
     ground-truth output string and the predicted string (for visualization purpose).
     Args:

diff --git a/examples/nlp/duplex_text_normalization/data/en/data_preprocessing.py b/examples/nlp/duplex_text_normalization/data/en/data_preprocessing.py
@@ -85,7 +85,7 @@ def process_url(o):
     """
 
     def flatten(l):
-        """ flatten a list of lists """
+        """flatten a list of lists"""
         return [item for sublist in l for item in sublist]
 
     if o != '<self>' and '_letter' in o:
@@ -288,7 +288,7 @@ def convert(example):
 def ignore(example):
     """
     This function makes sure specific class types like 'PLAIN', 'ELECTRONIC' etc. are left unchanged.
-    
+
     Args:
         example: data example
     """
@@ -300,7 +300,7 @@ def ignore(example):
 
 
 def process_file(fp):
-    """ Reading the raw data from a file of NeMo format and preprocesses it. Write is out to the output directory.
+    """Reads the raw data from a file of NeMo format and preprocesses it. Writes it out to the output directory.
     For more info about the data format, refer to the
     `text_normalization doc <https://github.com/NVIDIA/NeMo/blob/main/docs/source/nlp/text_normalization.rst>`.
 

diff --git a/examples/nlp/duplex_text_normalization/data/en/upsample.py b/examples/nlp/duplex_text_normalization/data/en/upsample.py
@@ -88,7 +88,7 @@
 
 def include_sentence(sentence_patterns) -> bool:
     """
-    Determines whether to use a sentence for upsampling whose patterns are provided as input. This will check the global pattern tables 
+    Determines whether to use a sentence for upsampling whose patterns are provided as input. This will check the global pattern tables
     if this sentence includes any patterns that are still needed.
 
     Args:
@@ -143,7 +143,7 @@ def include_sentence(sentence_patterns) -> bool:
 
 
 def read_data_file(fp: str, upsample_file: bool = False):
-    """ Reading the raw data from a file of NeMo format
+    """Reading the raw data from a file of NeMo format
     For more info about the data format, refer to the
     `text_normalization doc <https://github.com/NVIDIA/NeMo/blob/main/docs/source/nlp/text_normalization.rst>`.
 
@@ -204,7 +204,7 @@ def update_patterns(patterns: dict, new_patterns: dict):
 
     Args:
         patterns: main table
-        new_patterns: new table to update the main table with 
+        new_patterns: new table to update the main table with
     """
     for k, v in new_patterns.items():
         patterns[k] += v
@@ -254,7 +254,7 @@ def lookup_patterns(cls: str, input_str: str) -> dict:
 
 def create_pattern(templates: List[str], input_str: str, pretty: bool = False):
     """
-    create all patterns based on list of input templates using the input string. 
+    create all patterns based on list of input templates using the input string.
 
     Args:
         templates: list of templates/stencils

diff --git a/examples/nlp/duplex_text_normalization/duplex_text_normalization_infer.py b/examples/nlp/duplex_text_normalization/duplex_text_normalization_infer.py
@@ -98,14 +98,17 @@ def main(cfg: DictConfig) -> None:
             ]
 
         def _get_predictions(lines: List[str], mode: str, batch_size: int, text_file: str):
-            """ Runs inference on a batch data without labels and saved predictions to a file. """
+            """Runs inference on a batch of data without labels and saves predictions to a file."""
             assert mode in ['tn', 'itn']
             file_name, extension = os.path.splitext(text_file)
             batch, all_preds = [], []
             for i, line in enumerate(lines):
                 batch.append(line.strip())
                 if len(batch) == batch_size or i == len(lines) - 1:
-                    outputs = tn_model._infer(batch, [constants.DIRECTIONS_TO_MODE[mode]] * len(batch),)
+                    outputs = tn_model._infer(
+                        batch,
+                        [constants.DIRECTIONS_TO_MODE[mode]] * len(batch),
+                    )
                     all_preds.extend([x for x in outputs[-1]])
                     batch = []
             assert len(all_preds) == len(lines)

diff --git a/examples/nlp/duplex_text_normalization/helpers.py b/examples/nlp/duplex_text_normalization/helpers.py
@@ -29,7 +29,7 @@
 
 
 def instantiate_model_and_trainer(cfg: DictConfig, model_name: str, do_training: bool):
-    """ Function for instantiating a model and a trainer
+    """Function for instantiating a model and a trainer
     Args:
         cfg: The config used to instantiate the model and the trainer.
         model_name: A str indicates whether the model to be instantiated is a tagger or a decoder (i.e., model_name should be either TAGGER_MODEL or DECODER_MODEL).

diff --git a/examples/nlp/duplex_text_normalization/nn_wfst/en/electronic/normalize.py b/examples/nlp/duplex_text_normalization/nn_wfst/en/electronic/normalize.py
@@ -21,7 +21,7 @@
 class ElectronicNormalizer(Normalizer):
     """
     Normalizer for ELECTRONIC.
-    
+
     Args:
         input_case: accepting either "lower_cased" or "cased" input.
         lang: language

diff --git a/examples/nlp/duplex_text_normalization/nn_wfst/en/electronic/tokenize_and_classify.py b/examples/nlp/duplex_text_normalization/nn_wfst/en/electronic/tokenize_and_classify.py
@@ -34,9 +34,9 @@
 class ClassifyFst(GraphFst):
     """
     Final class that composes all other classification grammars. This class can process an entire sentence including punctuation.
-    For deployment, this grammar will be compiled and exported to OpenFst Finate State Archiv (FAR) File. 
+    For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File.
     More details to deployment at NeMo/tools/text_processing_deployment.
-    
+
     Args:
         input_case: accepting either "lower_cased" or "cased" input.
         deterministic: if True will provide a single transduction option,