From 1a1be1ab92f25f7f99504a4218ed87de4f69f1e0 Mon Sep 17 00:00:00 2001
From: David Gardner <96306125+dagardner-nv@users.noreply.github.com>
Date: Wed, 8 Jun 2022 15:17:10 -0700
Subject: [PATCH] Limit which lfs assets are pulled by default (#139)

* Adds helper script `scripts/fetch_data.py` to pull additional assets.
* Updates `README.md` & `CONTRIBUTING.md` docs to reflect the changes.
* Updates the hammah unittest to no longer depend on training data.
* ~~Pinned Neo dep to 22.04.00a to work-around incompatible neo version~~

```
$ scripts/fetch_data.py -h
usage: Fetches data not included in the repository by default [-h] {fetch,check} ...

optional arguments:
  -h, --help     show this help message and exit

Subcommands:
  valid subcommands

  {fetch,check}
    fetch        Fetch datasets
    check        Check download status of large files. Displays a True/False
                 whether all files are downloaded.
```

Closes #88

Authors:
  - David Gardner (https://github.com/dagardner-nv)
  - Michael Demoret (https://github.com/mdemoret-nv)

Approvers:
  - Bartley Richardson (https://github.com/BartleyR)
  - Michael Demoret (https://github.com/mdemoret-nv)

URL: https://github.com/nv-morpheus/Morpheus/pull/139
---
 .lfsconfig                                 |   2 +
 CONTRIBUTING.md                            |  26 +++-
 ci/scripts/jenkins/build.sh                |   3 +
 ci/scripts/jenkins/test.sh                 |   2 +-
 docker/conda/environments/cuda11.4_dev.yml |   1 +
 models/mlflow/README.md                    |   8 +-
 scripts/fetch_data.py                      | 142 +++++++++++++++++++++
 tests/test_hammah.py                       |   6 +-
 8 files changed, 179 insertions(+), 11 deletions(-)
 create mode 100644 .lfsconfig
 create mode 100755 scripts/fetch_data.py

diff --git a/.lfsconfig b/.lfsconfig
new file mode 100644
index 0000000000..9fdb0464ae
--- /dev/null
+++ b/.lfsconfig
@@ -0,0 +1,2 @@
+[lfs]
+    fetchinclude = morpheus/data/*
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index c06f2780d4..e63074e864 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -65,9 +65,31 @@ cd $MORPHEUS_ROOT
 ```
 
 The large model and data files in this repo are stored using [Git Large File Storage (LFS)](https://git-lfs.github.com/). These files will be required for running the training/validation scripts and example pipelines for the Morpheus pre-trained models.
+By default, only the files stored in LFS that are strictly needed for running Morpheus are included when the Morpheus repository is cloned. Additional datasets can be downloaded using the `scripts/fetch_data.py` script. Usage of the script is as follows:
+```bash
+scripts/fetch_data.py fetch <dataset> [<dataset>...]
+```
+
+At the time of writing, the defined datasets are:
+* all - Meta-dataset that includes all of the others
+* examples - Data needed by scripts in the `examples` subdir
+* models - Morpheus models (largest dataset)
+* tests - Data used by unit tests
+* validation - Subset of the models dataset needed by some unit tests
+
+To download just the examples and models datasets:
+```bash
+scripts/fetch_data.py fetch examples models
+```
+
+To download the data needed for the unit tests:
+```bash
+scripts/fetch_data.py fetch tests validation
+```
+
 If `Git LFS` is not installed before cloning the repository, the large files will not be pulled. If this is the case, follow the instructions for installing `Git LFS` from [here](https://git-lfs.github.com/), and then run the following command:
 ```bash
-git lfs pull
+scripts/fetch_data.py fetch all
 ```
 
 ### Build in Docker Container
@@ -94,7 +116,7 @@ This workflow utilizes a docker container to set up most dependencies ensuring a
     ```shell
     DOCKER_TARGET=development_pydbg ./docker/build_container_dev.sh
     ```
 1. Note: When debugging python code, you just need to add `ci/conda/recipes/python-dbg/source` to your debugger's source path.
 1. Once created, you will be able to introspect python objects from within GDB. For example, if we were to break within a generator setup call and examine its PyFrame_Object `f`, it might look like this:
diff --git a/ci/scripts/jenkins/build.sh b/ci/scripts/jenkins/build.sh
index 31e505da44..97b3490f82 100755
--- a/ci/scripts/jenkins/build.sh
+++ b/ci/scripts/jenkins/build.sh
@@ -82,6 +82,9 @@ gpuci_logger "Installing other dependencies"
 mamba env update -q -n morpheus -f ${MORPHEUS_ROOT}/docker/conda/environments/cuda${CUDA_VER}_dev.yml
 conda deactivate && conda activate morpheus
 
+gpuci_logger "Final Conda Environment"
+conda list
+
 gpuci_logger "Check cmake & ninja"
 cmake --version
 ninja --version
diff --git a/ci/scripts/jenkins/test.sh b/ci/scripts/jenkins/test.sh
index 93ae07bf6c..08fb6fbbc9 100755
--- a/ci/scripts/jenkins/test.sh
+++ b/ci/scripts/jenkins/test.sh
@@ -49,7 +49,7 @@ mamba install -q -y -c conda-forge "git-lfs=3.1.4"
 gpuci_logger "Pulling LFS assets"
 cd ${MORPHEUS_ROOT}
 git lfs install
-git lfs pull
+${MORPHEUS_ROOT}/scripts/fetch_data.py fetch tests validation
 
 pip install -e ${MORPHEUS_ROOT}
diff --git a/docker/conda/environments/cuda11.4_dev.yml b/docker/conda/environments/cuda11.4_dev.yml
index 6620542ccd..1000407d8e 100644
--- a/docker/conda/environments/cuda11.4_dev.yml
+++ b/docker/conda/environments/cuda11.4_dev.yml
@@ -69,6 +69,7 @@ dependencies:
   - sphinx_rtd_theme
   - sysroot_linux-64=2.17
   - tqdm
+  - yapf=0.32.0
 ####### Morpheus Pip Dependencies (keep sorted!) #######
   - pip:
       # Add additional dev dependencies here
diff --git a/models/mlflow/README.md b/models/mlflow/README.md
index 7271a7bcf6..766461c821 100644
--- a/models/mlflow/README.md
+++ b/models/mlflow/README.md
@@ -82,10 +82,10 @@ nohup mlflow server --backend-store-uri sqlite:////tmp/mlflow-db.sqlite --defaul
 
 The Morpheus reference models can be found in the [Morpheus](https://github.com/NVIDIA/Morpheus) repo.
 
-```
-git clone https://github.com/NVIDIA/Morpheus.git
-cd morpheus/models
-git lfs pull
+```bash
+git clone https://github.com/NVIDIA/Morpheus.git morpheus
+cd morpheus
+scripts/fetch_data.py fetch models
 ```
 
 ## Publish reference models to MLflow
diff --git a/scripts/fetch_data.py b/scripts/fetch_data.py
new file mode 100755
index 0000000000..2749c7f42b
--- /dev/null
+++ b/scripts/fetch_data.py
@@ -0,0 +1,142 @@
+#!/usr/bin/env python3
+# SPDX-FileCopyrightText: Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import logging
+import os
+import subprocess
+import sys
+import time
+
+LFS_DATASETS = {
+    'all': '**',
+    'examples': 'examples/**',
+    'models': 'models/**',
+    'tests': 'tests/**',
+    'validation': 'models/datasets/validation-data/**'
+}
+
+
+def lfsPull(include_paths, poll_interval=0.1):
+    """
+    Performs a git lfs pull.
+    """
+    cmd = 'git lfs pull -I "{}"'.format(','.join(include_paths))
+    env = os.environ.copy()
+
+    # Instruct git lfs not to suppress progress output. Fetching the models can
+    # take over a minute to complete, so we want our users to receive feedback.
+    env['GIT_LFS_FORCE_PROGRESS'] = '1'
+    popen = subprocess.Popen(cmd,
+                             env=env,
+                             shell=True,
+                             universal_newlines=True,
+                             stderr=subprocess.STDOUT,
+                             stdout=subprocess.PIPE)
+
+    outpipe = popen.stdout
+    returncode = None
+    all_out = []
+    while returncode is None:
+        time.sleep(poll_interval)
+        out = outpipe.readline()
+        if out.rstrip() != '':
+            logging.info(out.rstrip())
+            all_out.append(out)
+
+        returncode = popen.poll()
+
+    # Check if we have any additional output written to the pipe before our last poll
+    out = outpipe.read()
+    if out != '':
+        all_out.append(out)
+
+    output = ''.join(all_out).rstrip("\n")
+    if returncode != 0:
+        logging.error(output)
+        raise subprocess.CalledProcessError(returncode=returncode, cmd=cmd, output=output)
+
+    return output
+
+
+def lfsCheck(list_all=False):
+    output = subprocess.check_output('git lfs ls-files', shell=True, universal_newlines=True)
+    output_lines = output.splitlines()
+
+    # Output lines are in the format of:
+    # <oid> [-|*] <file name>
+    # where '-' indicates a file pointer and '*' indicates a downloaded file
+    # https://github.com/git-lfs/git-lfs/blob/main/docs/man/git-lfs-ls-files.1.ronn
+    missing_files = []
+    for file_status in output_lines:
+        parts = file_status.split()
+        downloaded = parts[1] == '*'
+        # the join on parts[2:] is needed to handle file names that contain a blank space
+        filename = ' '.join(parts[2:])
+
+        if not downloaded:
+            missing_files.append(filename)
+
+        if list_all:
+            logging.info('%s - %s', filename, downloaded)
+
+    if not list_all:
+        if len(missing_files):
+            logging.error("Missing the following LFS files:\n%s", "\n".join(missing_files))
+        else:
+            logging.info("All LFS files downloaded")
+
+    if len(missing_files):
+        sys.exit(1)
+
+
+def parse_args():
+    argparser = argparse.ArgumentParser("Fetches data not included in the repository by default")
+    subparsers = argparser.add_subparsers(title='Subcommands',
+                                          description='valid subcommands',
+                                          required=True,
+                                          dest='subcommand')
+
+    fetch_parser = subparsers.add_parser('fetch', help='Fetch datasets')
+    fetch_parser.add_argument("data_set", nargs='*', choices=list(LFS_DATASETS.keys()), help="Data set to fetch")
+
+    check_parser = subparsers.add_parser('check',
+                                         help=('Check download status of large files. Exits with a status of 0 if all '
+                                               'large files have been downloaded, 1 otherwise.'))
+    check_parser.add_argument("-l",
+                              "--list",
+                              action="store_true",
+                              default=False,
+                              dest='list_all',
+                              help="List the download status of all LFS files, not just the missing ones")
+
+    args = argparser.parse_args()
+    return args
+
+
+def main():
+    args = parse_args()
+    logging.basicConfig(level=logging.INFO, format="%(message)s")
+
+    if args.subcommand == 'fetch':
+        include_paths = [LFS_DATASETS[p] for p in args.data_set]
+        lfsPull(include_paths)
+    else:
+        lfsCheck(list_all=args.list_all)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/test_hammah.py b/tests/test_hammah.py
index 5e8e275d4f..c778a4d1b6 100755
--- a/tests/test_hammah.py
+++ b/tests/test_hammah.py
@@ -73,14 +73,13 @@ def test_hammah_roleg(mock_ae, config, tmp_path):
     config.ae.feature_columns = [x.strip() for x in fh.readlines()]
 
     input_glob = os.path.join(TEST_DIRS.validation_data_dir, "hammah-*.csv")
-    train_data_glob = os.path.join(TEST_DIRS.training_data_dir, "hammah-*.csv")
     out_file = os.path.join(tmp_path, 'results.csv')
     val_file_name = os.path.join(TEST_DIRS.validation_data_dir, 'hammah-role-g-validation-data.csv')
     results_file_name = os.path.join(tmp_path, 'results.json')
 
     pipe = LinearPipeline(config)
     pipe.set_source(CloudTrailSourceStage(config, input_glob=input_glob, sort_glob=True))
-    pipe.add_stage(train_ae_stage.TrainAEStage(config, train_data_glob=train_data_glob, seed=42, sort_glob=True))
+    pipe.add_stage(train_ae_stage.TrainAEStage(config, train_data_glob=input_glob, seed=42, sort_glob=True))
     pipe.add_stage(preprocess_ae_stage.PreprocessAEStage(config))
     pipe.add_stage(AutoEncoderInferenceStage(config))
     pipe.add_stage(AddScoresStage(config))
@@ -147,14 +146,13 @@ def test_hammah_user123(mock_ae, config, tmp_path):
     config.ae.feature_columns = [x.strip() for x in fh.readlines()]
 
     input_glob = os.path.join(TEST_DIRS.validation_data_dir, "hammah-*.csv")
-    train_data_glob = os.path.join(TEST_DIRS.training_data_dir, "hammah-*.csv")
     out_file = os.path.join(tmp_path, 'results.csv')
     val_file_name = os.path.join(TEST_DIRS.validation_data_dir, 'hammah-user123-validation-data.csv')
     results_file_name = os.path.join(tmp_path, 'results.json')
 
     pipe = LinearPipeline(config)
     pipe.set_source(CloudTrailSourceStage(config, input_glob=input_glob, sort_glob=True))
-    pipe.add_stage(train_ae_stage.TrainAEStage(config, train_data_glob=train_data_glob, seed=42, sort_glob=True))
+    pipe.add_stage(train_ae_stage.TrainAEStage(config, train_data_glob=input_glob, seed=42, sort_glob=True))
    pipe.add_stage(preprocess_ae_stage.PreprocessAEStage(config))
     pipe.add_stage(AutoEncoderInferenceStage(config))
     pipe.add_stage(AddScoresStage(config))
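For reference, the `check` subcommand's exit status (0 when all large files are present, 1 otherwise) makes it easy to gate a fetch in a shell script. A minimal sketch, assuming it is run from the repository root and that `tests` and `validation` are the datasets needed:

```bash
#!/usr/bin/env bash
set -e

# Only fetch the unit-test datasets when some LFS files are still pointers;
# `scripts/fetch_data.py check` exits non-zero if any large file is missing.
if ! scripts/fetch_data.py check; then
    scripts/fetch_data.py fetch tests validation
fi
```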