Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Limit which lfs assets are pulled by default #139

Merged
44 commits merged into from
Jun 8, 2022
Merged
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
664a1a2
By default only pull the lfs files strictly needed to run morpheus
dagardner-nv Jun 1, 2022
b8009c7
Remove quotes
dagardner-nv Jun 1, 2022
a4fcb58
Fix format
dagardner-nv Jun 1, 2022
a68931d
Add helper method for performing 'git lfs pull'
dagardner-nv Jun 1, 2022
e4ca8e5
Return output from __git command
dagardner-nv Jun 1, 2022
761f447
Simple wrapper script around git lfs pull
dagardner-nv Jun 2, 2022
c0b3f70
Attempt to display git download progress
dagardner-nv Jun 2, 2022
dfee433
Config logging
dagardner-nv Jun 2, 2022
68dc1f3
Config logging
dagardner-nv Jun 2, 2022
70f502b
Use new fetch_data script to limit our lfs pull to just the test data
dagardner-nv Jun 2, 2022
b3e1374
flake8 reqs
dagardner-nv Jun 2, 2022
0970d41
Simplify fetch_data and lfsPull
dagardner-nv Jun 2, 2022
91a78ba
Slow tests need validation data in addition to the test data
dagardner-nv Jun 2, 2022
7c61622
Remove usage of training data, cuts down on the amount of lfs data ne…
dagardner-nv Jun 2, 2022
187beb3
Attempt to force an interactive shell
dagardner-nv Jun 3, 2022
44f8c83
Switch to using GIT_LFS_PROGRESS to track progress
dagardner-nv Jun 3, 2022
51dc0d2
Lower the poll interval
dagardner-nv Jun 3, 2022
125d24a
Adjust formatting
dagardner-nv Jun 3, 2022
76433c1
Try using curses
dagardner-nv Jun 3, 2022
8f6174e
Fix type-o
dagardner-nv Jun 3, 2022
f32e268
Switch to using curses for progress handling
dagardner-nv Jun 3, 2022
0a2175a
Only display the last line
dagardner-nv Jun 3, 2022
f4d56b0
Flake8 reqs
dagardner-nv Jun 3, 2022
f035e45
Remove out of date docstrings
dagardner-nv Jun 3, 2022
f121a1f
Add a complete message
dagardner-nv Jun 3, 2022
fc3947f
Pin to older neo
dagardner-nv Jun 3, 2022
37c1f82
Don't use curses when TERM is unset
dagardner-nv Jun 3, 2022
7255cfe
Document new fetch_data script
dagardner-nv Jun 3, 2022
919b7ea
Adding simple conda output before starting build
mdemoret-nv Jun 3, 2022
69db733
Revert "Pin to older neo"
dagardner-nv Jun 6, 2022
d6fd332
Manually ensure that the build is clean
dagardner-nv Jun 6, 2022
25b2018
Re-source the conda env
dagardner-nv Jun 6, 2022
cc4686e
Move fetch_data.py from ci/scripts to the scripts dir, update doc ref…
dagardner-nv Jun 7, 2022
62ba319
Use GIT_LFS_FORCE_PROGRESS to remove the need for curses and a tempor…
dagardner-nv Jun 7, 2022
148962a
Fix out of date docstring
dagardner-nv Jun 7, 2022
a25f16c
Add missing dep for yapf
dagardner-nv Jun 7, 2022
2b99914
Fix indenting to pass style checks
dagardner-nv Jun 7, 2022
9dc9efb
Implement check sub-command
dagardner-nv Jun 7, 2022
53eb653
Update fetch command
dagardner-nv Jun 7, 2022
17c41ce
Update documentation to reflect changes to fetch_data.py script
dagardner-nv Jun 7, 2022
8a8b864
Formatting fixes
dagardner-nv Jun 7, 2022
dccf41b
Merge branch 'branch-22.06' into david-lfs-config
dagardner-nv Jun 7, 2022
789f1c9
Updates to the check subcommand
dagardner-nv Jun 8, 2022
aa8a6f0
Quick style cleanup.
mdemoret-nv Jun 8, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .lfsconfig
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[lfs]
fetchinclude = morpheus/data/*
110 changes: 110 additions & 0 deletions ci/scripts/fetch_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import logging
import os
import subprocess
import tempfile
import time
from curses import wrapper

# Maps a user-facing data set name to the git-lfs include glob that fetches
# it (passed to `git lfs pull -I <glob>` in lfsPull).
LFS_DATASETS = {
    'all': '**',
    'examples': 'examples/**',
    'models': 'models/**',
    'tests': 'tests/**',
    'validation': 'models/datasets/validation-data/**'
}


def print_line(stdscr, max_x, last_print_len, line):
    """
    Draw `line` on the first row of the curses window `stdscr`, truncated to
    the window width `max_x`.

    Parameters
    ----------
    stdscr : curses window to draw on.
    max_x : width of the window in columns.
    last_print_len : number of characters drawn by the previous call; used to
        decide whether the row must be cleared first.
    line : text to display.

    Returns the number of characters actually drawn, to be passed back as
    `last_print_len` on the next call.
    """
    print_len = min(len(line), max_x)

    # If the new line is shorter than the previous one, clear the row first
    # so trailing characters from the longer line don't linger on screen.
    if print_len < last_print_len:
        stdscr.move(0, 0)
        stdscr.clrtoeol()

    # Bug fix: the original called stdscr.addstr(0, 0, line, print_len),
    # but addstr's fourth argument is an attribute bitmask, not a length --
    # the line was never truncated and print_len was misapplied as an attr.
    # addnstr(y, x, str, n) is the call that limits output to n characters.
    stdscr.addnstr(0, 0, line, print_len)
    stdscr.refresh()
    return print_len


def lfsPull(stdscr, include_paths, poll_interval=0.1):
    """
    Perform a `git lfs pull` restricted to `include_paths`.

    The pull can take upwards of a minute to complete, so the
    GIT_LFS_PROGRESS hook is pointed at a temporary file which is tailed
    while the command executes; the most recent progress line is echoed to
    the curses screen.

    Parameters
    ----------
    stdscr : curses window used to display download progress.
    include_paths : iterable of glob patterns joined into `git lfs pull -I`.
    poll_interval : seconds to sleep between polls of the progress file.

    Returns the combined stdout/stderr of the git command. Raises
    `subprocess.CalledProcessError` if git exits non-zero.
    """
    # include_paths come from the fixed LFS_DATASETS table, so interpolating
    # them into a shell string is safe; don't route untrusted input here.
    cmd = 'git lfs pull -I "{}"'.format(','.join(include_paths))
    with tempfile.NamedTemporaryFile() as progress_file:
        env = os.environ.copy()
        env['GIT_LFS_PROGRESS'] = progress_file.name

        # NOTE(review): stdout is a pipe that is only drained after the
        # process exits; if git ever writes more than the OS pipe buffer this
        # will deadlock -- confirm git-lfs output stays small, or switch to
        # streaming reads.
        popen = subprocess.Popen(cmd,
                                 env=env,
                                 shell=True,
                                 universal_newlines=True,
                                 stderr=subprocess.STDOUT,
                                 stdout=subprocess.PIPE)

        (_, max_x) = stdscr.getmaxyx()
        last_print_len = 0

        returncode = None
        while returncode is None:
            time.sleep(poll_interval)
            returncode = popen.poll()

            # Drain any progress lines written since the last read; the file
            # handle's position persists across readlines() calls, so this
            # tails the file. Polling the process *before* reading (unlike
            # the original order) guarantees the final progress update
            # written just before exit is not dropped.
            progress_lines = progress_file.readlines()
            if progress_lines:
                line = progress_lines[-1].decode("UTF8")
                last_print_len = print_line(stdscr, max_x, last_print_len, line)

        # Collect whatever git wrote to stdout/stderr (merged above).
        output = popen.stdout.read().rstrip("\n")

        if returncode != 0:
            raise subprocess.CalledProcessError(returncode=returncode, cmd=cmd, output=output)

    # Blank line separates the curses progress display from the summary.
    logging.info('')
    if output != '':
        logging.info(output)
    else:
        logging.info("Done.")

    return output


def parse_args():
    """Define and evaluate the command-line interface for this script."""
    parser = argparse.ArgumentParser("Fetches data not included in the repository by default")
    parser.add_argument("data_set",
                        nargs='*',
                        choices=LFS_DATASETS.keys(),
                        help="Data set to fetch")
    return parser.parse_args()


def main(stdscr):
    """
    Entry point invoked via curses.wrapper: resolve the requested data set
    names to lfs include globs and pull them.
    """
    args = parse_args()
    logging.basicConfig(level=logging.INFO, format="%(message)s")

    globs = [LFS_DATASETS[name] for name in args.data_set]
    lfsPull(stdscr, globs)


if __name__ == "__main__":
    # curses.wrapper initializes/restores the terminal and passes the
    # screen object through to main().
    wrapper(main)
2 changes: 1 addition & 1 deletion ci/scripts/jenkins/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ mamba install -q -y -c conda-forge "git-lfs=3.1.4"
gpuci_logger "Pulling LFS assets"
cd ${MORPHEUS_ROOT}
git lfs install
git lfs pull
${MORPHEUS_ROOT}/ci/scripts/fetch_data.py tests validation

pip install -e ${MORPHEUS_ROOT}

Expand Down
2 changes: 1 addition & 1 deletion docker/conda/environments/cuda11.4_dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ dependencies:
- isort
- mlflow>=1.23
- myst-parser==0.17
- neo 22.04.*
- neo 22.04.00a*
dagardner-nv marked this conversation as resolved.
Show resolved Hide resolved
dagardner-nv marked this conversation as resolved.
Show resolved Hide resolved
- ninja=1.10
- nodejs=17.4.0
- pandas=1.3
Expand Down
6 changes: 2 additions & 4 deletions tests/test_hammah.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,14 +74,13 @@ def test_hammah_roleg(mock_ae, config, tmp_path):
config.ae.feature_columns = [x.strip() for x in fh.readlines()]

input_glob = os.path.join(TEST_DIRS.validation_data_dir, "hammah-*.csv")
train_data_glob = os.path.join(TEST_DIRS.training_data_dir, "hammah-*.csv")
out_file = os.path.join(tmp_path, 'results.csv')
val_file_name = os.path.join(TEST_DIRS.validation_data_dir, 'hammah-role-g-validation-data.csv')
results_file_name = os.path.join(tmp_path, 'results.json')

pipe = LinearPipeline(config)
pipe.set_source(CloudTrailSourceStage(config, input_glob=input_glob, sort_glob=True))
pipe.add_stage(train_ae_stage.TrainAEStage(config, train_data_glob=train_data_glob, seed=42, sort_glob=True))
pipe.add_stage(train_ae_stage.TrainAEStage(config, train_data_glob=input_glob, seed=42, sort_glob=True))
pipe.add_stage(preprocess_ae_stage.PreprocessAEStage(config))
pipe.add_stage(AutoEncoderInferenceStage(config))
pipe.add_stage(AddScoresStage(config))
Expand Down Expand Up @@ -148,14 +147,13 @@ def test_hammah_user123(mock_ae, config, tmp_path):
config.ae.feature_columns = [x.strip() for x in fh.readlines()]

input_glob = os.path.join(TEST_DIRS.validation_data_dir, "hammah-*.csv")
train_data_glob = os.path.join(TEST_DIRS.training_data_dir, "hammah-*.csv")
out_file = os.path.join(tmp_path, 'results.csv')
val_file_name = os.path.join(TEST_DIRS.validation_data_dir, 'hammah-user123-validation-data.csv')
results_file_name = os.path.join(tmp_path, 'results.json')

pipe = LinearPipeline(config)
pipe.set_source(CloudTrailSourceStage(config, input_glob=input_glob, sort_glob=True))
pipe.add_stage(train_ae_stage.TrainAEStage(config, train_data_glob=train_data_glob, seed=42, sort_glob=True))
pipe.add_stage(train_ae_stage.TrainAEStage(config, train_data_glob=input_glob, seed=42, sort_glob=True))
pipe.add_stage(preprocess_ae_stage.PreprocessAEStage(config))
pipe.add_stage(AutoEncoderInferenceStage(config))
pipe.add_stage(AddScoresStage(config))
Expand Down