From 1a1be1ab92f25f7f99504a4218ed87de4f69f1e0 Mon Sep 17 00:00:00 2001
From: David Gardner <96306125+dagardner-nv@users.noreply.github.com>
Date: Wed, 8 Jun 2022 15:17:10 -0700
Subject: [PATCH] Limit which lfs assets are pulled by default (#139)

* Adds helper script `scripts/fetch_data.py` to pull additional assets.
* Updates `README.md` & `CONTRIBUTING.md` docs to reflect the changes.
* Updates the hammah unittest to no longer depend on training data.
* ~~Pinned Neo dep to 22.04.00a to work-around incompatible neo version~~

```
$ scripts/fetch_data.py -h
usage: Fetches data not included in the repository by default [-h] {fetch,check} ...

optional arguments:
  -h, --help     show this help message and exit

Subcommands:
  valid subcommands

  {fetch,check}
    fetch        Fetch datasets
    check        Check download status of large files. Displays a True/False
                 whether all files are downloaded.
```

Closes #88

Authors:
  - David Gardner (https://github.com/dagardner-nv)
  - Michael Demoret (https://github.com/mdemoret-nv)

Approvers:
  - Bartley Richardson (https://github.com/BartleyR)
  - Michael Demoret (https://github.com/mdemoret-nv)

URL: https://github.com/nv-morpheus/Morpheus/pull/139
---
 .lfsconfig                                 |   2 +
 CONTRIBUTING.md                            |  26 +++-
 ci/scripts/jenkins/build.sh                |   3 +
 ci/scripts/jenkins/test.sh                 |   2 +-
 docker/conda/environments/cuda11.4_dev.yml |   1 +
 models/mlflow/README.md                    |   8 +-
 scripts/fetch_data.py                      | 142 +++++++++++++++++++++
 tests/test_hammah.py                       |   6 +-
 8 files changed, 179 insertions(+), 11 deletions(-)
 create mode 100644 .lfsconfig
 create mode 100755 scripts/fetch_data.py

diff --git a/.lfsconfig b/.lfsconfig
new file mode 100644
index 0000000000..9fdb0464ae
--- /dev/null
+++ b/.lfsconfig
@@ -0,0 +1,2 @@
+[lfs]
+    fetchinclude = morpheus/data/*
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index c06f2780d4..e63074e864 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -65,9 +65,31 @@ cd $MORPHEUS_ROOT
 ```
 
 The large model and data files in this repo are stored using [Git Large File Storage (LFS)](https://git-lfs.github.com/). These files will be required for running the training/validation scripts and example pipelines for the Morpheus pre-trained models.
+By default, only the files stored in LFS that are strictly needed for running Morpheus are included when the Morpheus repository is cloned. Additional datasets can be downloaded using the `scripts/fetch_data.py` script. Usage of the script is as follows:
+```bash
+scripts/fetch_data.py fetch <dataset> [<dataset>...]
+```
+
+At the time of writing, the defined datasets are:
+* all - Meta-dataset that includes all of the others
+* examples - Data needed by scripts in the `examples` subdir
+* models - Morpheus models (largest dataset)
+* tests - Data used by unit tests
+* validation - Subset of the models dataset needed by some unit tests
+
+To download just the examples and models datasets:
+```bash
+scripts/fetch_data.py fetch examples models
+```
+
+To download the data needed for the unit tests:
+```bash
+scripts/fetch_data.py fetch tests validation
+```
+
 If `Git LFS` is not installed before cloning the repository, the large files will not be pulled. If this is the case, follow the instructions for installing `Git LFS` from [here](https://git-lfs.github.com/), and then run the following command:
 ```bash
-git lfs pull
+scripts/fetch_data.py fetch all
 ```
 
 ### Build in Docker Container
@@ -94,7 +116,7 @@ This workflow utilizes a docker container to set up most dependencies ensuring a
     ```shell
     DOCKER_TARGET=development_pydbg ./docker/build_container_dev.sh
     ```
 1. Note: When debugging python code, you just need to add `ci/conda/recipes/python-dbg/source` to your debugger's source path.
 1. Once created, you will be able to introspect python objects from within GDB. For example, if we were to break within a generator setup call and examine its PyFrame_Object `f`, it might look like this:
diff --git a/ci/scripts/jenkins/build.sh b/ci/scripts/jenkins/build.sh
index 31e505da44..97b3490f82 100755
--- a/ci/scripts/jenkins/build.sh
+++ b/ci/scripts/jenkins/build.sh
@@ -82,6 +82,9 @@ gpuci_logger "Installing other dependencies"
 mamba env update -q -n morpheus -f ${MORPHEUS_ROOT}/docker/conda/environments/cuda${CUDA_VER}_dev.yml
 conda deactivate && conda activate morpheus
 
+gpuci_logger "Final Conda Environment"
+conda list
+
 gpuci_logger "Check cmake & ninja"
 cmake --version
 ninja --version
diff --git a/ci/scripts/jenkins/test.sh b/ci/scripts/jenkins/test.sh
index 93ae07bf6c..08fb6fbbc9 100755
--- a/ci/scripts/jenkins/test.sh
+++ b/ci/scripts/jenkins/test.sh
@@ -49,7 +49,7 @@ mamba install -q -y -c conda-forge "git-lfs=3.1.4"
 gpuci_logger "Pulling LFS assets"
 cd ${MORPHEUS_ROOT}
 git lfs install
-git lfs pull
+${MORPHEUS_ROOT}/scripts/fetch_data.py fetch tests validation
 
 pip install -e ${MORPHEUS_ROOT}
diff --git a/docker/conda/environments/cuda11.4_dev.yml b/docker/conda/environments/cuda11.4_dev.yml
index 6620542ccd..1000407d8e 100644
--- a/docker/conda/environments/cuda11.4_dev.yml
+++ b/docker/conda/environments/cuda11.4_dev.yml
@@ -69,6 +69,7 @@ dependencies:
   - sphinx_rtd_theme
   - sysroot_linux-64=2.17
   - tqdm
+  - yapf=0.32.0
 ####### Morpheus Pip Dependencies (keep sorted!) #######
   - pip:
       # Add additional dev dependencies here
diff --git a/models/mlflow/README.md b/models/mlflow/README.md
index 7271a7bcf6..766461c821 100644
--- a/models/mlflow/README.md
+++ b/models/mlflow/README.md
@@ -82,10 +82,10 @@ nohup mlflow server --backend-store-uri sqlite:////tmp/mlflow-db.sqlite --defaul
 
 The Morpheus reference models can be found in the [Morpheus](https://github.com/NVIDIA/Morpheus) repo.
 
-```
-git clone https://github.com/NVIDIA/Morpheus.git
-cd morpheus/models
-git lfs pull
+```bash
+git clone https://github.com/NVIDIA/Morpheus.git morpheus
+cd morpheus
+scripts/fetch_data.py fetch models
 ```
 
 ## Publish reference models to MLflow
diff --git a/scripts/fetch_data.py b/scripts/fetch_data.py
new file mode 100755
index 0000000000..2749c7f42b
--- /dev/null
+++ b/scripts/fetch_data.py
@@ -0,0 +1,142 @@
+#!/usr/bin/env python3
+# SPDX-FileCopyrightText: Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import logging
+import os
+import subprocess
+import sys
+import time
+
+LFS_DATASETS = {
+    'all': '**',
+    'examples': 'examples/**',
+    'models': 'models/**',
+    'tests': 'tests/**',
+    'validation': 'models/datasets/validation-data/**'
+}
+
+
+def lfsPull(include_paths, poll_interval=0.1):
+    """
+    Performs a git lfs pull.
+    """
+    cmd = 'git lfs pull -I "{}"'.format(','.join(include_paths))
+    env = os.environ.copy()
+
+    # Instruct git lfs not to suppress progress output. Fetching the models can
+    # take over a minute to complete, so we want our users to receive feedback.
+    env['GIT_LFS_FORCE_PROGRESS'] = '1'
+    popen = subprocess.Popen(cmd,
+                             env=env,
+                             shell=True,
+                             universal_newlines=True,
+                             stderr=subprocess.STDOUT,
+                             stdout=subprocess.PIPE)
+
+    outpipe = popen.stdout
+    returncode = None
+    all_out = []
+    while returncode is None:
+        time.sleep(poll_interval)
+        out = outpipe.readline()
+        if out.rstrip() != '':
+            logging.info(out.rstrip())
+            all_out.append(out)
+
+        returncode = popen.poll()
+
+    # Check if we have any additional output written to the pipe before our last poll
+    out = outpipe.read()
+    if out != '':
+        all_out.append(out)
+
+    output = ''.join(all_out).rstrip("\n")
+    if returncode != 0:
+        logging.error(output)
+        raise subprocess.CalledProcessError(returncode=returncode, cmd=cmd, output=output)
+
+    return output
+
+
+def lfsCheck(list_all=False):
+    output = subprocess.check_output('git lfs ls-files', shell=True, universal_newlines=True)
+    output_lines = output.splitlines()
+
+    # Output lines are in the format of:
+    # <oid> [-|*] <file name>
+    # where '-' indicates a file pointer and '*' indicates a downloaded file
+    # https://github.com/git-lfs/git-lfs/blob/main/docs/man/git-lfs-ls-files.1.ronn
+    missing_files = []
+    for file_status in output_lines:
+        parts = file_status.split()
+        downloaded = parts[1] == '*'
+        # the join on parts[2:] is needed to handle file names that contain a blank space
+        filename = ' '.join(parts[2:])
+
+        if not downloaded:
+            missing_files.append(filename)
+
+        if list_all:
+            logging.info('%s - %s', filename, downloaded)
+
+    if not list_all:
+        if len(missing_files):
+            logging.error("Missing the following LFS files:\n%s", "\n".join(missing_files))
+        else:
+            logging.info("All LFS files downloaded")
+
+    if len(missing_files):
+        sys.exit(1)
+
+
+def parse_args():
+    argparser = argparse.ArgumentParser("Fetches data not included in the repository by default")
+    subparsers = argparser.add_subparsers(title='Subcommands',
+                                          description='valid subcommands',
+                                          required=True,
+                                          dest='subcommand')
+
+    fetch_parser = subparsers.add_parser('fetch', help='Fetch datasets')
+    fetch_parser.add_argument("data_set", nargs='*', choices=list(LFS_DATASETS.keys()), help="Data set to fetch")
+
+    check_parser = subparsers.add_parser('check',
+                                         help=('Check download status of large files. Exits with a status of 0 if all '
+                                               'large files have been downloaded, 1 otherwise.'))
+    check_parser.add_argument("-l",
+                              "--list",
+                              action="store_true",
+                              default=False,
+                              dest='list_all',
+                              help="List the download status of all LFS files, not just the missing ones")
+
+    args = argparser.parse_args()
+    return args
+
+
+def main():
+    args = parse_args()
+    logging.basicConfig(level=logging.INFO, format="%(message)s")
+
+    if args.subcommand == 'fetch':
+        include_paths = [LFS_DATASETS[p] for p in args.data_set]
+        lfsPull(include_paths)
+    else:
+        lfsCheck(list_all=args.list_all)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/test_hammah.py b/tests/test_hammah.py
index 5e8e275d4f..c778a4d1b6 100755
--- a/tests/test_hammah.py
+++ b/tests/test_hammah.py
@@ -73,14 +73,13 @@ def test_hammah_roleg(mock_ae, config, tmp_path):
     config.ae.feature_columns = [x.strip() for x in fh.readlines()]
 
     input_glob = os.path.join(TEST_DIRS.validation_data_dir, "hammah-*.csv")
-    train_data_glob = os.path.join(TEST_DIRS.training_data_dir, "hammah-*.csv")
     out_file = os.path.join(tmp_path, 'results.csv')
     val_file_name = os.path.join(TEST_DIRS.validation_data_dir, 'hammah-role-g-validation-data.csv')
     results_file_name = os.path.join(tmp_path, 'results.json')
 
     pipe = LinearPipeline(config)
     pipe.set_source(CloudTrailSourceStage(config, input_glob=input_glob, sort_glob=True))
-    pipe.add_stage(train_ae_stage.TrainAEStage(config, train_data_glob=train_data_glob, seed=42, sort_glob=True))
+    pipe.add_stage(train_ae_stage.TrainAEStage(config, train_data_glob=input_glob, seed=42, sort_glob=True))
     pipe.add_stage(preprocess_ae_stage.PreprocessAEStage(config))
     pipe.add_stage(AutoEncoderInferenceStage(config))
     pipe.add_stage(AddScoresStage(config))
@@ -147,14 +146,13 @@ def test_hammah_user123(mock_ae, config, tmp_path):
     config.ae.feature_columns = [x.strip() for x in fh.readlines()]
 
     input_glob = os.path.join(TEST_DIRS.validation_data_dir, "hammah-*.csv")
-    train_data_glob = os.path.join(TEST_DIRS.training_data_dir, "hammah-*.csv")
     out_file = os.path.join(tmp_path, 'results.csv')
     val_file_name = os.path.join(TEST_DIRS.validation_data_dir, 'hammah-user123-validation-data.csv')
     results_file_name = os.path.join(tmp_path, 'results.json')
 
     pipe = LinearPipeline(config)
     pipe.set_source(CloudTrailSourceStage(config, input_glob=input_glob, sort_glob=True))
-    pipe.add_stage(train_ae_stage.TrainAEStage(config, train_data_glob=train_data_glob, seed=42, sort_glob=True))
+    pipe.add_stage(train_ae_stage.TrainAEStage(config, train_data_glob=input_glob, seed=42, sort_glob=True))
    pipe.add_stage(preprocess_ae_stage.PreprocessAEStage(config))
     pipe.add_stage(AutoEncoderInferenceStage(config))
     pipe.add_stage(AddScoresStage(config))
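For reference, the `check` subcommand's exit status (0 when all large files are present, 1 otherwise) makes it easy to gate a fetch in a shell script. A minimal sketch, assuming it is run from the repository root and that `tests` and `validation` are the datasets needed:

```bash
#!/usr/bin/env bash
set -e

# Only fetch the unit-test datasets when some LFS files are still pointers;
# `scripts/fetch_data.py check` exits non-zero if any large file is missing.
if ! scripts/fetch_data.py check; then
    scripts/fetch_data.py fetch tests validation
fi
```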