Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Limit which lfs assets are pulled by default #139

Merged
44 commits merged into from
Jun 8, 2022
Merged
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
664a1a2
By default only pull the lfs files strictly needed to run morpheus
dagardner-nv Jun 1, 2022
b8009c7
Remove quotes
dagardner-nv Jun 1, 2022
a4fcb58
Fix format
dagardner-nv Jun 1, 2022
a68931d
Add helper method for performing 'git lfs pull'
dagardner-nv Jun 1, 2022
e4ca8e5
Return output from __git command
dagardner-nv Jun 1, 2022
761f447
Simple wrapper script around git lfs pull
dagardner-nv Jun 2, 2022
c0b3f70
Attempt to display git download progress
dagardner-nv Jun 2, 2022
dfee433
Config logging
dagardner-nv Jun 2, 2022
68dc1f3
Config logging
dagardner-nv Jun 2, 2022
70f502b
Use new fetch_data script to limit our lfs pull to just the test data
dagardner-nv Jun 2, 2022
b3e1374
flake8 reqs
dagardner-nv Jun 2, 2022
0970d41
Simplify fetch_data and lfsPull
dagardner-nv Jun 2, 2022
91a78ba
Slow tests need validation data in addition to the test data
dagardner-nv Jun 2, 2022
7c61622
Remove usage of training data, cuts down on the amount of lfs data ne…
dagardner-nv Jun 2, 2022
187beb3
Attempt to force an interactive shell
dagardner-nv Jun 3, 2022
44f8c83
Switch to using GIT_LFS_PROGRESS to track progress
dagardner-nv Jun 3, 2022
51dc0d2
Lower the poll interval
dagardner-nv Jun 3, 2022
125d24a
Adjust formatting
dagardner-nv Jun 3, 2022
76433c1
Try using curses
dagardner-nv Jun 3, 2022
8f6174e
Fix type-o
dagardner-nv Jun 3, 2022
f32e268
Switch to using curses for progress handling
dagardner-nv Jun 3, 2022
0a2175a
Only display the last line
dagardner-nv Jun 3, 2022
f4d56b0
Flake8 reqs
dagardner-nv Jun 3, 2022
f035e45
Remove out of date docstrings
dagardner-nv Jun 3, 2022
f121a1f
Add a complete message
dagardner-nv Jun 3, 2022
fc3947f
Pin to older neo
dagardner-nv Jun 3, 2022
37c1f82
Don't use curses when TERM is unset
dagardner-nv Jun 3, 2022
7255cfe
Document new fetch_data script
dagardner-nv Jun 3, 2022
919b7ea
Adding simple conda output before starting build
mdemoret-nv Jun 3, 2022
69db733
Revert "Pin to older neo"
dagardner-nv Jun 6, 2022
d6fd332
Manually ensure that the build is clean
dagardner-nv Jun 6, 2022
25b2018
Re-source the conda env
dagardner-nv Jun 6, 2022
cc4686e
Move fetch_data.py from ci/scripts to the scripts dir, update doc ref…
dagardner-nv Jun 7, 2022
62ba319
Use GIT_LFS_FORCE_PROGRESS to remove the need for curses and a tempor…
dagardner-nv Jun 7, 2022
148962a
Fix out of date docstring
dagardner-nv Jun 7, 2022
a25f16c
Add missing dep for yapf
dagardner-nv Jun 7, 2022
2b99914
Fix indenting to pass style checks
dagardner-nv Jun 7, 2022
9dc9efb
Implement check sub-command
dagardner-nv Jun 7, 2022
53eb653
Update fetch command
dagardner-nv Jun 7, 2022
17c41ce
Update documentation to reflect changes to fetch_data.py script
dagardner-nv Jun 7, 2022
8a8b864
Formatting fixes
dagardner-nv Jun 7, 2022
dccf41b
Merge branch 'branch-22.06' into david-lfs-config
dagardner-nv Jun 7, 2022
789f1c9
Updates to the check subcommand
dagardner-nv Jun 8, 2022
aa8a6f0
Quick style cleanup.
mdemoret-nv Jun 8, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .lfsconfig
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[lfs]
fetchinclude = morpheus/data/*
110 changes: 110 additions & 0 deletions ci/scripts/fetch_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import logging
import os
import subprocess
import tempfile
import time
from curses import wrapper

# Maps a user-facing data set name to the git-lfs include glob that fetches
# it (passed to `git lfs pull -I <glob>` in lfsPull).
LFS_DATASETS = {
    'all': '**',
    'examples': 'examples/**',
    'models': 'models/**',
    'tests': 'tests/**',
    'validation': 'models/datasets/validation-data/**'
}


def print_line(stdscr, max_x, last_print_len, line):
    """
    Draw `line` on the first row of the curses window `stdscr`, truncated to
    the window width `max_x`.

    Parameters
    ----------
    stdscr : curses window to draw on.
    max_x : width of the window in columns.
    last_print_len : number of characters drawn by the previous call; used to
        decide whether the row must be cleared first.
    line : text to display.

    Returns the number of characters actually drawn, to be passed back as
    `last_print_len` on the next call.
    """
    print_len = min(len(line), max_x)

    # If the new line is shorter than the previous one, clear the row first
    # so trailing characters from the longer line don't linger on screen.
    if print_len < last_print_len:
        stdscr.move(0, 0)
        stdscr.clrtoeol()

    # Bug fix: the original called stdscr.addstr(0, 0, line, print_len),
    # but addstr's fourth argument is an attribute bitmask, not a length --
    # the line was never truncated and print_len was misapplied as an attr.
    # addnstr(y, x, str, n) is the call that limits output to n characters.
    stdscr.addnstr(0, 0, line, print_len)
    stdscr.refresh()
    return print_len


def lfsPull(stdscr, include_paths, poll_interval=0.1):
    """
    Perform a `git lfs pull` restricted to `include_paths`.

    The pull can take upwards of a minute to complete, so the
    GIT_LFS_PROGRESS hook is pointed at a temporary file which is tailed
    while the command executes; the most recent progress line is echoed to
    the curses screen.

    Parameters
    ----------
    stdscr : curses window used to display download progress.
    include_paths : iterable of glob patterns joined into `git lfs pull -I`.
    poll_interval : seconds to sleep between polls of the progress file.

    Returns the combined stdout/stderr of the git command. Raises
    `subprocess.CalledProcessError` if git exits non-zero.
    """
    # include_paths come from the fixed LFS_DATASETS table, so interpolating
    # them into a shell string is safe; don't route untrusted input here.
    cmd = 'git lfs pull -I "{}"'.format(','.join(include_paths))
    with tempfile.NamedTemporaryFile() as progress_file:
        env = os.environ.copy()
        env['GIT_LFS_PROGRESS'] = progress_file.name

        # NOTE(review): stdout is a pipe that is only drained after the
        # process exits; if git ever writes more than the OS pipe buffer this
        # will deadlock -- confirm git-lfs output stays small, or switch to
        # streaming reads.
        popen = subprocess.Popen(cmd,
                                 env=env,
                                 shell=True,
                                 universal_newlines=True,
                                 stderr=subprocess.STDOUT,
                                 stdout=subprocess.PIPE)

        (_, max_x) = stdscr.getmaxyx()
        last_print_len = 0

        returncode = None
        while returncode is None:
            time.sleep(poll_interval)
            returncode = popen.poll()

            # Drain any progress lines written since the last read; the file
            # handle's position persists across readlines() calls, so this
            # tails the file. Polling the process *before* reading (unlike
            # the original order) guarantees the final progress update
            # written just before exit is not dropped.
            progress_lines = progress_file.readlines()
            if progress_lines:
                line = progress_lines[-1].decode("UTF8")
                last_print_len = print_line(stdscr, max_x, last_print_len, line)

        # Collect whatever git wrote to stdout/stderr (merged above).
        output = popen.stdout.read().rstrip("\n")

        if returncode != 0:
            raise subprocess.CalledProcessError(returncode=returncode, cmd=cmd, output=output)

    # Blank line separates the curses progress display from the summary.
    logging.info('')
    if output != '':
        logging.info(output)
    else:
        logging.info("Done.")

    return output


def parse_args():
    """Define and evaluate the command-line interface for this script."""
    parser = argparse.ArgumentParser("Fetches data not included in the repository by default")
    parser.add_argument("data_set",
                        nargs='*',
                        choices=LFS_DATASETS.keys(),
                        help="Data set to fetch")
    return parser.parse_args()


def main(stdscr):
    """
    Entry point invoked via curses.wrapper: resolve the requested data set
    names to lfs include globs and pull them.
    """
    args = parse_args()
    logging.basicConfig(level=logging.INFO, format="%(message)s")

    globs = [LFS_DATASETS[name] for name in args.data_set]
    lfsPull(stdscr, globs)


if __name__ == "__main__":
    # curses.wrapper initializes/restores the terminal and passes the
    # screen object through to main().
    wrapper(main)
2 changes: 1 addition & 1 deletion ci/scripts/jenkins/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ mamba install -q -y -c conda-forge "git-lfs=3.1.4"
gpuci_logger "Pulling LFS assets"
cd ${MORPHEUS_ROOT}
git lfs install
git lfs pull
${MORPHEUS_ROOT}/ci/scripts/fetch_data.py tests validation

pip install -e ${MORPHEUS_ROOT}

Expand Down
2 changes: 1 addition & 1 deletion docker/conda/environments/cuda11.4_dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ dependencies:
- isort
- mlflow>=1.23
- myst-parser==0.17
- neo 22.04.*
- neo 22.04.00a*
dagardner-nv marked this conversation as resolved.
Show resolved Hide resolved
dagardner-nv marked this conversation as resolved.
Show resolved Hide resolved
- ninja=1.10
- nodejs=17.4.0
- pandas=1.3
Expand Down
6 changes: 2 additions & 4 deletions tests/test_hammah.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,14 +74,13 @@ def test_hammah_roleg(mock_ae, config, tmp_path):
config.ae.feature_columns = [x.strip() for x in fh.readlines()]

input_glob = os.path.join(TEST_DIRS.validation_data_dir, "hammah-*.csv")
train_data_glob = os.path.join(TEST_DIRS.training_data_dir, "hammah-*.csv")
out_file = os.path.join(tmp_path, 'results.csv')
val_file_name = os.path.join(TEST_DIRS.validation_data_dir, 'hammah-role-g-validation-data.csv')
results_file_name = os.path.join(tmp_path, 'results.json')

pipe = LinearPipeline(config)
pipe.set_source(CloudTrailSourceStage(config, input_glob=input_glob, sort_glob=True))
pipe.add_stage(train_ae_stage.TrainAEStage(config, train_data_glob=train_data_glob, seed=42, sort_glob=True))
pipe.add_stage(train_ae_stage.TrainAEStage(config, train_data_glob=input_glob, seed=42, sort_glob=True))
pipe.add_stage(preprocess_ae_stage.PreprocessAEStage(config))
pipe.add_stage(AutoEncoderInferenceStage(config))
pipe.add_stage(AddScoresStage(config))
Expand Down Expand Up @@ -148,14 +147,13 @@ def test_hammah_user123(mock_ae, config, tmp_path):
config.ae.feature_columns = [x.strip() for x in fh.readlines()]

input_glob = os.path.join(TEST_DIRS.validation_data_dir, "hammah-*.csv")
train_data_glob = os.path.join(TEST_DIRS.training_data_dir, "hammah-*.csv")
out_file = os.path.join(tmp_path, 'results.csv')
val_file_name = os.path.join(TEST_DIRS.validation_data_dir, 'hammah-user123-validation-data.csv')
results_file_name = os.path.join(tmp_path, 'results.json')

pipe = LinearPipeline(config)
pipe.set_source(CloudTrailSourceStage(config, input_glob=input_glob, sort_glob=True))
pipe.add_stage(train_ae_stage.TrainAEStage(config, train_data_glob=train_data_glob, seed=42, sort_glob=True))
pipe.add_stage(train_ae_stage.TrainAEStage(config, train_data_glob=input_glob, seed=42, sort_glob=True))
pipe.add_stage(preprocess_ae_stage.PreprocessAEStage(config))
pipe.add_stage(AutoEncoderInferenceStage(config))
pipe.add_stage(AddScoresStage(config))
Expand Down