Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cmsTriton bug fixes and improvements #43814

Merged
merged 19 commits into from
Feb 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2501,7 +2501,7 @@ class UpgradeWorkflow_SonicTriton(UpgradeWorkflow):
def setup_(self, step, stepName, stepDict, k, properties):
stepDict[stepName][k] = merge([{'--procModifiers': 'allSonicTriton'}, stepDict[step][k]])
def condition(self, fragment, stepList, key, hasHarvest):
return (fragment=='TTbar_13' and '2021' in key) \
return ((fragment=='TTbar_13' or fragment=='TTbar_14TeV') and '2021' in key) \
or (fragment=='TTbar_14TeV' and '2026' in key)
upgradeWFs['SonicTriton'] = UpgradeWorkflow_SonicTriton(
steps = [
Expand Down
2 changes: 1 addition & 1 deletion Configuration/PyReleaseValidation/scripts/runTheMatrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def runSelected(opt):
ret = 0
if opt.show:
mrd.show(opt.testList, opt.extended, opt.cafVeto)
if opt.testList : print('testListected items:', opt.testList)
if opt.testList : print('selected items:', opt.testList)
else:
mRunnerHi = MatrixRunner(mrd.workFlows, opt.nProcs, opt.nThreads)
ret = mRunnerHi.runTests(opt)
Expand Down
2 changes: 2 additions & 0 deletions HeterogeneousCore/SonicTriton/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ The model information from the server can be printed by enabling `verbose` outpu
* `modelConfigPath`: path to `config.pbtxt` file for the model (using `edm::FileInPath`)
* `preferredServer`: name of preferred server, for testing (see [Services](#services) below)
* `timeout`: maximum allowed time for a request (disabled with 0)
* `timeoutUnit`: seconds, milliseconds, or microseconds (default: seconds)
* `outputs`: optional, specify which output(s) the server should send
* `verbose`: enable verbose printouts (default: false)
* `useSharedMemory`: enable use of shared memory (see [below](#shared-memory)) with local servers (default: true)
Expand Down Expand Up @@ -132,6 +133,7 @@ The script has three operations (`start`, `stop`, `check`) and the following opt
* `-C [dir]`: directory containing Nvidia compatibility drivers (checks CMSSW_BASE by default if available)
* `-D`: dry run: print container commands rather than executing them
* `-d`: use Docker instead of Apptainer
* `-E [path]`: include extra path(s) for executables (default: /cvmfs/oasis.opensciencegrid.org/mis/apptainer/current/bin)
* `-f`: force reuse of (possibly) existing container instance
* `-g`: use GPU instead of CPU
* `-i [name]`: server image name (default: fastml/triton-torchgeo:22.07-py3-geometric)
Expand Down
2 changes: 2 additions & 0 deletions HeterogeneousCore/SonicTriton/interface/TritonClient.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ class TritonClient : public SonicClient<TritonInputMap, TritonOutputMap> {
void resetBatchMode();
void reset() override;
TritonServerType serverType() const { return serverType_; }
bool isLocal() const { return isLocal_; }

//for fillDescriptions
static void fillPSetDescription(edm::ParameterSetDescription& iDesc);
Expand Down Expand Up @@ -78,6 +79,7 @@ class TritonClient : public SonicClient<TritonInputMap, TritonOutputMap> {
bool verbose_;
bool useSharedMemory_;
TritonServerType serverType_;
bool isLocal_;
grpc_compression_algorithm compressionAlgo_;
triton::client::Headers headers_;

Expand Down
2 changes: 1 addition & 1 deletion HeterogeneousCore/SonicTriton/interface/TritonException.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

class TritonException : public cms::Exception {
public:
explicit TritonException(std::string const& aCategory);
explicit TritonException(std::string const& aCategory, bool signal = false);
void convertToWarning() const;
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ class TritonMemResource {
uint8_t* addr() { return addr_; }
size_t size() const { return size_; }
virtual void close() {}
void closeSafe();
//used for input
virtual void copyInput(const void* values, size_t offset, unsigned entry) {}
//used for output
Expand Down
3 changes: 3 additions & 0 deletions HeterogeneousCore/SonicTriton/interface/TritonService.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <string>
#include <functional>
#include <utility>
#include <atomic>

#include "grpc_client.h"

Expand Down Expand Up @@ -112,6 +113,7 @@ class TritonService {
void addModel(const std::string& modelName, const std::string& path);
Server serverInfo(const std::string& model, const std::string& preferred = "") const;
const std::string& pid() const { return pid_; }
void notifyCallStatus(bool status) const;

static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);

Expand All @@ -132,6 +134,7 @@ class TritonService {
unsigned currentModuleId_;
bool allowAddModel_;
bool startedFallback_;
mutable std::atomic<int> callFails_;
std::string pid_;
std::unordered_map<std::string, Model> unservedModels_;
//this represents a many:many:many map
Expand Down
13 changes: 8 additions & 5 deletions HeterogeneousCore/SonicTriton/interface/triton_utils.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#ifndef HeterogeneousCore_SonicTriton_triton_utils
#define HeterogeneousCore_SonicTriton_triton_utils

#include "FWCore/Utilities/interface/Exception.h"
#include "FWCore/Utilities/interface/Span.h"
#include "HeterogeneousCore/SonicTriton/interface/TritonException.h"

Expand All @@ -19,6 +20,8 @@ namespace triton_utils {
bool checkType(inference::DataType dtype) {
return false;
}
//turn CMS exceptions into warnings
void convertToWarning(const cms::Exception& e);
} // namespace triton_utils

//explicit specializations (inlined)
Expand Down Expand Up @@ -72,11 +75,11 @@ inline bool triton_utils::checkType<double>(inference::DataType dtype) {

//helper to turn triton error into exception
//implemented as a macro to avoid constructing the MSG string for successful function calls
#define TRITON_THROW_IF_ERROR(X, MSG) \
{ \
triton::client::Error err = (X); \
if (!err.IsOk()) \
throw TritonException("TritonFailure") << (MSG) << (err.Message().empty() ? "" : ": " + err.Message()); \
#define TRITON_THROW_IF_ERROR(X, MSG, NOTIFY) \
{ \
triton::client::Error err = (X); \
if (!err.IsOk()) \
throw TritonException("TritonFailure", NOTIFY) << (MSG) << (err.Message().empty() ? "" : ": " + err.Message()); \
}

extern template std::string triton_utils::printColl(const edm::Span<std::vector<int64_t>::const_iterator>& coll,
Expand Down
50 changes: 33 additions & 17 deletions HeterogeneousCore/SonicTriton/scripts/cmsTriton
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ USEDOCKER=""
GPU=""
VERBOSE=""
VERBOSE_ARGS="--log-verbose=1 --log-error=1 --log-warning=1 --log-info=1"
WTIME=300
WTIME=600
SERVER=triton_server_instance
RETRIES=3
REPOS=()
Expand All @@ -23,6 +23,7 @@ NPORTS=3
IMAGE=fastml/triton-torchgeo:22.07-py3-geometric
SANDBOX=""
COMPAT_USR=""
EXTRAPATH=/cvmfs/oasis.opensciencegrid.org/mis/apptainer/current/bin

get_sandbox(){
if [ -z "$SANDBOX" ]; then
Expand All @@ -41,6 +42,7 @@ usage() {
$ECHO "-C [dir] \t directory containing Nvidia compatibility drivers (checks CMSSW_BASE by default if available)"
$ECHO "-D \t dry run: print container commands rather than executing them"
$ECHO "-d \t use Docker instead of Apptainer"
$ECHO "-E [path] \t include extra path(s) for executables (default: ${EXTRAPATH})"
$ECHO "-f \t force reuse of (possibly) existing container instance"
$ECHO "-g \t use GPU instead of CPU"
$ECHO "-i [name] \t server image name (default: ${IMAGE})"
Expand Down Expand Up @@ -131,6 +133,11 @@ else
TMPDIR=$(readlink -f $TMPDIR)
fi

# update path
if [ -n "$EXTRAPATH" ]; then
export PATH="${EXTRAPATH}:${PATH}"
fi

# find executables
if [ -n "$USEDOCKER" ]; then
if [ -z "$DOCKER" ]; then
Expand All @@ -149,7 +156,6 @@ else
fi
fi


SANDBOX=$(get_sandbox)
SANDBOX=$(readlink -f ${SANDBOX})
LOG="log_${SERVER}.log"
Expand All @@ -160,6 +166,12 @@ SEGFAULT_INDICATOR="Address already in use"
EXTRA=""
COMPAT_SCRIPT=/etc/shinit_v2

THREADCONTROL=""
# do not apply thread control settings if GPU use is requested
if [ "$INSTANCES" -gt 0 ] && [ -z "$GPU" ]; then
THREADCONTROL=true
fi

compute_ports(){
# compute derived port numbers
export HTTPPORT=$BASEPORT
Expand Down Expand Up @@ -341,7 +353,7 @@ wait_server(){
list_models(){
# make list of model repositories
LOCALMODELREPO="local_model_repo"
if [ "$INSTANCES" -gt 0 ]; then
if [ -n "$THREADCONTROL" ]; then
if [ -d "$TMPDIR/$LOCALMODELREPO" ]; then
#Want to start with a fresh copy of model files in case this directory already exists with local edits
rm -rf $TMPDIR/$LOCALMODELREPO
Expand All @@ -359,7 +371,7 @@ list_models(){
if [ -f "$MODEL" ]; then
MODEL="$(dirname "$MODEL")"
fi
if [ "$INSTANCES" -gt 0 ]; then
if [ -n "$THREADCONTROL" ]; then
$DRYRUN cmsTritonConfigTool threadcontrol -c ${MODEL}/config.pbtxt --copy $TMPDIR/$LOCALMODELREPO --nThreads $INSTANCES
TOOL_EXIT=$?
if [ "$TOOL_EXIT" -ne 0 ]; then
Expand All @@ -370,7 +382,7 @@ list_models(){
REPOS+=("$(dirname "$MODEL")")
fi
done
if [ "$INSTANCES" -gt 0 ]; then
if [ -n "$THREADCONTROL" ]; then
REPOS=$TMPDIR/$LOCALMODELREPO
else
for ((r=0; r < ${#REPOS[@]}; r++)); do
Expand All @@ -394,6 +406,7 @@ auto_stop(){
fi
PCOUNTER=0
PMAX=5
# builtin wait is not used here because it can only monitor a child process, not a parent process
while [ "$PCOUNTER" -le "$PMAX" ]; do
if ! kill -0 $PARENTPID >& /dev/null; then
PCOUNTER=$((PCOUNTER+1))
Expand All @@ -415,13 +428,11 @@ auto_stop(){
$STOP_FN

# move logs out of tmp dir
if [ -z "$DRYRUN" ]; then
if [ -n "$VERBOSE" ]; then
mv "$LOG" "$TOPDIR"
# only keep non-empty log
if [ -s "$STOPLOG" ]; then
mv "$STOPLOG" "$TOPDIR"
fi
if [ -z "$DRYRUN" ] && [ -n "$VERBOSE" ]; then
mv "$LOG" "$TOPDIR"
# only keep non-empty log
if [ -s "$STOPLOG" ]; then
mv "$STOPLOG" "$TOPDIR"
fi
fi

Expand Down Expand Up @@ -569,11 +580,16 @@ elif [ "$OP" == start ]; then
START_EXIT=0
for ((counter=0; counter < ${RETRIES}; counter++)); do
make_tmp
#If we plan on editing model configs, must repull files into /tmp/local_model_repo, which is deleted upon retry
if [ "$counter" -eq 0 ] || [ "$INSTANCES" -gt 0 ]; then list_models; fi
check_drivers
DRIVER_EXIT=$?
if [ "$DRIVER_EXIT" -ne 0 ]; then exit $DRIVER_EXIT; fi

# if we plan on editing model configs, must copy files into /tmp/local_model_repo, which is deleted upon retry
if [ "$counter" -eq 0 ] || [ -n "$THREADCONTROL" ]; then list_models; fi

# only need to check drivers if using GPU
if [ -n "$GPU" ]; then
check_drivers
DRIVER_EXIT=$?
if [ "$DRIVER_EXIT" -ne 0 ]; then exit $DRIVER_EXIT; fi
fi

$START_FN
START_EXIT=$?
Expand Down
11 changes: 8 additions & 3 deletions HeterogeneousCore/SonicTriton/scripts/cmsTritonConfigTool
Original file line number Diff line number Diff line change
Expand Up @@ -261,9 +261,11 @@ def cfg_checksum(args):
missing = []

from glob import glob
config_dir = os.path.dirname(args.config)
# evaluate symbolic links
config_dir = os.path.realpath(os.path.dirname(args.config))
for filename in glob(os.path.join(config_dir,"*/*")):
if os.path.islink(os.path.dirname(filename)): continue
# evaluate symbolic links again
filename = os.path.realpath(filename)
checksum = get_checksum(filename)
# key = algorithm:[filename relative to config.pbtxt dir]
filename = os.path.relpath(filename, config_dir)
Expand Down Expand Up @@ -324,10 +326,12 @@ def cfg_versioncheck(args):
missing = []

for path in os.environ['CMSSW_SEARCH_PATH'].split(':'):
for dirpath, dirnames, filenames in os.walk(path):
if args.verbose: print("Checking: "+path)
for dirpath, dirnames, filenames in os.walk(path, followlinks=True):
for filename in filenames:
if filename=="config.pbtxt":
filepath = os.path.join(dirpath,filename)
if args.verbose: print(filepath)
checksum_args = Namespace(
config=filepath, should_return=True,
copy=False, json=False, defaults=False, view=False,
Expand Down Expand Up @@ -435,6 +439,7 @@ if __name__=="__main__":
parser_checksum.set_defaults(func=cfg_checksum)

parser_versioncheck = subparsers.add_parser("versioncheck", parents=[_parser_checksum_update], help="check all model checksums")
parser_versioncheck.add_argument("--verbose", default=False, action="store_true", help="verbose output (show all files checked)")
parser_versioncheck.set_defaults(func=cfg_versioncheck)

_parser_copy_req = ArgumentParser(add_help=False, parents=[_parser_copy_view])
Expand Down
Loading