diff --git a/.pylintrc b/.pylintrc index 428b85fabe..691cefcd74 100644 --- a/.pylintrc +++ b/.pylintrc @@ -1,27 +1,71 @@ -[MASTER] +[MAIN] + +# Analyse import fallback blocks. This can be used to support both Python 2 and +# 3 compatible code, which means that the block might have code that exists +# only in one or another interpreter, leading to false positives when analysed. +analyse-fallback-blocks=no + +# Load and enable all available extensions. Use --list-extensions to see a list +# all available extensions. +#enable-all-extensions= + +# In error mode, messages with a category besides ERROR or FATAL are +# suppressed, and no reports are done by default. Error mode is compatible with +# disabling specific errors. +#errors-only= + +# Always return a 0 (non-error) status code, even if lint errors are found. +# This is primarily useful in continuous integration scripts. +#exit-zero= # A comma-separated list of package or module names from where C extensions may # be loaded. Extensions are loading into the active Python interpreter and may # run arbitrary code. extension-pkg-allow-list=srf,morpheus._lib +# A comma-separated list of package or module names from where C extensions may +# be loaded. Extensions are loading into the active Python interpreter and may +# run arbitrary code. (This is an alternative name to extension-pkg-allow-list +# for backward compatibility.) +extension-pkg-whitelist= + +# Return non-zero exit code if any of these messages/categories are detected, +# even if score is above --fail-under value. Syntax same as enable. Messages +# specified are enabled, while categories only check already-enabled messages. +fail-on= + # Specify a score threshold to be exceeded before program exits with error. fail-under=10 -# Add files or directories to the blacklist. They should be base names, not -# paths. +# Interpret the stdin as a python script, whose filename needs to be passed as +# the module_or_package argument. +#from-stdin= + +# Files or directories to be skipped. They should be base names, not paths. ignore=CVS -# Add files or directories matching the regex patterns to the blacklist. The -# regex matches against base names, not paths. -ignore-patterns= +# Add files or directories matching the regex patterns to the ignore-list. The +# regex matches against paths and can be in Posix or Windows format. +ignore-paths= + +# Files or directories matching the regex patterns are skipped. The regex +# matches against base names, not paths. The default value ignores Emacs file +# locks +ignore-patterns=^\.# + +# List of module names for which member attributes should not be checked +# (useful for modules/projects where namespaces are manipulated during runtime +# and thus existing member attributes cannot be deduced by static analysis). It +# supports qualified module names, as well as Unix pattern matching. +ignored-modules= # Python code to execute, usually for sys.path manipulation such as # pygtk.require(). #init-hook= # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the -# number of processors available to use. +# number of processors available to use, and will cap the count on Windows to +# avoid hangs. jobs=1 # Control the amount of potential inferred values when inferring a single @@ -31,11 +75,18 @@ limit-inference-results=100 # List of plugins (as comma separated values of python module names) to load, # usually to register additional checkers. 
-load-plugins= +load-plugins=pylint.extensions.docparams # Pickle collected data for later comparisons. persistent=yes +# Minimum Python version to use for version dependent checks. Will default to +# the version used to run pylint. +py-version=3.8 + +# Discover python modules and packages in the file system subtree. +recursive=no + # When enabled, pylint would attempt to guess common misconfiguration and emit # user-friendly hints instead of false-positive error messages. suggestion-mode=yes @@ -44,127 +95,22 @@ suggestion-mode=yes # active Python interpreter and may run arbitrary code. unsafe-load-any-extension=no - -[MESSAGES CONTROL] - -# Only show warnings with the listed confidence levels. Leave empty to show -# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED. -confidence= - -# Disable the message, report, category or checker with the given id(s). You -# can either give multiple identifiers separated by comma (,) or put this -# option multiple times (only on the command line, not in the configuration -# file where it should appear only once). You can also use "--disable=all" to -# disable everything first and then reenable specific checks. For example, if -# you want to run only the similarities checker, you can use "--disable=all -# --enable=similarities". If you want to run only the classes checker, but have -# no Warning level messages displayed, use "--disable=all --enable=classes -# --disable=W". -disable=print-statement, - parameter-unpacking, - unpacking-in-except, - old-raise-syntax, - backtick, - long-suffix, - old-ne-operator, - old-octal-literal, - import-star-module-level, - non-ascii-bytes-literal, - raw-checker-failed, - bad-inline-option, - locally-disabled, - file-ignored, - suppressed-message, - useless-suppression, - deprecated-pragma, - use-symbolic-message-instead, - apply-builtin, - basestring-builtin, - buffer-builtin, - cmp-builtin, - coerce-builtin, - execfile-builtin, - file-builtin, - long-builtin, - raw_input-builtin, - reduce-builtin, - standarderror-builtin, - unicode-builtin, - xrange-builtin, - coerce-method, - delslice-method, - getslice-method, - setslice-method, - no-absolute-import, - old-division, - dict-iter-method, - dict-view-method, - next-method-called, - metaclass-assignment, - indexing-exception, - raising-string, - reload-builtin, - oct-method, - hex-method, - nonzero-method, - cmp-method, - input-builtin, - round-builtin, - intern-builtin, - unichr-builtin, - map-builtin-not-iterating, - zip-builtin-not-iterating, - range-builtin-not-iterating, - filter-builtin-not-iterating, - using-cmp-argument, - eq-without-hash, - div-method, - idiv-method, - rdiv-method, - exception-message-attribute, - invalid-str-codec, - sys-max-int, - bad-python3-import, - deprecated-string-function, - deprecated-str-translate-call, - deprecated-itertools-function, - deprecated-types-field, - next-method-defined, - dict-items-not-iterating, - dict-keys-not-iterating, - dict-values-not-iterating, - deprecated-operator-function, - deprecated-urllib-function, - xreadlines-attribute, - deprecated-sys-function, - exception-escape, - comprehension-escape, - superfluous-parens, - missing-module-docstring, - missing-class-docstring, - missing-function-docstring, - protected-access, - unspecified-encoding - -# Enable the message, report, category or checker with the given id(s). 
You can -# either give multiple identifier separated by comma (,) or put this option -# multiple time (only on the command line, not in the configuration file where -# it should appear only once). See also the "--disable" option for examples. -enable=c-extension-no-member +# In verbose mode, extra non-checker-related info will be displayed. +#verbose= [REPORTS] # Python expression which should return a score less than or equal to 10. You -# have access to the variables 'error', 'warning', 'refactor', and 'convention' -# which contain the number of messages in each category, as well as 'statement' -# which is the total number of statements analyzed. This score is used by the -# global evaluation report (RP0004). -evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) +# have access to the variables 'fatal', 'error', 'warning', 'refactor', +# 'convention', and 'info' which contain the number of messages in each +# category, as well as 'statement' which is the total number of statements +# analyzed. This score is used by the global evaluation report (RP0004). +evaluation=max(0, 0 if fatal else 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)) # Template used to display messages. This is a python new-style format string # used to format the message information. See doc for all details. -#msg-template= +msg-template= # Set the output format. Available formats are text, parseable, colorized, json # and msvs (visual studio). You can also give a reporter class, e.g. @@ -178,16 +124,45 @@ reports=no score=yes -[REFACTORING] +[MESSAGES CONTROL] -# Maximum number of nested blocks for function / method body -max-nested-blocks=5 +# Only show warnings with the listed confidence levels. Leave empty to show +# all. Valid levels: HIGH, CONTROL_FLOW, INFERENCE, INFERENCE_FAILURE, +# UNDEFINED. +confidence=HIGH, + CONTROL_FLOW, + INFERENCE, + INFERENCE_FAILURE, + UNDEFINED -# Complete name of functions that never returns. When checking for -# inconsistent-return-statements if a never returning function is called then -# it will be considered as an explicit return statement and no message will be -# printed. -never-returning-functions=sys.exit +# Disable the message, report, category or checker with the given id(s). You +# can either give multiple identifiers separated by comma (,) or put this +# option multiple times (only on the command line, not in the configuration +# file where it should appear only once). You can also use "--disable=all" to +# disable everything first and then re-enable specific checks. For example, if +# you want to run only the similarities checker, you can use "--disable=all +# --enable=similarities". If you want to run only the classes checker, but have +# no Warning level messages displayed, use "--disable=all --enable=classes +# --disable=W". +disable=bad-inline-option, + deprecated-pragma, + file-ignored, + locally-disabled, + missing-class-docstring, + missing-function-docstring, + missing-module-docstring, + protected-access, + raw-checker-failed, + superfluous-parens, + suppressed-message, + use-symbolic-message-instead, + useless-suppression + +# Enable the message, report, category or checker with the given id(s). You can +# either give multiple identifier separated by comma (,) or put this option +# multiple time (only on the command line, not in the configuration file where +# it should appear only once). See also the "--disable" option for examples. 
+enable=c-extension-no-member [STRING] @@ -210,6 +185,9 @@ additional-builtins= # Tells whether unused global variables should be treated as a violation. allow-global-unused-variables=yes +# List of names allowed to shadow builtins +allowed-redefined-builtins= + # List of strings which can identify a callback function by name. A callback # name must start or end with one of those strings. callbacks=cb_, @@ -242,56 +220,57 @@ logging-format-style=old logging-modules=logging -[SIMILARITIES] +[REFACTORING] -# Ignore comments when computing similarities. -ignore-comments=yes +# Maximum number of nested blocks for function / method body +max-nested-blocks=5 -# Ignore docstrings when computing similarities. -ignore-docstrings=yes +# Complete name of functions that never returns. When checking for +# inconsistent-return-statements if a never returning function is called then +# it will be considered as an explicit return statement and no message will be +# printed. +never-returning-functions=sys.exit,argparse.parse_error -# Ignore imports when computing similarities. -ignore-imports=no -# Minimum lines number of a similarity. -min-similarity-lines=4 +[DESIGN] +# List of regular expressions of class ancestor names to ignore when counting +# public methods (see R0903) +exclude-too-few-public-methods= -[FORMAT] +# List of qualified class names to ignore when counting class parents (see +# R0901) +ignored-parents= -# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. -expected-line-ending-format= +# Maximum number of arguments for function / method. +max-args=5 -# Regexp for a line that is allowed to be longer than the limit. -ignore-long-lines=^\s*(# )??$ +# Maximum number of attributes for a class (see R0902). +max-attributes=7 -# Number of spaces of indent required inside a hanging or continued line. -indent-after-paren=4 +# Maximum number of boolean expressions in an if statement (see R0916). +max-bool-expr=5 -# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 -# tab). -indent-string=' ' +# Maximum number of branch for function / method body. +max-branches=12 -# Maximum number of characters on a single line. -max-line-length=130 +# Maximum number of locals for function / method body. +max-locals=15 -# Maximum number of lines in a module. -max-module-lines=1000 +# Maximum number of parents for a class (see R0901). +max-parents=7 -# List of optional constructs for which whitespace checking is disabled. `dict- -# separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. -# `trailing-comma` allows a space between comma and closing bracket: (a, ). -# `empty-line` allows space-only lines. -no-space-check=trailing-comma, - dict-separator +# Maximum number of public methods for a class (see R0904). +max-public-methods=20 -# Allow the body of a class to be on the same line as the declaration if body -# contains single statement. -single-line-class-stmt=no +# Maximum number of return / yield for function / method body. +max-returns=6 -# Allow the body of an if to be on the same line as the test if there is no -# else. -single-line-if-stmt=no +# Maximum number of statements in function / method body. +max-statements=50 + +# Minimum number of public methods for a class (see R0903). +min-public-methods=2 [BASIC] @@ -300,13 +279,15 @@ single-line-if-stmt=no argument-naming-style=snake_case # Regular expression matching correct argument names. Overrides argument- -# naming-style. +# naming-style. 
If left empty, argument names will be checked with the set +# naming style. #argument-rgx= # Naming style matching correct attribute names. attr-naming-style=snake_case # Regular expression matching correct attribute names. Overrides attr-naming- +# style. If left empty, attribute names will be checked with the set naming # style. #attr-rgx= @@ -326,20 +307,30 @@ bad-names-rgxs= class-attribute-naming-style=any # Regular expression matching correct class attribute names. Overrides class- -# attribute-naming-style. +# attribute-naming-style. If left empty, class attribute names will be checked +# with the set naming style. #class-attribute-rgx= +# Naming style matching correct class constant names. +class-const-naming-style=UPPER_CASE + +# Regular expression matching correct class constant names. Overrides class- +# const-naming-style. If left empty, class constant names will be checked with +# the set naming style. +#class-const-rgx= + # Naming style matching correct class names. class-naming-style=PascalCase # Regular expression matching correct class names. Overrides class-naming- -# style. +# style. If left empty, class names will be checked with the set naming style. #class-rgx= # Naming style matching correct constant names. const-naming-style=UPPER_CASE # Regular expression matching correct constant names. Overrides const-naming- +# style. If left empty, constant names will be checked with the set naming # style. #const-rgx= @@ -351,7 +342,8 @@ docstring-min-length=-1 function-naming-style=snake_case # Regular expression matching correct function names. Overrides function- -# naming-style. +# naming-style. If left empty, function names will be checked with the set +# naming style. #function-rgx= # Good variable names which should always be accepted, separated by a comma. @@ -377,21 +369,22 @@ include-naming-hint=yes inlinevar-naming-style=any # Regular expression matching correct inline iteration names. Overrides -# inlinevar-naming-style. +# inlinevar-naming-style. If left empty, inline iteration names will be checked +# with the set naming style. #inlinevar-rgx= # Naming style matching correct method names. method-naming-style=snake_case # Regular expression matching correct method names. Overrides method-naming- -# style. +# style. If left empty, method names will be checked with the set naming style. #method-rgx= # Naming style matching correct module names. module-naming-style=snake_case # Regular expression matching correct module names. Overrides module-naming- -# style. +# style. If left empty, module names will be checked with the set naming style. #module-rgx= # Colon-delimited sets of names that determine each other's naming style when @@ -407,14 +400,136 @@ no-docstring-rgx=^_ # These decorators are taken in consideration only for invalid-name. property-classes=abc.abstractproperty +# Regular expression matching correct type variable names. If left empty, type +# variable names will be checked with the set naming style. +#typevar-rgx= + # Naming style matching correct variable names. variable-naming-style=snake_case # Regular expression matching correct variable names. Overrides variable- -# naming-style. +# naming-style. If left empty, variable names will be checked with the set +# naming style. #variable-rgx= +[CLASSES] + +# Warn about protected attribute access inside special methods +check-protected-access-in-special-methods=no + +# List of method names used to declare (i.e. assign) instance attributes. 
+defining-attr-methods=__init__, + __new__, + setUp, + __post_init__ + +# List of member names, which should be excluded from the protected access +# warning. +exclude-protected=_asdict, + _fields, + _replace, + _source, + _make + +# List of valid names for the first argument in a class method. +valid-classmethod-first-arg=cls + +# List of valid names for the first argument in a metaclass class method. +valid-metaclass-classmethod-first-arg=cls + + +[SIMILARITIES] + +# Comments are removed from the similarity computation +ignore-comments=yes + +# Docstrings are removed from the similarity computation +ignore-docstrings=yes + +# Imports are removed from the similarity computation +ignore-imports=yes + +# Signatures are removed from the similarity computation +ignore-signatures=yes + +# Minimum lines number of a similarity. +min-similarity-lines=4 + + +[FORMAT] + +# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. +expected-line-ending-format= + +# Regexp for a line that is allowed to be longer than the limit. +ignore-long-lines=^\s*(# )??$ + +# Number of spaces of indent required inside a hanging or continued line. +indent-after-paren=4 + +# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 +# tab). +indent-string=' ' + +# Maximum number of characters on a single line. +max-line-length=120 + +# Maximum number of lines in a module. +max-module-lines=1000 + +# Allow the body of a class to be on the same line as the declaration if body +# contains single statement. +single-line-class-stmt=no + +# Allow the body of an if to be on the same line as the test if there is no +# else. +single-line-if-stmt=no + + +[EXCEPTIONS] + +# Exceptions that will emit a warning when caught. +overgeneral-exceptions=BaseException, + Exception + + +[IMPORTS] + +# List of modules that can be imported at any level, not just the top level +# one. +allow-any-import-level= + +# Allow wildcard imports from modules that define __all__. +allow-wildcard-with-all=no + +# Deprecated modules which should not be used, separated by a comma. +deprecated-modules= + +# Output a graph (.gv or any supported image format) of external dependencies +# to the given file (report RP0402 must not be disabled). +ext-import-graph= + +# Output a graph (.gv or any supported image format) of all (i.e. internal and +# external) dependencies to the given file (report RP0402 must not be +# disabled). +import-graph= + +# Output a graph (.gv or any supported image format) of internal dependencies +# to the given file (report RP0402 must not be disabled). +int-import-graph= + +# Force import order to recognize a module as part of the standard +# compatibility libraries. +known-standard-library= + +# Force import order to recognize a module as part of a third party library. +known-third-party=enchant + +# Couples of modules and preferred modules, separated by a comma. +preferred-modules= + + [TYPECHECK] # List of decorators that produce context managers, such as @@ -427,10 +542,6 @@ contextmanager-decorators=contextlib.contextmanager # expressions are accepted. generated-members= -# Tells whether missing members accessed in mixin class should be ignored. A -# mixin class is detected if its name ends with "mixin" (case insensitive). -ignore-mixin-members=yes - # Tells whether to warn about missing members when the owner of the attribute # is inferred to be None. ignore-none=yes @@ -443,16 +554,16 @@ ignore-none=yes # the rest of the inferred objects. 
ignore-on-opaque-inference=yes +# List of symbolic message names to ignore for Mixin members. +ignored-checks-for-mixins=no-member, + not-async-context-manager, + not-context-manager, + attribute-defined-outside-init + # List of class names for which member attributes should not be checked (useful # for classes with dynamically set attributes). This supports the use of # qualified names. -ignored-classes=optparse.Values,thread._local,_thread._local - -# List of module names for which member attributes should not be checked -# (useful for modules/projects where namespaces are manipulated during runtime -# and thus existing member attributes cannot be deduced by static analysis). It -# supports qualified module names, as well as Unix pattern matching. -ignored-modules= +ignored-classes=optparse.Values,thread._local,_thread._local,argparse.Namespace # Show a hint with possible names when a member name was not found. The aspect # of finding the hint is based on edit distance. @@ -466,6 +577,9 @@ missing-member-hint-distance=1 # showing a hint for a missing member. missing-member-max-choices=1 +# Regex pattern to define which classes are considered mixins. +mixin-class-rgx=.*[Mm]ixin + # List of decorators that change the signature of a decorated function. signature-mutators= @@ -476,9 +590,13 @@ signature-mutators= max-spelling-suggestions=4 # Spelling dictionary name. Available dictionaries: none. To make it work, -# install the python-enchant package. +# install the 'python-enchant' package. spelling-dict= +# List of comma separated words that should be considered directives if they +# appear at the beginning of a comment and should not be checked. +spelling-ignore-comment-directives=fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy: + # List of comma separated words that should not be checked. spelling-ignore-words= @@ -498,109 +616,4 @@ notes=FIXME, TODO # Regular expression of note tags to take in consideration. -#notes-rgx= - - -[DESIGN] - -# Maximum number of arguments for function / method. -max-args=5 - -# Maximum number of attributes for a class (see R0902). -max-attributes=7 - -# Maximum number of boolean expressions in an if statement (see R0916). -max-bool-expr=5 - -# Maximum number of branch for function / method body. -max-branches=12 - -# Maximum number of locals for function / method body. -max-locals=15 - -# Maximum number of parents for a class (see R0901). -max-parents=7 - -# Maximum number of public methods for a class (see R0904). -max-public-methods=20 - -# Maximum number of return / yield for function / method body. -max-returns=6 - -# Maximum number of statements in function / method body. -max-statements=50 - -# Minimum number of public methods for a class (see R0903). -min-public-methods=2 - - -[IMPORTS] - -# List of modules that can be imported at any level, not just the top level -# one. -allow-any-import-level= - -# Allow wildcard imports from modules that define __all__. -allow-wildcard-with-all=no - -# Analyse import fallback blocks. This can be used to support both Python 2 and -# 3 compatible code, which means that the block might have code that exists -# only in one or another interpreter, leading to false positives when analysed. -analyse-fallback-blocks=no - -# Deprecated modules which should not be used, separated by a comma. -deprecated-modules=optparse,tkinter.tix - -# Create a graph of external dependencies in the given file (report RP0402 must -# not be disabled). -ext-import-graph= - -# Create a graph of every (i.e. 
internal and external) dependencies in the -# given file (report RP0402 must not be disabled). -import-graph= - -# Create a graph of internal dependencies in the given file (report RP0402 must -# not be disabled). -int-import-graph= - -# Force import order to recognize a module as part of the standard -# compatibility libraries. -known-standard-library= - -# Force import order to recognize a module as part of a third party library. -known-third-party=enchant - -# Couples of modules and preferred modules, separated by a comma. -preferred-modules= - - -[CLASSES] - -# List of method names used to declare (i.e. assign) instance attributes. -defining-attr-methods=__init__, - __new__, - setUp, - __post_init__, - __set_name__ - -# List of member names, which should be excluded from the protected access -# warning. -exclude-protected=_asdict, - _fields, - _replace, - _source, - _make - -# List of valid names for the first argument in a class method. -valid-classmethod-first-arg=cls - -# List of valid names for the first argument in a metaclass class method. -valid-metaclass-classmethod-first-arg=cls - - -[EXCEPTIONS] - -# Exceptions that will emit a warning when being caught. Defaults to -# "BaseException, Exception". -overgeneral-exceptions=BaseException, - Exception +notes-rgx= diff --git a/ci/conda/recipes/morpheus/meta.yaml b/ci/conda/recipes/morpheus/meta.yaml index c553c800a7..11a5a6ab40 100644 --- a/ci/conda/recipes/morpheus/meta.yaml +++ b/ci/conda/recipes/morpheus/meta.yaml @@ -78,7 +78,6 @@ outputs: - mlflow >=1.23 - networkx - pandas 1.3 - - pluggy - python - tqdm - typing_utils diff --git a/examples/digital_fingerprinting/README.md b/examples/digital_fingerprinting/README.md index c0f1a9d5ce..ff9e1e488a 100644 --- a/examples/digital_fingerprinting/README.md +++ b/examples/digital_fingerprinting/README.md @@ -14,280 +14,34 @@ # limitations under the License. --> -# Digital Fingerprinting Pipeline +# Digital Fingerprinting (DFP) in Morpheus -We show here how to set up and run the DFP pipeline for three log types: CloudTrail, Duo and Azure. Each of these log types uses a built-in source stage that handles that specific data format. New source stages can be added to allow the DFP pipeline to process different log types. All stages after the source stages are identical across all log types but can be configured differently via pipeline or stage configuration options. +## Organization -## Environment Setup +The DFP example workflows in Morpheus are designed to scale up to company-wide workloads and handle several different log types, which results in a large number of moving parts to handle the various services and configuration options. To simplify things, the DFP workflow is provided as two separate examples: a simple, "starter" pipeline for new users and a complex, "production" pipeline for full-scale deployments. While these two examples both perform the same general tasks, they do so in very different ways. The following is a breakdown of the differences between the two examples. -Follow the instructions [here](https://github.com/nv-morpheus/Morpheus/blob/branch-22.06/CONTRIBUTING.md) to set up your development environment in either a Docker container or conda environment. +### The "Starter" Example -## Morpheus CLI +This example is designed to reduce the number of stages and components and provide a fully contained workflow in a single pipeline.
-DFP pipelines can be constructed and run using the Morpheus CLI command `morpheus run pipeline-ae ...` +Key Differences: + * A single pipeline which performs both training and inference + * Requires no external services + * Can be run from the Morpheus CLI -Use `--help` to display information about the autoencoder pipeline command line options: -``` -morpheus run pipeline-ae --help +### The "Production" Example -Usage: morpheus run pipeline-ae [OPTIONS] COMMAND1 [ARGS]... [COMMAND2 - [ARGS]...]... +This example is designed to show what a full-scale, production-ready DFP deployment in Morpheus would look like. It contains all of the necessary components (such as a model store) to allow multiple Morpheus pipelines to communicate at a scale that can handle the workload of an entire company. - Configure and run the pipeline. To configure the pipeline, list the stages - in the order that data should flow. The output of each stage will become the - input for the next stage. For example, to read, classify and write to a - file, the following stages could be used +Key Differences: + * Multiple pipelines are specialized to perform either training or inference + * Requires setting up a model store to allow the training and inference pipelines to communicate + * Organized into a docker-compose deployment for easy startup + * Contains a Jupyter notebook service to ease development and debugging + * Can be deployed to Kubernetes using provided Helm charts + * Uses many customized stages to maximize performance. - pipeline from-file --filename=my_dataset.json deserialize preprocess inf-triton --model_name=my_model - --server_url=localhost:8001 filter --threshold=0.5 to-file --filename=classifications.json +## Getting Started - Pipelines must follow a few rules: - 1. Data must originate in a source stage. Current options are `from-file` or `from-kafka` - 2. A `deserialize` stage must be placed between the source stages and the rest of the pipeline - 3. Only one inference stage can be used. Zero is also fine - 4. The following stages must come after an inference stage: `add-class`, `filter`, `gen-viz` -Options: - --columns_file FILE [default: /my_data/gitrepos/efajardo-nv/Morp - heus/morpheus/data/columns_ae_cloudtrail.txt - ] - --labels_file FILE Specifies a file to read labels from in - order to convert class IDs into labels. A - label file is a simple text file where each - line corresponds to a label. If unspecified, - only a single output label is created for - FIL - --userid_column_name TEXT Which column to use as the User ID. - [default: userIdentityaccountId; required] - --userid_filter TEXT Specifying this value will filter all - incoming data to only use rows with matching - User IDs. Which column is used for the User - ID is specified by `userid_column_name` - --feature_scaler TEXT Autoencoder feature scaler [default: - standard] - --use_generic_model BOOLEAN Whether to use a generic model when user does - not have minimum number of training rows - [default: False] - --viz_file FILE Save a visualization of the pipeline at the - specified location - --help Show this message and exit.
- -Commands: - add-class Add detected classifications to each message - add-scores Add probability scores to each message - buffer (Deprecated) Buffer results - delay (Deprecated) Delay results for a certain duration - filter Filter message by a classification threshold - from-azure Load messages from a Duo directory - from-cloudtrail Load messages from a Cloudtrail directory - from-duo Load messages from a Duo directory - gen-viz (Deprecated) Write out vizualization data frames - inf-pytorch Perform inference with PyTorch - inf-triton Perform inference with Triton - monitor Display throughput numbers at a specific point in the - pipeline - preprocess Convert messages to tokens - serialize Include & exclude columns from messages - timeseries Perform time series anomaly detection and add prediction. - to-file Write all messages to a file - to-kafka Write all messages to a Kafka cluster - train-ae Deserialize source data from JSON - validate Validates pipeline output against an expected output -``` -The commands above correspond to the Morpheus stages that can be used to construct your DFP pipeline. Options are available to configure pipeline and stages. -The following table shows mapping between the main Morpheus CLI commands and underlying Morpheus Python stage classes: - -| CLI Command | Stage Class | Python File | -| ---------------| -------------------------| --------------------------------------------------------- -| from-azure | AzureSourceStage | morpheus/stages/input/azure_source_stage.py -| from-cloudtrail| CloudTrailSourceStage | morpheus/stages/input/clout_trail_source_stage.py -| from-duo | DuoSourceStage | morpheus/stages/input/duo_source_stage.py -| train-ae | TrainAEStage | morpheus/stages/preprocess/train_ae_stage.py -| preprocess | PreprocessAEStage | morpheus/stages/preprocess/preprocess_ae_stage.py -| inf-pytorch | AutoEncoderInferenceStage| morpheus/stages/inference/auto_encoder_inference_stage.py -| add-scores | AddScoresStage | morpheus/stages/postprocess/add_scores_stage.py -| serialize | SerializeStage | morpheus/stages/postprocess/serialize_stage.py -| to-file | WriteToFileStage | morpheus/stages/output/write_to_file_stage.py - - -## Morpheus DFP Stages - -**Source stages** - These include `AzureSourceStage`, `CloudTrailSourceStage` and `DuoSourceStage`. They are responsible for reading log file(s) that match provided `--input_glob` (e.g. `/duo_logs/*.json`). Data is grouped by user so that each batch processed by the pipeline will only contain rows corresponding to a single user. Feature engineering also happens in this stage. All DFP source stages must extend `AutoencoderSourceStage` and implement the `files_to_dfs_per_user` abstract method. Feature columns can be managed by overriding the `derive_features` method. Otherwise, all columns from input data pass through to next stage. - -**Preprocessing stages** - -`TrainAEStage` can either train user models using data matching a provided `--train_data_glob` or load pre-trained models from file using `--pretrained_filename`. When using `--train_data_glob`, user models can be saved using the `--models_output_filename` option. The `--source_stage_class` must also be used with `--train_data_glob` so that the training stage knows how to read the training data. The autoencoder implementation from this [fork](https://github.com/efajardo-nv/dfencoder/tree/morpheus-22.08) is used for user model training. 
The following are the available CLI options for the `TrainAEStage` (train-ae): - -| Option | Description -| ----------------------| --------------------------------------------------------- -| pretrained_filename | File path to pickled user models saved from previous training run using `--models_output_filename`. -| train_data_glob | Glob path to training data. -| source_stage_class | Source stage so that training stage knows how to read/parse training data. -| train_epochs | Number of training epochs. Default is 25. -| min_train_rows | Minimum number of training rows required to train user model. Default is 300. -| train_max_history | Maximum number of training rows per user. Default is 1000. -| seed | When not None, ensure random number generators are seeded with `seed` to control reproducibility of user model. -| sort_glob | If true the list of files matching `input_glob` will be processed in sorted order. Default is False. -| models_output_filename| Can be used with `--train_data_glob` to save trained user models to file using provided file path. Models can be loaded later using `--pretrained_filename`. - -The `PreprocessAEStage` is responsible for creating a Morpheus message that contains everything needed by the inference stage. For DFP inference, this stage must pass a `MultiInferenceAEMessage` to the inference stage. Each message will correspond to a single user and include the input feature columns, the user's model and training data anomaly scores. - -**Inference stage** - `AutoEncoderInferenceStage` calculates anomaly scores (i.e. reconstruction loss) and z-scores for each user input dataset. - -**Postprocessing stage** - The DFP pipeline uses the `AddScoresStage` for postprocessing to add anomaly scores and zscores from previous inference stage with matching labels. - -**Serialize stage** - `SerializeStage` is used to convert `MultiResponseProbsMessage` from previous stage to a `MessageMeta` to make it suitable for output (i.e. write to file or Kafka). - -**Write stage** - `WriteToFileStage` writes input data with inference results to an output file path. - - -## CloudTrail DFP Pipeline - -Run the following in your Morpheus container to start the CloudTrail DFP pipeline: - -``` -morpheus --log_level=DEBUG \ -run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=False \ -pipeline-ae \ ---columns_file=morpheus/data/columns_ae_cloudtrail.txt \ ---userid_column_name=userIdentitysessionContextsessionIssueruserName \ ---userid_filter=user123 \ ---feature_scaler=standard \ -from-cloudtrail \ ---input_glob=models/datasets/validation-data/dfp-cloudtrail-*-input.csv \ ---max_files=200 \ -train-ae \ ---train_data_glob=models/datasets/training-data/dfp-cloudtrail-*.csv \ ---source_stage_class=morpheus.stages.input.cloud_trail_source_stage.CloudTrailSourceStage \ ---seed=42 \ -preprocess \ -inf-pytorch \ -add-scores \ -serialize \ -to-file --filename=./cloudtrail-dfp-detections.csv --overwrite -``` - -## Duo DFP Pipeline - -First, trains user models from files in `models/datasets/training-data/duo` and saves user models to file. Pipeline then uses these models to run inference -on validation data in `models/datasets/validation-data/duo`. Inference results are written to `duo-detections.csv`. 
-``` -morpheus --log_level=DEBUG \ -run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=False \ -pipeline-ae \ ---columns_file=morpheus/data/columns_ae_duo.txt \ ---userid_column_name=username \ ---feature_scaler=standard \ -from-duo \ ---input_glob=models/datasets/validation-data/duo/*.json \ ---max_files=200 \ -monitor --description='Input rate' \ -train-ae \ ---train_data_glob=models/datasets/training-data/duo/*.json \ ---source_stage_class=morpheus.stages.input.duo_source_stage.DuoSourceStage \ ---seed=42 \ ---train_epochs=1 \ ---models_output_filename=models/dfp-models/duo_ae_user_models.pkl \ -preprocess \ -inf-pytorch \ -monitor --description='Inference rate' --unit inf \ -add-scores \ -serialize \ -to-file --filename=./duo-detections.csv --overwrite -``` - -The following example shows how we can load pre-trained user models from the file (`models/dfp-models/duo_ae_user_models.pkl`) we created in the previous example. Pipeline then uses these models to run inference on validation data in `models/datasets/validation-data/duo`. Inference results are written to `duo-detections.csv`. -``` -morpheus --log_level=DEBUG \ -run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=False \ -pipeline-ae \ ---columns_file=morpheus/data/columns_ae_duo.txt \ ---userid_column_name=username \ ---feature_scaler=standard \ -from-duo \ ---input_glob=models/datasets/validation-data/duo/*.json \ ---max_files=200 \ -monitor --description='Input rate' \ -train-ae \ ---pretrained_filename=models/dfp-models/duo_ae_user_models.pkl \ -preprocess \ -inf-pytorch \ -monitor --description='Inference rate' --unit inf \ -add-scores \ -serialize \ -to-file --filename=./duo-detections.csv --overwrite -``` - -## Azure DFP Pipeline - -First, trains user models from files in `models/datasets/training-data/azure` and saves user models to file. Pipeline then uses these models to run inference -on validation data in `models/datasets/validation-data/azure`. Inference results are written to `azure-detections.csv`. -``` -morpheus --log_level=DEBUG \ -run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=False \ -pipeline-ae \ ---columns_file=morpheus/data/columns_ae_azure.txt \ ---userid_column_name=userPrincipalName \ ---feature_scaler=standard \ -from-azure \ ---input_glob=models/datasets/validation-data/azure/*.json \ ---max_files=200 \ -train-ae \ ---train_data_glob=models/datasets/training-data/azure/*.json \ ---source_stage_class=morpheus.stages.input.azure_source_stage.AzureSourceStage \ ---seed=42 \ ---models_output_filename=models/dfp-models/azure_ae_user_models.pkl \ -preprocess \ -inf-pytorch \ -monitor --description='Inference rate' --unit inf \ -add-scores \ -serialize \ -to-file --filename=./azure-detections.csv --overwrite -``` - -The following example shows how we can load pre-trained user models from the file (`models/dfp-models/azure_ae_user_models.pkl`) we created in the previous example. Pipeline then uses these models to run inference on validation data in `models/datasets/validation-data/azure`. Inference results are written to `azure-detections.csv`. 
-``` -morpheus --log_level=DEBUG \ -run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=False \ -pipeline-ae \ ---columns_file=morpheus/data/columns_ae_azure.txt \ ---userid_column_name=userPrincipalName \ ---feature_scaler=standard \ -from-azure \ ---input_glob=models/datasets/validation-data/azure/*.json \ ---max_files=200 \ -train-ae \ ---pretrained_filename=models/dfp-models/azure_ae_user_models.pkl \ -preprocess \ -inf-pytorch \ -monitor --description='Inference rate' --unit inf \ -add-scores \ -serialize \ -to-file --filename=./azure-detections.csv --overwrite -``` - - -## Using Morpheus Python API - -The DFP pipelines can also be constructed and run via the Morpheus Python API. An [example](./run_cloudtrail_dfp.py) is included for the Cloudtrail DFP pipeline. The following are some commands to -run the example. - -Train user models from files in `models/datasets/training-data/dfp-cloudtrail-*.csv` and saves user models to file. Pipeline then uses these models to run inference on Cloudtrail validation data in `models/datasets/validation-data/dfp-cloudtrail-*-input.csv`. Inference results are written to `cloudtrail-dfp-results.csv`. -``` -python ./examples/digital_fingerprinting/run_cloudtrail_dfp.py \ - --columns_file=morpheus/data/columns_ae_cloudtrail.txt \ - --input_glob=models/datasets/validation-data/dfp-cloudtrail-*-input.csv \ - --train_data_glob=models/datasets/training-data/dfp-*.csv \ - --models_output_filename=models/dfp-models/cloudtrail_ae_user_models.pkl \ - --output_file ./cloudtrail-dfp-results.csv -``` - -Here we load pre-trained user models from the file (`models/dfp-models/cloudtrail_ae_user_models.pkl`) we created in the previous example. Pipeline then uses these models to run inference on validation data in `models/datasets/validation-data/dfp-cloudtrail-*-input.csv`. Inference results are written to `cloudtrail-dfp-results.csv`. 
-``` -python ./examples/digital_fingerprinting/run_cloudtrail_dfp.py \ - --columns_file=morpheus/data/columns_ae_cloudtrail.txt \ - --input_glob=models/datasets/validation-data/dfp-cloudtrail-*-input.csv \ - --pretrained_filename=models/dfp-models/cloudtrail_ae_user_models.pkl \ - --output_file=./cloudtrail-dfp-results.csv -``` \ No newline at end of file +Guides for each of the two examples can be found in their respective directories: [The Starter Example](./starter/README.md) and [The Production Example](./production/README.md) diff --git a/examples/digital_fingerprinting/production/.env.sample b/examples/digital_fingerprinting/production/.env.sample new file mode 100644 index 0000000000..3fee3685cc --- /dev/null +++ b/examples/digital_fingerprinting/production/.env.sample @@ -0,0 +1,11 @@ +# NOTE: This file should be copied to `.env` in the same folder and updated for each user +MYSQL_DATABASE="db" +MYSQL_USER="mlflow" +MYSQL_PASSWORD="good_password" +MYSQL_ROOT_PASSWORD="even_better_password" +MYSQL_ROOT_HOST="172.*.*.*" +MYSQL_LOG_CONSOLE=1 + +# Update these with your own credentials UID=$(id -u) GID=$(id -g) +UID=1000 +GID=1000 diff --git a/examples/digital_fingerprinting/production/.gitignore b/examples/digital_fingerprinting/production/.gitignore new file mode 100644 index 0000000000..175181a4c4 --- /dev/null +++ b/examples/digital_fingerprinting/production/.gitignore @@ -0,0 +1,2 @@ +*.s3_cache +artifacts/ diff --git a/examples/digital_fingerprinting/production/README.md b/examples/digital_fingerprinting/production/README.md new file mode 100644 index 0000000000..a7e01c636b --- /dev/null +++ b/examples/digital_fingerprinting/production/README.md @@ -0,0 +1,17 @@ +# "Production" Digital Fingerprinting Pipeline + +### Build the Morpheus container + +This is necessary to get the latest changes needed for DFP + +```bash +./docker/build_container_release.sh +``` + +### Running locally via `docker-compose` + +```bash +docker-compose build + +docker-compose up +``` diff --git a/examples/digital_fingerprinting/production/docker-compose.yml b/examples/digital_fingerprinting/production/docker-compose.yml new file mode 100644 index 0000000000..26d5cb843a --- /dev/null +++ b/examples/digital_fingerprinting/production/docker-compose.yml @@ -0,0 +1,134 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +version: '3.3' + +services: + db: + restart: always + image: mysql/mysql-server + container_name: mlflow_db + expose: + - "3306" + networks: + - backend + environment: + - MYSQL_DATABASE=${MYSQL_DATABASE} + - MYSQL_USER=${MYSQL_USER} + - MYSQL_PASSWORD=${MYSQL_PASSWORD} + - MYSQL_ROOT_PASSWORD=${MYSQL_ROOT_PASSWORD} + - MYSQL_ROOT_HOST=${MYSQL_ROOT_HOST} + volumes: + - db_data:/var/lib/mysql + + mlflow: + restart: always + build: ./mlflow + image: mlflow_server + container_name: mlflow_server + ports: + - "5000:5000" + networks: + - frontend + - backend + # command: mlflow server --backend-store-uri mysql+pymysql://${MYSQL_USER}:${MYSQL_PASSWORD}@db:3306/${MYSQL_DATABASE} --serve-artifacts --artifacts-destination /opt/mlflow/artifacts --host 0.0.0.0 + command: mlflow server --gunicorn-opts "--log-level debug" --backend-store-uri sqlite:////opt/mlflow/dbdata/mlflow.db --serve-artifacts --artifacts-destination /opt/mlflow/artifacts --host 0.0.0.0 + volumes: + - db_data:/opt/mlflow/dbdata + - mlflow_data:/opt/mlflow/artifacts + # depends_on: + # - db + + jupyter: + restart: always + build: + context: ./jupyter + args: + - MORPHEUS_CONTAINER_VERSION=v22.08.00a-runtime + image: dfp_morpheus_jupyter + container_name: jupyter + ports: + - "8888:8888" + networks: + - frontend + - backend + environment: + - VAULT_ROLE_ID=${VAULT_ROLE_ID} + - VAULT_SECRET_ID=${VAULT_SECRET_ID} + command: jupyter-lab --no-browser --allow-root --ip='*' + volumes: + - ../..:/work + working_dir: /work/examples/dfp_workflow + depends_on: + - mlflow + profiles: + - dev + + morpheus_training: + # restart: always + build: + context: ./morpheus + args: + - MORPHEUS_CONTAINER_VERSION=v22.08.00a-runtime + image: dfp_morpheus + container_name: morpheus_training + networks: + - frontend + - backend + environment: + # Colorize the terminal in the container if possible + TERM: "${TERM:-}" + # PS1: "$$(whoami):$$(pwd) $$ " + VAULT_ROLE_ID: "${VAULT_ROLE_ID}" + VAULT_SECRET_ID: "${VAULT_SECRET_ID}" + DFP_CACHE_DIR: "/work/.cache/dfp" + DFP_TRACKING_URI: "http://mlflow:5000" + command: ./launch.sh --train_users=generic --duration=1d + volumes: + # - ./.s3_cache:/work/.s3_cache + - ../..:/work + # - /etc/passwd:/etc/passwd:ro + # - /etc/group:/etc/group:ro + working_dir: /work/examples/dfp_workflow/morpheus + depends_on: + - mlflow + profiles: + - training + cap_add: + - sys_nice + user: "${UID}:${GID}" + + # nginx: + # restart: always + # build: ./nginx + # image: mlflow_nginx + # container_name: mlflow_nginx + # ports: + # - "80:80" + # networks: + # - frontend + # depends_on: + # - web + +networks: + frontend: + driver: bridge + backend: + driver: bridge + +volumes: + db_data: + mlflow_data: diff --git a/examples/digital_fingerprinting/production/jupyter/Dockerfile b/examples/digital_fingerprinting/production/jupyter/Dockerfile new file mode 100644 index 0000000000..d6d5a3ba58 --- /dev/null +++ b/examples/digital_fingerprinting/production/jupyter/Dockerfile @@ -0,0 +1,74 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +ARG MORPHEUS_CONTAINER=nvcr.io/nvidia/morpheus/morpheus +ARG MORPHEUS_CONTAINER_VERSION=v22.08.00-runtime + +FROM ${MORPHEUS_CONTAINER}:${MORPHEUS_CONTAINER_VERSION} as base + +# Fix the entrypoint to work with different WORKDIR +ENTRYPOINT [ "/opt/conda/bin/tini", "--", "/workspace/docker/entrypoint.sh" ] + +SHELL ["/bin/bash", "-c"] + +# Install vault +RUN apt-get update \ + && apt-get install -y \ + apt-utils \ + jq \ + lsb-release \ + software-properties-common \ + && curl -fsSL https://apt.releases.hashicorp.com/gpg | apt-key add - \ + && apt-add-repository "deb [arch=amd64] https://apt.releases.hashicorp.com $(lsb_release -cs) main" \ + && apt-get update \ + && apt-get install --reinstall -y \ + vault \ + && apt-get clean all \ + && rm -rf /var/lib/apt/lists/* + +# Fix vault install per: https://github.com/hashicorp/vault/issues/10924#issuecomment-1197259930 +RUN setcap -r /usr/bin/vault + +# # Install NGC CLI +# COPY ${UBA_COMMON}/utils/*.sh ./ +# RUN chmod +x *.sh \ +# && bash ngc-cli-install.sh + +# Install DFP dependencies +RUN source activate morpheus \ + && mamba install -y -c conda-forge \ + boto3 \ + dill \ + ipywidgets \ + jupyterlab \ + kfp \ + librdkafka \ + mlflow \ + nb_conda_kernels \ + papermill \ + s3fs + +WORKDIR /work/examples/dfp_workflow/morpheus + +# # This will get used by pipelines for the --s3_cache option +# ENV DFP_S3_CACHE="/work/examples/dfp_workflow/morpheus/.s3_cache" + +# Set the tracking URI for mlflow +ENV MLFLOW_TRACKING_URI="http://mlflow:5000" + +# Copy the sources +COPY . ./ + +CMD ["jupyter-lab", "--ip=0.0.0.0", "--no-browser", "--allow-root"] diff --git a/examples/digital_fingerprinting/production/mlflow/Dockerfile b/examples/digital_fingerprinting/production/mlflow/Dockerfile new file mode 100644 index 0000000000..8d8dc942bb --- /dev/null +++ b/examples/digital_fingerprinting/production/mlflow/Dockerfile @@ -0,0 +1,32 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +FROM python:3.8-slim-buster + +# Install curl for health check +RUN apt update && \ + apt install -y --no-install-recommends \ + curl libyaml-cpp-dev libyaml-dev && \ + apt autoremove -y && \ + apt clean all && \ + rm -rf /var/cache/apt/* /var/lib/apt/lists/* + +# Install python packages +RUN pip install mlflow boto3 pymysql pyyaml + +# We run on port 5000 +EXPOSE 5000 + +HEALTHCHECK CMD curl -f http://localhost:5000/health || exit 1 diff --git a/examples/digital_fingerprinting/production/morpheus/.dockerignore b/examples/digital_fingerprinting/production/morpheus/.dockerignore new file mode 100644 index 0000000000..ec31dc13d3 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/.dockerignore @@ -0,0 +1,9 @@ +# Ignore any Dockerfile +Dockerfile + +# ML Flow files +artifacts/ +mlflow.db + +# Ignore any S3 cache folders +*.s3_cache diff --git a/examples/digital_fingerprinting/production/morpheus/.gitignore b/examples/digital_fingerprinting/production/morpheus/.gitignore new file mode 100644 index 0000000000..5f423a4797 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/.gitignore @@ -0,0 +1 @@ +get_aws_credentials.sh diff --git a/examples/digital_fingerprinting/production/morpheus/Dockerfile b/examples/digital_fingerprinting/production/morpheus/Dockerfile new file mode 100644 index 0000000000..4126d938d3 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/Dockerfile @@ -0,0 +1,83 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +ARG MORPHEUS_CONTAINER=nvcr.io/nvidia/morpheus/morpheus +ARG MORPHEUS_CONTAINER_VERSION=v22.09.00-runtime +ARG ADDITIONAL_GROUPS +ARG FIXUID_VERSION=0.5.1 + +FROM ${MORPHEUS_CONTAINER}:${MORPHEUS_CONTAINER_VERSION} as common_base + +# Install fixuid +RUN curl -SsL https://github.com/boxboat/fixuid/releases/download/v0.5.1/fixuid-0.5.1-linux-amd64.tar.gz | tar -C /usr/bin -xzf - && \ + chown root:root /usr/bin/fixuid && chmod 4755 /usr/bin/fixuid && mkdir -p /etc/fixuid && \ + bash -c 'echo -e "\ +user: morpheus\n\ +group: morpheus\n\ +paths:\n\ + - /home/morpheus\n\ + - /opt/conda/envs/morpheus\n\ +" | tee /etc/fixuid/config.yml >/dev/null' && \ + \ + # Add a non-root user + useradd \ + --uid 1000 --shell /bin/bash \ + --user-group ${ADDITIONAL_GROUPS} \ + --create-home --home-dir /home/morpheus \ + morpheus + +# Fix the entrypoint to work with different WORKDIR +ENTRYPOINT [ "/opt/conda/bin/tini", "--", "fixuid", "-q", "/workspace/docker/entrypoint.sh" ] + +SHELL ["/bin/bash", "-c"] + +# Install vault +RUN apt-get update \ + && apt-get install -y \ + apt-utils \ + jq \ + lsb-release \ + software-properties-common \ + && curl -fsSL https://apt.releases.hashicorp.com/gpg | apt-key add - \ + && apt-add-repository "deb [arch=amd64] https://apt.releases.hashicorp.com $(lsb_release -cs) main" \ + && apt-get update \ + && apt-get install --reinstall -y \ + vault \ + && apt-get clean all \ + && rm -rf /var/lib/apt/lists/* + +# Fix vault install per: https://github.com/hashicorp/vault/issues/10924#issuecomment-1197259930 +RUN setcap -r /usr/bin/vault + +# Copy the conda_env file +COPY ./conda_env.yml ./ + +# Install DFP dependencies +RUN source activate morpheus \ + && mamba env update -n morpheus -f ./conda_env.yml + +USER morpheus + +FROM common_base as base + +WORKDIR /work/examples/dfp_workflow/morpheus + +# This will get used by pipelines for the --s3_cache option +# ENV DFP_S3_CACHE="/work/examples/dfp_workflow/morpheus/.s3_cache" + +# Copy the sources +COPY . ./ + +CMD ["bash", "-c", "./launch.sh"] diff --git a/examples/digital_fingerprinting/production/morpheus/conda_env.yml b/examples/digital_fingerprinting/production/morpheus/conda_env.yml new file mode 100644 index 0000000000..bfef17915a --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/conda_env.yml @@ -0,0 +1,31 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: morpheus +channels: + - rapidsai + - nvidia + - nvidia/label/dev # For pre-releases of SRF. Should still default to full releases if available + - nvidia/label/cuda-11.5.2 # For cuda-nvml-dev=11.5, which is not published under nvidia channel yet. + - conda-forge +dependencies: + ####### Morpheus Dependencies (keep sorted!) 
####### + - boto3 + - dill + - kfp + - librdkafka + - mlflow + - papermill + - s3fs diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/__init__.py b/examples/digital_fingerprinting/production/morpheus/dfp/__init__.py new file mode 100644 index 0000000000..d11ef3c507 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/messages/__init__.py b/examples/digital_fingerprinting/production/morpheus/dfp/messages/__init__.py new file mode 100644 index 0000000000..d11ef3c507 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp/messages/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/messages/multi_dfp_message.py b/examples/digital_fingerprinting/production/morpheus/dfp/messages/multi_dfp_message.py new file mode 100644 index 0000000000..e48c5ba269 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp/messages/multi_dfp_message.py @@ -0,0 +1,79 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import dataclasses +import logging +import typing + +from morpheus.messages.message_meta import MessageMeta +from morpheus.messages.multi_message import MultiMessage + +logger = logging.getLogger(__name__) + + +@dataclasses.dataclass +class DFPMessageMeta(MessageMeta, cpp_class=None): + """ + This class extends MessageMeta to also hold userid corresponding to batched metadata. + + Parameters + ---------- + df : pandas.DataFrame + Input rows in dataframe. + user_id : str + User id. 
+ + """ + user_id: str + + def get_df(self): + return self.df + + def set_df(self, df): + self.df = df + + +@dataclasses.dataclass +class MultiDFPMessage(MultiMessage): + + def __post_init__(self): + + assert isinstance(self.meta, DFPMessageMeta), "`meta` must be an instance of DFPMessageMeta" + + @property + def user_id(self): + return typing.cast(DFPMessageMeta, self.meta).user_id + + def get_meta_dataframe(self): + return typing.cast(DFPMessageMeta, self.meta).get_df() + + def get_slice(self, start, stop): + """ + Returns sliced batches based on offsets supplied. Automatically calculates the correct `mess_offset` + and `mess_count`. + + Parameters + ---------- + start : int + Start offset address. + stop : int + Stop offset address. + + Returns + ------- + morpheus.pipeline.preprocess.autoencoder.MultiAEMessage + A new `MultiAEMessage` with sliced offset and count. + + """ + return MultiDFPMessage(meta=self.meta, mess_offset=start, mess_count=stop - start) diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/stages/__init__.py b/examples/digital_fingerprinting/production/morpheus/dfp/stages/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_file_batcher_stage.py b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_file_batcher_stage.py new file mode 100644 index 0000000000..bd5034ccf3 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_file_batcher_stage.py @@ -0,0 +1,104 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +import typing + +import fsspec +import pandas as pd +import srf +from srf.core import operators as ops + +from morpheus.config import Config +from morpheus.pipeline.single_port_stage import SinglePortStage +from morpheus.pipeline.stream_pair import StreamPair + +logger = logging.getLogger("morpheus.{}".format(__name__)) + + +class DFPFileBatcherStage(SinglePortStage): + + def __init__(self, c: Config, date_conversion_func, period="D", sampling_rate_s=0): + super().__init__(c) + + self._date_conversion_func = date_conversion_func + self._sampling_rate_s = sampling_rate_s + self._period = period + + @property + def name(self) -> str: + return "dfp-file-batcher" + + def supports_cpp_node(self): + return False + + def accepted_types(self) -> typing.Tuple: + return (fsspec.core.OpenFiles, ) + + def on_data(self, file_objects: fsspec.core.OpenFiles): + + file_object_list = file_objects + + # Create a dataframe with the incoming metadata + if ((len(file_object_list) > 1) and (self._sampling_rate_s > 0)): + file_sampled_list = [] + + file_object_list.sort(key=lambda file_object: self._date_conversion_func(file_object)) + + ts_last = self._date_conversion_func(file_object_list[0]) + + file_sampled_list.append(file_object_list[0]) + + for idx in range(1, len(file_object_list)): + ts = self._date_conversion_func(file_object_list[idx]) + + if ((ts - ts_last).seconds >= self._sampling_rate_s): + + file_sampled_list.append(file_object_list[idx]) + ts_last = ts + else: + file_object_list = file_sampled_list + + df = pd.DataFrame() + + df["dfp_timestamp"] = [self._date_conversion_func(file_object) for file_object in file_object_list] + df["key"] = [file_object.full_name for file_object in file_object_list] + df["objects"] = file_object_list + + # Now split by the batching settings + df_period = df["dfp_timestamp"].dt.to_period(self._period) + + period_gb = df.groupby(df_period) + + output_batches = [] + + n_groups = len(period_gb) + for group in period_gb.groups: + period_df = period_gb.get_group(group) + + obj_list = fsspec.core.OpenFiles(period_df["objects"].to_list(), mode=file_objects.mode, fs=file_objects.fs) + + output_batches.append((obj_list, n_groups)) + + return output_batches + + def _build_single(self, builder: srf.Builder, input_stream: StreamPair) -> StreamPair: + + def node_fn(obs: srf.Observable, sub: srf.Subscriber): + obs.pipe(ops.map(self.on_data), ops.flatten()).subscribe(sub) + + stream = builder.make_node_full(self.unique_name, node_fn) + builder.make_edge(input_stream[0], stream) + + return stream, typing.List[fsspec.core.OpenFiles] diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_file_to_df.py b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_file_to_df.py new file mode 100644 index 0000000000..dbd2416aaa --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_file_to_df.py @@ -0,0 +1,257 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
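A sketch of how `DFPFileBatcherStage` (defined above) might be configured. The file-name layout and the regex in the callback are assumptions for illustration; the stage only requires that `date_conversion_func` map an fsspec file object to a `datetime`:

import re
from datetime import datetime

import fsspec

# Assumed naming convention, e.g. ".../AUTH_LOG-2022-08-01T03_02_04.json"
iso_date_regex = re.compile(r"(?P<date>\d{4}-\d{2}-\d{2})")


def date_extractor(file_object: fsspec.core.OpenFile) -> datetime:
    # Called once per file; the result decides which period ("D" = day) the file is grouped into
    match = iso_date_regex.search(file_object.full_name)
    return datetime.strptime(match.group("date"), "%Y-%m-%d")


# pipeline.add_stage(DFPFileBatcherStage(config, date_conversion_func=date_extractor, period="D"))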
+ +import hashlib +import json +import logging +import multiprocessing as mp +import os +import time +import typing +from functools import partial + +import fsspec +import pandas as pd +import srf +from srf.core import operators as ops + +import dask +from dask.distributed import Client +from dask.distributed import LocalCluster + +import cudf + +from morpheus._lib.file_types import FileTypes +from morpheus.config import Config +from morpheus.io.deserializers import read_file_to_df +from morpheus.pipeline.single_port_stage import SinglePortStage +from morpheus.pipeline.stream_pair import StreamPair + +from ..utils.column_info import DataFrameInputSchema +from ..utils.column_info import process_dataframe + +logger = logging.getLogger("morpheus.{}".format(__name__)) + + +def _single_object_to_dataframe(file_object: fsspec.core.OpenFile, + schema: DataFrameInputSchema, + file_type: FileTypes, + filter_null: bool, + parser_kwargs: dict): + + retries = 0 + s3_df = None + while (retries < 2): + try: + with file_object as f: + s3_df = read_file_to_df(f, + file_type, + filter_nulls=filter_null, + df_type="pandas", + parser_kwargs=parser_kwargs) + + break + except Exception as e: + if (retries < 2): + logger.warning("Refreshing S3 credentials") + # cred_refresh() + retries += 1 + else: + raise e + + # Run the pre-processing before returning + if (s3_df is None): + return s3_df + + s3_df = process_dataframe(df_in=s3_df, input_schema=schema) + + return s3_df + + +class DFPFileToDataFrameStage(SinglePortStage): + + def __init__(self, + c: Config, + schema: DataFrameInputSchema, + filter_null: bool = True, + file_type: FileTypes = FileTypes.Auto, + parser_kwargs: dict = None, + cache_dir: str = "./.cache/dfp"): + super().__init__(c) + + self._schema = schema + + self._batch_size = 10 + self._batch_cache = [] + self._file_type = file_type + self._filter_null = filter_null + self._parser_kwargs = {} if parser_kwargs is None else parser_kwargs + self._cache_dir = os.path.join(cache_dir, "file_cache") + + self._dask_cluster: Client = None + + self._download_method: typing.Literal["single_thread", "multiprocess", "dask", + "dask_thread"] = os.environ.get("FILE_DOWNLOAD_TYPE", "dask_thread") + + @property + def name(self) -> str: + return "dfp-s3-to-df" + + def supports_cpp_node(self): + return False + + def accepted_types(self) -> typing.Tuple: + return (typing.Any, ) + + def _get_dask_cluster(self): + + if (self._dask_cluster is None): + logger.debug("Creating dask cluster...") + + # Up the heartbeat interval which can get violated with long download times + dask.config.set({"distributed.client.heartbeat": "30s"}) + + self._dask_cluster = LocalCluster(start=True, processes=not self._download_method == "dask_thread") + + logger.debug("Creating dask cluster... Done. Dashboard: %s", self._dask_cluster.dashboard_link) + + return self._dask_cluster + + def _close_dask_cluster(self): + if (self._dask_cluster is not None): + logger.debug("Stopping dask cluster...") + + self._dask_cluster.close() + + self._dask_cluster = None + + logger.debug("Stopping dask cluster... Done.") + + def _get_or_create_dataframe_from_s3_batch( + self, file_object_batch: typing.Tuple[fsspec.core.OpenFiles, int]) -> typing.Tuple[cudf.DataFrame, bool]: + + if (not file_object_batch): + return None, False + + file_list = file_object_batch[0] + batch_count = file_object_batch[1] + + fs: fsspec.AbstractFileSystem = file_list.fs + + # Create a list of dictionaries that only contains the information we are interested in hashing. 
`ukey` just + # hashes all of the output of `info()` which is perfect + hash_data = [{"ukey": fs.ukey(file_object.path)} for file_object in file_list] + + # Convert to base 64 encoding to remove - values + objects_hash_hex = hashlib.md5(json.dumps(hash_data, sort_keys=True).encode()).hexdigest() + + batch_cache_location = os.path.join(self._cache_dir, "batches", f"{objects_hash_hex}.pkl") + + # Return the cache if it exists + if (os.path.exists(batch_cache_location)): + output_df = pd.read_pickle(batch_cache_location) + output_df["origin_hash"] = objects_hash_hex + output_df["batch_count"] = batch_count + + return (output_df, True) + + # Cache miss + download_method = partial(_single_object_to_dataframe, + schema=self._schema, + file_type=self._file_type, + filter_null=self._filter_null, + parser_kwargs=self._parser_kwargs) + + download_buckets = file_list + + # Loop over dataframes and concat into one + try: + dfs = [] + if (self._download_method.startswith("dask")): + + # Create the client each time to ensure all connections to the cluster are closed (they can time out) + with Client(self._get_dask_cluster()) as client: + dfs = client.map(download_method, download_buckets) + + dfs = client.gather(dfs) + + elif (self._download_method == "multiprocessing"): + # Use multiprocessing here since parallel downloads are a pain + with mp.get_context("spawn").Pool(mp.cpu_count()) as p: + dfs = p.map(download_method, download_buckets) + else: + # Simply loop + for s3_object in download_buckets: + dfs.append(download_method(s3_object)) + + except Exception: + logger.exception("Failed to download logs. Error: ", exc_info=True) + return None, False + + if (not dfs): + logger.error("No logs were downloaded") + return None, False + + output_df: pd.DataFrame = pd.concat(dfs) + + # Finally sort by timestamp and then reset the index + output_df.sort_values(by=["timestamp"], inplace=True) + + output_df.reset_index(drop=True, inplace=True) + + # Save dataframe to cache future runs + os.makedirs(os.path.dirname(batch_cache_location), exist_ok=True) + + try: + output_df.to_pickle(batch_cache_location) + except Exception: + logger.warning("Failed to save batch cache. Skipping cache for this batch.", exc_info=True) + + output_df["batch_count"] = batch_count + output_df["origin_hash"] = objects_hash_hex + + return (output_df, False) + + def convert_to_dataframe(self, s3_object_batch: typing.Tuple[fsspec.core.OpenFiles, int]): + if (not s3_object_batch): + return None + + start_time = time.time() + + try: + + output_df, cache_hit = self._get_or_create_dataframe_from_s3_batch(s3_object_batch) + + duration = (time.time() - start_time) * 1000.0 + + logger.debug("S3 objects to DF complete. 
Rows: %s, Cache: %s, Duration: %s ms", + len(output_df), + "hit" if cache_hit else "miss", + duration) + + return output_df + except Exception: + logger.exception("Error while converting S3 buckets to DF.") + self._get_or_create_dataframe_from_s3_batch(s3_object_batch) + raise + + def _build_single(self, builder: srf.Builder, input_stream: StreamPair) -> StreamPair: + + def node_fn(obs: srf.Observable, sub: srf.Subscriber): + obs.pipe(ops.map(self.convert_to_dataframe), ops.on_completed(self._close_dask_cluster)).subscribe(sub) + + stream = builder.make_node_full(self.unique_name, node_fn) + builder.make_edge(input_stream[0], stream) + + return stream, cudf.DataFrame diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_inference_stage.py b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_inference_stage.py new file mode 100644 index 0000000000..2eae83f13d --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_inference_stage.py @@ -0,0 +1,121 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import time +import typing + +import srf +from mlflow.tracking.client import MlflowClient + +from morpheus.config import Config +from morpheus.messages.multi_ae_message import MultiAEMessage +from morpheus.pipeline.single_port_stage import SinglePortStage +from morpheus.pipeline.stream_pair import StreamPair + +from ..messages.multi_dfp_message import MultiDFPMessage +from ..utils.model_cache import ModelCache +from ..utils.model_cache import ModelManager +from ..utils.model_cache import UserModelMap + +logger = logging.getLogger("morpheus.{}".format(__name__)) + + +class DFPInferenceStage(SinglePortStage): + + def __init__(self, c: Config, model_name_formatter: str = ""): + super().__init__(c) + + self._client = MlflowClient() + self._fallback_user = self._config.ae.fallback_username + self._model_name_formatter = model_name_formatter + + self._users_to_models: typing.Dict[str, UserModelMap] = {} + + self._model_cache: typing.Dict[str, ModelCache] = {} + self._model_cache_size_max = 10 + + self._cache_timeout_sec = 600 + + self._model_manager = ModelManager(model_name_formatter=model_name_formatter) + + @property + def name(self) -> str: + return "dfp-inference" + + def supports_cpp_node(self): + return False + + def accepted_types(self) -> typing.Tuple: + return (MultiDFPMessage, ) + + def get_model(self, user: str) -> ModelCache: + + return self._model_manager.load_user_model(self._client, user_id=user, fallback_user_ids=[self._fallback_user]) + + def on_data(self, message: MultiDFPMessage): + if (not message or message.mess_count == 0): + return None + + start_time = time.time() + + df_user = message.get_meta() + user_id = message.user_id + + try: + model_cache = self.get_model(user_id) + + if (model_cache is None): + raise RuntimeError("Could not find model for user {}".format(user_id)) + + loaded_model = model_cache.load_model(self._client) + + except Exception: 
# TODO + logger.exception("Error trying to get model") + return None + + post_model_time = time.time() + + results_df = loaded_model.get_results(df_user, return_abs=True) + + # Create an output message to allow setting meta + output_message = MultiAEMessage(message.meta, + mess_offset=message.mess_offset, + mess_count=message.mess_count, + model=loaded_model) + + output_message.set_meta(list(results_df.columns), results_df) + + output_message.set_meta('model_version', f"{model_cache.reg_model_name}:{model_cache.reg_model_version}") + + if logger.isEnabledFor(logging.DEBUG): + load_model_duration = (post_model_time - start_time) * 1000.0 + get_anomaly_duration = (time.time() - post_model_time) * 1000.0 + + logger.debug("Completed inference for user %s. Model load: %s ms, Model infer: %s ms. Start: %s, End: %s", + user_id, + load_model_duration, + get_anomaly_duration, + df_user[self._config.ae.timestamp_column_name].min(), + df_user[self._config.ae.timestamp_column_name].max()) + + return output_message + + def _build_single(self, builder: srf.Builder, input_stream: StreamPair) -> StreamPair: + node = builder.make_node(self.unique_name, self.on_data) + builder.make_edge(input_stream[0], node) + + # node.launch_options.pe_count = self._config.num_threads + + return node, MultiAEMessage diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_mlflow_model_writer.py b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_mlflow_model_writer.py new file mode 100644 index 0000000000..e0cf1e5e89 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_mlflow_model_writer.py @@ -0,0 +1,262 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
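A sketch of how `DFPInferenceStage` (defined above) is typically constructed, assuming a pre-built `config` object; the formatter value below is an illustrative assumption normally supplied by the example's pipeline script:

# "{user_id}" is expanded per user when looking up the MLflow registered model.
# If no per-user model exists, the ModelManager falls back to the model registered
# for config.ae.fallback_username (the generic user).
inference_stage = DFPInferenceStage(config, model_name_formatter="DFP-duo-{user_id}")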
+ +import hashlib +import logging +import os +import typing +import urllib.parse + +import mlflow +import requests +import srf +from dfencoder import AutoEncoder +from mlflow.exceptions import MlflowException +from mlflow.models.signature import ModelSignature +from mlflow.protos.databricks_pb2 import RESOURCE_ALREADY_EXISTS +from mlflow.protos.databricks_pb2 import ErrorCode +from mlflow.store.artifact.runs_artifact_repo import RunsArtifactRepository +from mlflow.tracking import MlflowClient +from mlflow.types import ColSpec +from mlflow.types import Schema +from mlflow.types.utils import _infer_pandas_column +from mlflow.types.utils import _infer_schema +from srf.core import operators as ops + +from morpheus.config import Config +from morpheus.messages.multi_ae_message import MultiAEMessage +from morpheus.pipeline.single_port_stage import SinglePortStage +from morpheus.pipeline.stream_pair import StreamPair + +# Setup conda environment +conda_env = { + 'channels': ['defaults', 'conda-forge'], + 'dependencies': ['python={}'.format('3.8'), 'pip'], + 'pip': ['mlflow', 'dfencoder'], + 'name': 'mlflow-env' +} + +logger = logging.getLogger(f"morpheus.{__name__}") + + +class DFPMLFlowModelWriterStage(SinglePortStage): + + def __init__(self, + c: Config, + model_name_formatter: str = "dfp-{user_id}", + experiment_name_formatter: str = "/dfp-models/{reg_model_name}", + databricks_permissions: dict = None): + super().__init__(c) + + self._model_name_formatter = model_name_formatter + self._experiment_name_formatter = experiment_name_formatter + self._databricks_permissions = databricks_permissions + + @property + def name(self) -> str: + return "dfp-mlflow-model-writer" + + def supports_cpp_node(self): + return False + + def accepted_types(self) -> typing.Tuple: + return (MultiAEMessage, ) + + def user_id_to_model(self, user_id: str): + + kwargs = { + "user_id": user_id, + "user_md5": hashlib.md5(user_id.encode('utf-8')).hexdigest(), + } + + return self._model_name_formatter.format(**kwargs) + + def user_id_to_experiment(self, user_id: str): + kwargs = { + "user_id": user_id, + "user_md5": hashlib.md5(user_id.encode('utf-8')).hexdigest(), + "reg_model_name": self.user_id_to_model(user_id=user_id) + } + + return self._experiment_name_formatter.format(**kwargs) + + def _apply_model_permissions(self, reg_model_name: str): + + # Check the required variables + databricks_host = os.environ.get("DATABRICKS_HOST", None) + databricks_token = os.environ.get("DATABRICKS_TOKEN", None) + + if (databricks_host is None or databricks_token is None): + raise RuntimeError("Cannot set Databricks model permissions. " + "Environment variables `DATABRICKS_HOST` and `DATABRICKS_TOKEN` must be set") + + headers = {"Authorization": f"Bearer {databricks_token}"} + + url_base = f"{databricks_host}" + + try: + # First get the registered model ID + get_registered_model_url = urllib.parse.urljoin(url_base, + "/api/2.0/mlflow/databricks/registered-models/get") + + get_registered_model_response = requests.get(url=get_registered_model_url, + headers=headers, + params={"name": reg_model_name}) + + registered_model_response = get_registered_model_response.json() + + reg_model_id = registered_model_response["registered_model_databricks"]["id"] + + # Now apply the permissions. 
If it exists already, it will be overwritten or it is a no-op
+            patch_registered_model_permissions_url = urllib.parse.urljoin(
+                url_base, f"/api/2.0/preview/permissions/registered-models/{reg_model_id}")
+
+            patch_registered_model_permissions_body = {
+                "access_control_list": [{
+                    "group_name": group, "permission_level": permission
+                } for group, permission in self._databricks_permissions.items()]
+            }
+
+            requests.patch(url=patch_registered_model_permissions_url,
+                           headers=headers,
+                           json=patch_registered_model_permissions_body)
+
+        except Exception:
+            logger.exception("Error occurred trying to apply model permissions to model: %s",
+                             reg_model_name,
+                             exc_info=True)
+
+    def on_data(self, message: MultiAEMessage):
+
+        user = message.meta.user_id
+
+        model: AutoEncoder = message.model
+
+        model_path = "dfencoder"
+        reg_model_name = self.user_id_to_model(user_id=user)
+
+        # Write to ML Flow
+        try:
+            mlflow.end_run()
+
+            experiment_name = self.user_id_to_experiment(user_id=user)
+
+            # Creates a new experiment if it doesn't exist
+            experiment = mlflow.set_experiment(experiment_name)
+
+            with mlflow.start_run(run_name="Duo autoencoder model training run",
+                                  experiment_id=experiment.experiment_id) as run:
+
+                model_path = f"{model_path}-{run.info.run_uuid}"
+
+                # Log all params in one dict to avoid round trips
+                mlflow.log_params({
+                    "Algorithm": "Denoising Autoencoder",
+                    "Epochs": model.lr_decay.state_dict().get("last_epoch", "unknown"),
+                    "Learning rate": model.lr,
+                    "Batch size": model.batch_size,
+                    "Start Epoch": message.get_meta("timestamp").min(),
+                    "End Epoch": message.get_meta("timestamp").max(),
+                    "Log Count": message.mess_count,
+                })
+
+                metrics_dict: typing.Dict[str, float] = {}
+
+                # Add info on the embeddings
+                for k, v in model.categorical_fts.items():
+                    embedding = v.get("embedding", None)
+
+                    if (embedding is None):
+                        continue
+
+                    metrics_dict[f"embedding-{k}-num_embeddings"] = embedding.num_embeddings
+                    metrics_dict[f"embedding-{k}-embedding_dim"] = embedding.embedding_dim
+
+                # Add metrics for all of the loss stats
+                if (hasattr(model, "feature_loss_stats")):
+                    for k, v in model.feature_loss_stats.items():
+                        metrics_dict[f"loss-{k}-mean"] = v.get("mean", "unknown")
+                        metrics_dict[f"loss-{k}-std"] = v.get("std", "unknown")
+
+                mlflow.log_metrics(metrics_dict)
+
+                # Use the prepare_df function to set up the direct inputs to the model.
Only include features returned by + # prepare_df to show the actual inputs to the model (any extra are discarded) + input_df = message.get_meta().iloc[0:1] + prepared_df = model.prepare_df(input_df) + output_values = model.get_anomaly_score(input_df) + + input_schema = Schema([ + ColSpec(type=_infer_pandas_column(input_df[col_name]), name=col_name) + for col_name in list(prepared_df.columns) + ]) + output_schema = _infer_schema(output_values) + + model_sig = ModelSignature(inputs=input_schema, outputs=output_schema) + + model_info = mlflow.pytorch.log_model( + pytorch_model=model, + artifact_path=model_path, + conda_env=conda_env, + signature=model_sig, + ) + + client = MlflowClient() + + # First ensure a registered model has been created + try: + create_model_response = client.create_registered_model(reg_model_name) + logger.debug("Successfully registered model '%s'.", create_model_response.name) + except MlflowException as e: + if e.error_code == ErrorCode.Name(RESOURCE_ALREADY_EXISTS): + pass + else: + raise e + + # If we are using databricks, make sure we set the correct permissions + if (self._databricks_permissions is not None and mlflow.get_tracking_uri() == "databricks"): + # Need to apply permissions + self._apply_model_permissions(reg_model_name=reg_model_name) + + model_src = RunsArtifactRepository.get_underlying_uri(model_info.model_uri) + + tags = { + "start": message.get_meta(self._config.ae.timestamp_column_name).min(), + "end": message.get_meta(self._config.ae.timestamp_column_name).max(), + "count": message.get_meta(self._config.ae.timestamp_column_name).count() + } + + # Now create the model version + mv = client.create_model_version(name=reg_model_name, + source=model_src, + run_id=run.info.run_id, + tags=tags) + + logger.debug("ML Flow model upload complete: %s:%s:%s", user, reg_model_name, mv.version) + + except Exception: + logger.exception("Error uploading model to ML Flow", exc_info=True) + + return message + + def _build_single(self, builder: srf.Builder, input_stream: StreamPair) -> StreamPair: + + def node_fn(obs: srf.Observable, sub: srf.Subscriber): + obs.pipe(ops.map(self.on_data)).subscribe(sub) + + stream = builder.make_node_full(self.unique_name, node_fn) + builder.make_edge(input_stream[0], stream) + + return stream, MultiAEMessage diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_postprocessing_stage.py b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_postprocessing_stage.py new file mode 100644 index 0000000000..f0a23eab73 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_postprocessing_stage.py @@ -0,0 +1,97 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
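The name formatters in `dfp_mlflow_model_writer.py` above accept `{user_id}`, `{user_md5}` and, for experiments, `{reg_model_name}`. A quick sketch of how the default formatters expand, assuming a pre-built `config` object and an illustrative user id:

writer = DFPMLFlowModelWriterStage(config,
                                   model_name_formatter="dfp-{user_id}",
                                   experiment_name_formatter="/dfp-models/{reg_model_name}")

assert writer.user_id_to_model("alice") == "dfp-alice"
assert writer.user_id_to_experiment("alice") == "/dfp-models/dfp-alice"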
+ +import logging +import time +import typing +from datetime import datetime + +import numpy as np +import srf +from srf.core import operators as ops + +from morpheus.config import Config +from morpheus.messages.multi_ae_message import MultiAEMessage +from morpheus.pipeline.single_port_stage import SinglePortStage +from morpheus.pipeline.stream_pair import StreamPair + +from ..messages.multi_dfp_message import DFPMessageMeta + +logger = logging.getLogger("morpheus.{}".format(__name__)) + + +class DFPPostprocessingStage(SinglePortStage): + + def __init__(self, c: Config, z_score_threshold=2.0): + super().__init__(c) + + self._z_score_threshold = z_score_threshold + + @property + def name(self) -> str: + return "dfp-postproc" + + def supports_cpp_node(self): + return False + + def accepted_types(self) -> typing.Tuple: + return (MultiAEMessage, ) + + def _extract_events(self, message: MultiAEMessage): + + # Return the message for the next stage + z_scores = message.get_meta("mean_abs_z") + + above_threshold_df = message.get_meta()[z_scores > self._z_score_threshold] + + if (not above_threshold_df.empty): + above_threshold_df['event_time'] = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ') + above_threshold_df = above_threshold_df.replace(np.nan, 'NaN', regex=True) + + return above_threshold_df + + return None + + def on_data(self, message: MultiAEMessage): + if (not message): + return None + + start_time = time.time() + + extracted_events = self._extract_events(message) + + duration = (time.time() - start_time) * 1000.0 + + if logger.isEnabledFor(logging.DEBUG): + logger.debug("Completed postprocessing for user %s in %s ms. Event count: %s. Start: %s, End: %s", + message.meta.user_id, + duration, + 0 if extracted_events is None else len(extracted_events), + message.get_meta(self._config.ae.timestamp_column_name).min(), + message.get_meta(self._config.ae.timestamp_column_name).max()) + + if (extracted_events is None): + return None + + return DFPMessageMeta(extracted_events, user_id=message.meta.user_id) + + def _build_single(self, builder: srf.Builder, input_stream: StreamPair) -> StreamPair: + + def node_fn(obs: srf.Observable, sub: srf.Subscriber): + obs.pipe(ops.map(self.on_data), ops.filter(lambda x: x is not None)).subscribe(sub) + + stream = builder.make_node_full(self.unique_name, node_fn) + builder.make_edge(input_stream[0], stream) + + return stream, DFPMessageMeta diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_preprocessing_stage.py b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_preprocessing_stage.py new file mode 100644 index 0000000000..b7fa2eafb5 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_preprocessing_stage.py @@ -0,0 +1,94 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
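The event extraction in `dfp_postprocessing_stage.py` above reduces to a threshold filter on the `mean_abs_z` column produced by inference. A standalone pandas sketch of the same operation (data values are illustrative):

import numpy as np
import pandas as pd

df = pd.DataFrame({"user": ["a", "b", "c"], "mean_abs_z": [0.5, 3.1, 2.4]})

events = df[df["mean_abs_z"] > 2.0].copy()  # 2.0 is the default z_score_threshold
events["event_time"] = pd.Timestamp.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
events = events.replace(np.nan, "NaN", regex=True)  # mirrors the NaN handling above
# -> rows for users "b" and "c" remain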
+ +import logging +import time +import typing + +import pandas as pd +import srf +from srf.core import operators as ops + +from morpheus.config import Config +from morpheus.pipeline.single_port_stage import SinglePortStage +from morpheus.pipeline.stream_pair import StreamPair + +from ..messages.multi_dfp_message import MultiDFPMessage +from ..utils.column_info import DataFrameInputSchema +from ..utils.column_info import process_dataframe + +logger = logging.getLogger("morpheus.{}".format(__name__)) + + +class DFPPreprocessingStage(SinglePortStage): + + def __init__(self, + c: Config, + input_schema: DataFrameInputSchema, + return_format: str = "data", + only_new_batches=False): + super().__init__(c) + + self._cache_ids = [] + self._input_schema = input_schema + self._df_user_frames = pd.DataFrame(columns=("username", "frame_path")) + self._cache_path = "preprocessing" + self._return_format = return_format + self._only_new_batches = only_new_batches + + @property + def name(self) -> str: + return "dfp-preproc" + + def supports_cpp_node(self): + return False + + def accepted_types(self) -> typing.Tuple: + return (MultiDFPMessage, ) + + def process_features(self, message: MultiDFPMessage): + if (message is None): + return None + + start_time = time.time() + + # Process the columns + df_processed = process_dataframe(message.get_meta_dataframe(), self._input_schema) + + # Apply the new dataframe, only the rows in the offset + message.set_meta(list(df_processed.columns), + df_processed.iloc[message.mess_offset:message.mess_offset + message.mess_count]) + + if logger.isEnabledFor(logging.DEBUG): + duration = (time.time() - start_time) * 1000.0 + + logger.debug("Preprocessed %s data for logs in %s to %s in %s ms", + message.mess_count, + message.get_meta(self._config.ae.timestamp_column_name).min(), + message.get_meta(self._config.ae.timestamp_column_name).max(), + duration) + + return message + + def _build_single(self, builder: srf.Builder, input_stream: StreamPair) -> StreamPair: + + def node_fn(obs: srf.Observable, sub: srf.Subscriber): + obs.pipe(ops.map(self.process_features)).subscribe(sub) + + node = builder.make_node_full(self.unique_name, node_fn) + builder.make_edge(input_stream[0], node) + + # node.launch_options.pe_count = self._config.num_threads + + return node, MultiDFPMessage diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_rolling_window_stage.py b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_rolling_window_stage.py new file mode 100644 index 0000000000..66c0f49b49 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_rolling_window_stage.py @@ -0,0 +1,326 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import dataclasses +import logging +import os +import pickle +import typing +from contextlib import contextmanager +from datetime import datetime +from datetime import timedelta +from datetime import timezone + +import pandas as pd +import srf +from srf.core import operators as ops + +from morpheus.config import Config +from morpheus.pipeline.single_port_stage import SinglePortStage +from morpheus.pipeline.stream_pair import StreamPair + +from ..messages.multi_dfp_message import DFPMessageMeta +from ..messages.multi_dfp_message import MultiDFPMessage +from ..utils.logging_timer import log_time + +# Setup conda environment +conda_env = { + 'channels': ['defaults', 'conda-forge'], + 'dependencies': ['python={}'.format('3.8'), 'pip'], + 'pip': ['mlflow', 'dfencoder'], + 'name': 'mlflow-env' +} + +logger = logging.getLogger("morpheus.{}".format(__name__)) + + +@dataclasses.dataclass +class CachedUserWindow: + user_id: str + cache_location: str + timestamp_column: str = "timestamp" + total_count: int = 0 + count: int = 0 + min_epoch: datetime = datetime(1970, 1, 1, tzinfo=timezone(timedelta(hours=0))) + max_epoch: datetime = datetime(1970, 1, 1, tzinfo=timezone(timedelta(hours=0))) + batch_count: int = 0 + pending_batch_count: int = 0 + last_train_count: int = 0 + last_train_epoch: datetime = None + last_train_batch: int = 0 + + _trained_rows: pd.Series = dataclasses.field(init=False, repr=False, default_factory=pd.DataFrame) + _df: pd.DataFrame = dataclasses.field(init=False, repr=False, default_factory=pd.DataFrame) + + def append_dataframe(self, incoming_df: pd.DataFrame) -> bool: + + # # Get the row hashes + # row_hashes = pd.util.hash_pandas_object(incoming_df) + + # Filter the incoming df by epochs later than the current max_epoch + filtered_df = incoming_df[incoming_df["timestamp"] > self.max_epoch] + + if (len(filtered_df) == 0): + # We have nothing new to add. Double check that we fit within the window + before_history = incoming_df[incoming_df["timestamp"] < self.min_epoch] + + return len(before_history) == 0 + + # Increment the batch count + self.batch_count += 1 + self.pending_batch_count += 1 + + # Set the filtered index + filtered_df.index = range(self.total_count, self.total_count + len(filtered_df)) + + # Save the row hash to make it easier to find later. 
Do this before the batch so it doesnt participate + filtered_df["_row_hash"] = pd.util.hash_pandas_object(filtered_df, index=False) + + # Use batch id to distinguish groups in the same dataframe + filtered_df["_batch_id"] = self.batch_count + + # Append just the new rows + self._df = pd.concat([self._df, filtered_df]) + + self.total_count += len(filtered_df) + self.count = len(self._df) + + if (len(self._df) > 0): + self.min_epoch = self._df[self.timestamp_column].min() + self.max_epoch = self._df[self.timestamp_column].max() + + return True + + def get_train_df(self, max_history) -> pd.DataFrame: + + new_df = self.trim_dataframe(self._df, + max_history=max_history, + last_batch=self.batch_count - self.pending_batch_count, + timestamp_column=self.timestamp_column) + + self.last_train_count = self.total_count + self.last_train_epoch = datetime.now() + self.last_train_batch = self.batch_count + self.pending_batch_count = 0 + + self._df = new_df + + if (len(self._df) > 0): + self.min_epoch = self._df[self.timestamp_column].min() + self.max_epoch = self._df[self.timestamp_column].max() + + return new_df + + def save(self): + + # Make sure the directories exist + os.makedirs(os.path.dirname(self.cache_location), exist_ok=True) + + with open(self.cache_location, "wb") as f: + pickle.dump(self, f) + + @staticmethod + def trim_dataframe(df: pd.DataFrame, + max_history: typing.Union[int, str], + last_batch: int, + timestamp_column: str = "timestamp") -> pd.DataFrame: + if (max_history is None): + return df + + # Want to ensure we always see data once. So any new data is preserved + new_batches = df[df["_batch_id"] > last_batch] + + # See if max history is an int + if (isinstance(max_history, int)): + return df.tail(max(max_history, len(new_batches))) + + # If its a string, then its a duration + if (isinstance(max_history, str)): + # Get the latest timestamp + latest = df[timestamp_column].max() + + time_delta = pd.Timedelta(max_history) + + # Calc the earliest + earliest = min(latest - time_delta, new_batches[timestamp_column].min()) + + return df[df[timestamp_column] >= earliest] + + raise RuntimeError("Unsupported max_history") + + @staticmethod + def load(cache_location: str) -> "CachedUserWindow": + + with open(cache_location, "rb") as f: + return pickle.load(f) + + +class DFPRollingWindowStage(SinglePortStage): + + def __init__(self, + c: Config, + min_history: int, + min_increment: int, + max_history: typing.Union[int, str], + cache_dir: str = "./.cache/dfp"): + super().__init__(c) + + self._min_history = min_history + self._min_increment = min_increment + self._max_history = max_history + self._cache_dir = os.path.join(cache_dir, "rolling-user-data") + + # Map of user ids to total number of messages. 
Keeps indexes monotonic and increasing per user + self._user_cache_map: typing.Dict[str, CachedUserWindow] = {} + + @property + def name(self) -> str: + return "dfp-rolling-window" + + def supports_cpp_node(self): + return False + + def accepted_types(self) -> typing.Tuple: + return (DFPMessageMeta, ) + + def _trim_dataframe(self, df: pd.DataFrame): + + if (self._max_history is None): + return df + + # See if max history is an int + if (isinstance(self._max_history, int)): + return df.tail(self._max_history) + + # If its a string, then its a duration + if (isinstance(self._max_history, str)): + # Get the latest timestamp + latest = df[self._config.ae.timestamp_column_name].max() + + time_delta = pd.Timedelta(self._max_history) + + # Calc the earliest + earliest = latest - time_delta + + return df[df['timestamp'] >= earliest] + + raise RuntimeError("Unsupported max_history") + + @contextmanager + def _get_user_cache(self, user_id: str): + + # Determine cache location + cache_location = os.path.join(self._cache_dir, f"{user_id}.pkl") + + user_cache = None + + user_cache = self._user_cache_map.get(user_id, None) + + if (user_cache is None): + user_cache = CachedUserWindow(user_id=user_id, + cache_location=cache_location, + timestamp_column=self._config.ae.timestamp_column_name) + + self._user_cache_map[user_id] = user_cache + + yield user_cache + + # # When it returns, make sure to save + # user_cache.save() + + def _build_window(self, message: DFPMessageMeta) -> MultiDFPMessage: + + user_id = message.user_id + + with self._get_user_cache(user_id) as user_cache: + + incoming_df = message.get_df() + # existing_df = user_cache.df + + if (not user_cache.append_dataframe(incoming_df=incoming_df)): + # Then our incoming dataframe wasnt even covered by the window. Generate warning + logger.warn(("Incoming data preceeded existing history. " + "Consider deleting the rolling window cache and restarting.")) + return None + + # Exit early if we dont have enough data + if (user_cache.count < self._min_history): + return None + + # We have enough data, but has enough time since the last training taken place? + if (user_cache.total_count - user_cache.last_train_count < self._min_increment): + return None + + # Save the last train statistics + train_df = user_cache.get_train_df(max_history=self._max_history) + + # Hash the incoming data rows to find a match + incoming_hash = pd.util.hash_pandas_object(incoming_df.iloc[[0, -1]], index=False) + + # Find the index of the first and last row + match = train_df[train_df["_row_hash"] == incoming_hash.iloc[0]] + + if (len(match) == 0): + raise RuntimeError("Invalid rolling window") + + first_row_idx = match.index[0].item() + last_row_idx = train_df[train_df["_row_hash"] == incoming_hash.iloc[-1]].index[-1].item() + + found_count = (last_row_idx - first_row_idx) + 1 + + if (found_count != len(incoming_df)): + raise RuntimeError(("Overlapping rolling history detected. " + "Rolling history can only be used with non-overlapping batches")) + + train_offset = train_df.index.get_loc(first_row_idx) + + # Otherwise return a new message + return MultiDFPMessage(meta=DFPMessageMeta(df=train_df, user_id=user_id), + mess_offset=train_offset, + mess_count=found_count) + + def on_data(self, message: DFPMessageMeta): + + with log_time(logger.debug) as log_info: + + result = self._build_window(message) + + if (result is not None): + + log_info.set_log( + ("Rolling window complete for %s in {duration:0.2f} ms. " + "Input: %s rows from %s to %s. 
Output: %s rows from %s to %s"), + message.user_id, + len(message.df), + message.df[self._config.ae.timestamp_column_name].min(), + message.df[self._config.ae.timestamp_column_name].max(), + result.mess_count, + result.get_meta(self._config.ae.timestamp_column_name).min(), + result.get_meta(self._config.ae.timestamp_column_name).max(), + ) + else: + # Dont print anything + log_info.disable() + + return result + + def _build_single(self, builder: srf.Builder, input_stream: StreamPair) -> StreamPair: + + def node_fn(obs: srf.Observable, sub: srf.Subscriber): + obs.pipe(ops.map(self.on_data), ops.filter(lambda x: x is not None)).subscribe(sub) + + stream = builder.make_node_full(self.unique_name, node_fn) + builder.make_edge(input_stream[0], stream) + + return stream, MultiDFPMessage diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_split_users_stage.py b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_split_users_stage.py new file mode 100644 index 0000000000..1ac1334bbc --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_split_users_stage.py @@ -0,0 +1,139 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import typing + +import numpy as np +import srf +from srf.core import operators as ops + +import cudf + +from morpheus.config import Config +from morpheus.pipeline.single_port_stage import SinglePortStage +from morpheus.pipeline.stream_pair import StreamPair + +from ..messages.multi_dfp_message import DFPMessageMeta +from ..utils.logging_timer import log_time + +logger = logging.getLogger("morpheus.{}".format(__name__)) + + +class DFPSplitUsersStage(SinglePortStage): + + def __init__(self, + c: Config, + include_generic: bool, + include_individual: bool, + skip_users: typing.List[str] = None, + only_users: typing.List[str] = None): + super().__init__(c) + + self._include_generic = include_generic + self._include_individual = include_individual + self._skip_users = skip_users + self._only_users = only_users + + # Map of user ids to total number of messages. 
Keeps indexes monotonic and increasing per user + self._user_index_map: typing.Dict[str, int] = {} + + @property + def name(self) -> str: + return "dfp-split-users" + + def supports_cpp_node(self): + return False + + def accepted_types(self) -> typing.Tuple: + return (cudf.DataFrame, ) + + def extract_users(self, message: cudf.DataFrame): + if (message is None): + return [] + + with log_time(logger.debug) as log_info: + + if (isinstance(message, cudf.DataFrame)): + # Convert to pandas because cudf is slow at this + message = message.to_pandas() + + split_dataframes: typing.Dict[str, cudf.DataFrame] = {} + + # If we are skipping users, do that here + if (len(self._skip_users) > 0): + message = message[~message[self._config.ae.userid_column_name].isin(self._skip_users)] + + if (len(self._only_users) > 0): + message = message[message[self._config.ae.userid_column_name].isin(self._only_users)] + + # Split up the dataframes + if (self._include_generic): + split_dataframes[self._config.ae.fallback_username] = message + + if (self._include_individual): + + split_dataframes.update( + {username: user_df + for username, user_df in message.groupby("username", sort=False)}) + + output_messages: typing.List[DFPMessageMeta] = [] + + for user_id in sorted(split_dataframes.keys()): + + if (user_id in self._skip_users): + continue + + user_df = split_dataframes[user_id] + + current_user_count = self._user_index_map.get(user_id, 0) + + # Reset the index so that users see monotonically increasing indexes + user_df.index = range(current_user_count, current_user_count + len(user_df)) + self._user_index_map[user_id] = current_user_count + len(user_df) + + output_messages.append(DFPMessageMeta(df=user_df, user_id=user_id)) + + # logger.debug("Emitting dataframe for user '%s'. Start: %s, End: %s, Count: %s", + # user, + # df_user[self._config.ae.timestamp_column_name].min(), + # df_user[self._config.ae.timestamp_column_name].max(), + # df_user[self._config.ae.timestamp_column_name].count()) + + rows_per_user = [len(x.df) for x in output_messages] + + if (len(output_messages) > 0): + log_info.set_log( + ("Batch split users complete. Input: %s rows from %s to %s. " + "Output: %s users, rows/user min: %s, max: %s, avg: %.2f. Duration: {duration:.2f} ms"), + len(message), + message[self._config.ae.timestamp_column_name].min(), + message[self._config.ae.timestamp_column_name].max(), + len(rows_per_user), + np.min(rows_per_user), + np.max(rows_per_user), + np.mean(rows_per_user), + ) + + return output_messages + + def _build_single(self, builder: srf.Builder, input_stream: StreamPair) -> StreamPair: + + def node_fn(obs: srf.Observable, sub: srf.Subscriber): + obs.pipe(ops.map(self.extract_users), ops.flatten()).subscribe(sub) + + stream = builder.make_node_full(self.unique_name, node_fn) + builder.make_edge(input_stream[0], stream) + + return stream, DFPMessageMeta diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_training.py b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_training.py new file mode 100644 index 0000000000..29da9e9afd --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_training.py @@ -0,0 +1,81 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import typing + +import srf +from dfencoder import AutoEncoder +from srf.core import operators as ops + +from morpheus.config import Config +from morpheus.messages.multi_ae_message import MultiAEMessage +from morpheus.pipeline.single_port_stage import SinglePortStage +from morpheus.pipeline.stream_pair import StreamPair + +from ..messages.multi_dfp_message import MultiDFPMessage +from ..utils.user_model_manager import UserModelManager + +logger = logging.getLogger("morpheus.{}".format(__name__)) + + +class DFPTraining(SinglePortStage): + + def __init__(self, c: Config): + super().__init__(c) + + self._user_models: typing.Dict[str, UserModelManager] = {} + + @property + def name(self) -> str: + return "dfp-training" + + def supports_cpp_node(self): + return False + + def accepted_types(self) -> typing.Tuple: + return (MultiDFPMessage, ) + + def on_data(self, message: MultiDFPMessage): + if (message is None or message.mess_count == 0): + return None + + user_id = message.user_id + model_manager = UserModelManager(self._config, + user_id=user_id, + save_model=False, + epochs=30, + min_history=300, + max_history=-1, + seed=42, + model_class=AutoEncoder) + + model = model_manager.train(message.get_meta_dataframe()) + + output_message = MultiAEMessage(message.meta, + mess_offset=message.mess_offset, + mess_count=message.mess_count, + model=model) + + return output_message + + def _build_single(self, builder: srf.Builder, input_stream: StreamPair) -> StreamPair: + + def node_fn(obs: srf.Observable, sub: srf.Subscriber): + obs.pipe(ops.map(self.on_data), ops.filter(lambda x: x is not None)).subscribe(sub) + + stream = builder.make_node_full(self.unique_name, node_fn) + builder.make_edge(input_stream[0], stream) + + return stream, MultiAEMessage diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/stages/multi_file_source.py b/examples/digital_fingerprinting/production/morpheus/dfp/stages/multi_file_source.py new file mode 100644 index 0000000000..06276bbd50 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp/stages/multi_file_source.py @@ -0,0 +1,127 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
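A sketch of where `DFPTraining` (defined above) sits in the training pipeline relative to the other stages in this change. The pipeline object, `config`, `preprocess_schema`, and the numeric arguments are assumptions for illustration, not definitions made by these files:

# Rolling window -> preprocessing -> per-user training -> MLflow model writer
pipeline.add_stage(DFPRollingWindowStage(config, min_history=300, min_increment=300, max_history="60d"))
pipeline.add_stage(DFPPreprocessingStage(config, input_schema=preprocess_schema))  # schema built elsewhere
pipeline.add_stage(DFPTraining(config))
pipeline.add_stage(DFPMLFlowModelWriterStage(config, model_name_formatter="dfp-{user_id}"))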
+
+import logging
+import typing
+
+import fsspec
+import fsspec.utils
+import pandas as pd
+import srf
+
+from morpheus.config import Config
+from morpheus.io.deserializers import read_file_to_df
+from morpheus.pipeline.single_output_source import SingleOutputSource
+from morpheus.pipeline.stream_pair import StreamPair
+
+from ..utils.column_info import process_dataframe
+
+logger = logging.getLogger("morpheus.{}".format(__name__))
+
+
+class MultiFileSource(SingleOutputSource):
+    """
+    Source stage used to load messages from one or more files and dump the contents into the pipeline immediately.
+    Useful for testing performance and accuracy of a pipeline.
+
+    Parameters
+    ----------
+    c : `morpheus.config.Config`
+        Pipeline configuration instance.
+    filenames : typing.List[str]
+        List of file names or fsspec-compatible glob patterns/URLs (for example local paths or `s3://` URLs) from
+        which the messages will be read.
+    """
+
+    def __init__(
+        self,
+        c: Config,
+        filenames: typing.List[str],
+    ):
+        super().__init__(c)
+
+        self._batch_size = c.pipeline_batch_size
+
+        self._filenames = filenames
+
+        self._input_count = None
+        self._max_concurrent = c.num_threads
+
+    @property
+    def name(self) -> str:
+        return "from-multi-file"
+
+    @property
+    def input_count(self) -> int:
+        """Return None for no max input count"""
+        return self._input_count
+
+    def supports_cpp_node(self):
+        return False
+
+    def _generate_frames_fsspec(self):
+
+        files: fsspec.core.OpenFiles = fsspec.open_files(self._filenames, filecache={'cache_storage': './.cache/s3tmp'})
+
+        if (len(files) == 0):
+            raise RuntimeError(f"No files matched input strings: '{self._filenames}'.
" + "Check your input pattern and ensure any credentials are correct") + + yield files + + def _generate_frames(self): + + loaded_dfs = [] + + for f in self._filenames: + + # Read the dataframe into memory + df = read_file_to_df(f, + self._file_type, + filter_nulls=True, + df_type="pandas", + parser_kwargs=self._parser_kwargs) + + df = process_dataframe(df, self._input_schema) + + loaded_dfs.append(df) + + combined_df = pd.concat(loaded_dfs) + + print("Sending {} rows".format(len(combined_df))) + + yield combined_df + + def _build_source(self, builder: srf.Builder) -> StreamPair: + + if self._build_cpp_node(): + raise RuntimeError("Does not support C++ nodes") + else: + out_stream = builder.make_source(self.unique_name, self._generate_frames_fsspec()) + + return out_stream, fsspec.core.OpenFiles diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/stages/write_to_s3_stage.py b/examples/digital_fingerprinting/production/morpheus/dfp/stages/write_to_s3_stage.py new file mode 100644 index 0000000000..1fcf87678d --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp/stages/write_to_s3_stage.py @@ -0,0 +1,70 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import typing + +import srf + +from morpheus.config import Config +from morpheus.pipeline.single_port_stage import SinglePortStage +from morpheus.pipeline.stream_pair import StreamPair + + +class WriteToS3Stage(SinglePortStage): + """ + This class writes messages to an s3 bucket. + + Parameters + ---------- + c : `morpheus.config.Config` + Pipeline configuration instance. + bucket: str + Name of the s3 bucket to write to. + + """ + + def __init__(self, c: Config, s3_writer): + super().__init__(c) + + self._s3_writer = s3_writer + + @property + def name(self) -> str: + return "to-s3-bucket" + + def accepted_types(self) -> typing.Tuple: + """ + Returns accepted input types for this stage. + + Returns + ------- + typing.Tuple(`morpheus.pipeline.messages.MessageMeta`, ) + Accepted input types. + + """ + return (typing.Any, ) + + def supports_cpp_node(self): + return False + + def _build_single(self, builder: srf.Builder, input_stream: StreamPair) -> StreamPair: + stream = input_stream[0] + + node = builder.make_node(self.unique_name, self._s3_writer) + builder.make_edge(stream, node) + + stream = node + + # Return input unchanged to allow passthrough + return stream, input_stream[1] diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/utils/__init__.py b/examples/digital_fingerprinting/production/morpheus/dfp/utils/__init__.py new file mode 100644 index 0000000000..d11ef3c507 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp/utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/utils/column_info.py b/examples/digital_fingerprinting/production/morpheus/dfp/utils/column_info.py new file mode 100644 index 0000000000..3209d2b27e --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp/utils/column_info.py @@ -0,0 +1,280 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import dataclasses +import logging +import re +import typing +from datetime import datetime + +import pandas as pd + +import cudf + +logger = logging.getLogger("morpheus.{}".format(__name__)) + + +def create_increment_col(df, column_name: str, groupby_column="username", timestamp_column="timestamp"): + DEFAULT_DATE = '1970-01-01T00:00:00.000000+00:00' + + # Ensure we are pandas for this + if (isinstance(df, cudf.DataFrame)): + df = df.to_pandas() + + time_col = pd.to_datetime(df[timestamp_column], errors='coerce', utc=True).fillna(pd.to_datetime(DEFAULT_DATE)) + + per_day = time_col.dt.to_period("D") + + cat_col: pd.Series = df.groupby([per_day, groupby_column + ])[column_name].transform(lambda x: pd.factorize(x.fillna("nan"))[0] + 1) + + increment_col = pd.concat([cat_col, df[groupby_column]], + axis=1).groupby([per_day, groupby_column + ])[column_name].expanding(1).max().droplevel(0).droplevel(0) + + return increment_col + + +def column_listjoin(df, col_name): + if col_name in df: + return df[col_name].transform(lambda x: ",".join(x)).astype('string') + else: + return pd.Series(None, dtype='string') + + +@dataclasses.dataclass +class ColumnInfo: + name: str + dtype: str # The final type + + def get_pandas_dtype(self): + + if (issubclass(self.dtype, datetime)): + return "datetime64[ns]" + else: + return self.dtype + + def process_column(self, df: pd.DataFrame) -> pd.Series: + if (self.name not in df.columns): + return pd.Series(None, index=df.index, dtype=self.get_pandas_dtype()) + + return df[self.name] + + +@dataclasses.dataclass +class CustomColumn(ColumnInfo): + process_column_fn: typing.Callable + + def process_column(self, df: pd.DataFrame) -> pd.Series: + return self.process_column_fn(df) + + +@dataclasses.dataclass +class RenameColumn(ColumnInfo): + input_name: str + + def process_column(self, df: pd.DataFrame) -> pd.Series: + + if (self.input_name not in df.columns): + return pd.Series(None, index=df.index, dtype=self.get_pandas_dtype()) + + return df[self.input_name] + + +@dataclasses.dataclass +class BoolColumn(RenameColumn): + value_map: typing.Dict[str, bool] = dataclasses.field(init=False, default_factory=dict) + + true_value: dataclasses.InitVar[str] 
= None + false_value: dataclasses.InitVar[str] = None + + true_values: dataclasses.InitVar[typing.List[str]] = None + false_values: dataclasses.InitVar[typing.List[str]] = None + + def __post_init__(self, + true_value: str, + false_value: str, + true_values: typing.List[str], + false_values: typing.List[str]): + if (true_value is not None): + self.value_map.update({true_value: True}) + + if (false_value is not None): + self.value_map.update({false_value: False}) + + if (true_values is not None): + self.value_map.update({v: True for v in true_values}) + + if (false_values is not None): + self.value_map.update({v: False for v in false_values}) + + def process_column(self, df: pd.DataFrame) -> pd.Series: + return super().process_column(df).map(self.value_map).astype(bool) + + +@dataclasses.dataclass +class DateTimeColumn(RenameColumn): + + def process_column(self, df: pd.DataFrame) -> pd.Series: + return pd.to_datetime(super().process_column(df), infer_datetime_format=True, utc=True) + + +@dataclasses.dataclass +class StringJoinColumn(RenameColumn): + + sep: str + + def process_column(self, df: pd.DataFrame) -> pd.Series: + + return super().process_column(df).str.join(sep=self.sep) + + +@dataclasses.dataclass +class StringCatColumn(ColumnInfo): + + input_columns: typing.List[str] + sep: str + + def process_column(self, df: pd.DataFrame) -> pd.Series: + + first_col = df[self.input_columns[0]] + + return first_col.str.cat(others=df[self.input_columns[1:]], sep=self.sep) + + +@dataclasses.dataclass +class IncrementColumn(DateTimeColumn): + groupby_column: str + period: str = "D" + + def process_column(self, df: pd.DataFrame) -> pd.Series: + per_day = super().process_column(df).dt.to_period(self.period) + + # Create the per-user, per-day log count + return df.groupby([self.groupby_column, per_day]).cumcount() + + +@dataclasses.dataclass +class DataFrameInputSchema: + json_columns: typing.List[str] = dataclasses.field(default_factory=list) + column_info: typing.List[ColumnInfo] = dataclasses.field(default_factory=list) + preserve_columns: re.Pattern = dataclasses.field(default_factory=list) + row_filter: typing.Callable[[pd.DataFrame], pd.DataFrame] = None + + def __post_init__(self): + + input_preserve_columns = self.preserve_columns + + # Ensure preserve_columns is a list + if (not isinstance(input_preserve_columns, list)): + input_preserve_columns = [input_preserve_columns] + + # Compile the regex + if (input_preserve_columns is not None and len(input_preserve_columns) > 0): + input_preserve_columns = re.compile("({})".format("|".join(input_preserve_columns))) + else: + input_preserve_columns = None + + self.preserve_columns = input_preserve_columns + + +def _process_columns(df_in: pd.DataFrame, input_schema: DataFrameInputSchema): + + # TODO(MDD): See what causes this to have such a perf impact over using df_in + output_df = pd.DataFrame() + + # Iterate over the column info + for ci in input_schema.column_info: + try: + output_df[ci.name] = ci.process_column(df_in) + except Exception: + logger.exception("Failed to process column '%s'. 
Dataframe: \n%s", ci.name, df_in, exc_info=True) + raise + + if (input_schema.preserve_columns is not None): + # Get the list of remaining columns not already added + df_in_columns = set(df_in.columns) - set(output_df.columns) + + # Finally, keep any columns that match the preserve filters + match_columns = [y for y in df_in_columns if input_schema.preserve_columns.match(y)] + + output_df[match_columns] = df_in[match_columns] + + return output_df + + +def _normalize_dataframe(df_in: pd.DataFrame, input_schema: DataFrameInputSchema): + + if (input_schema.json_columns is None or len(input_schema.json_columns) == 0): + return df_in + + convert_to_cudf = False + + # Check if we are cudf + if (isinstance(df_in, cudf.DataFrame)): + df_in = df_in.to_pandas() + convert_to_cudf = True + + json_normalized = [] + remaining_columns = list(df_in.columns) + + for j_column in input_schema.json_columns: + + if (j_column not in remaining_columns): + continue + + normalized = pd.json_normalize(df_in[j_column]) + + # Prefix the columns + normalized.rename(columns={n: f"{j_column}.{n}" for n in normalized.columns}, inplace=True) + + # Reset the index otherwise there is a conflict + normalized.reset_index(drop=True, inplace=True) + + json_normalized.append(normalized) + + # Remove from the list of remaining columns + remaining_columns.remove(j_column) + + # Also need to reset the original index + df_in.reset_index(drop=True, inplace=True) + + df_normalized = pd.concat([df_in[remaining_columns]] + json_normalized, axis=1) + + if (convert_to_cudf): + return cudf.from_pandas(df_normalized) + + return df_normalized + + +def _filter_rows(df_in: pd.DataFrame, input_schema: DataFrameInputSchema): + + if (input_schema.row_filter is None): + return df_in + + return input_schema.row_filter(df_in) + + +def process_dataframe(df_in: pd.DataFrame, input_schema: DataFrameInputSchema): + + # Step 1 is to normalize any columns + df_processed = _normalize_dataframe(df_in, input_schema) + + # Step 2 is to process columns + df_processed = _process_columns(df_processed, input_schema) + + # Step 3 is to run the row filter if needed + df_processed = _filter_rows(df_processed, input_schema) + + return df_processed diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/utils/file_utils.py b/examples/digital_fingerprinting/production/morpheus/dfp/utils/file_utils.py new file mode 100644 index 0000000000..ea07771033 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp/utils/file_utils.py @@ -0,0 +1,51 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
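Taken together, the pieces above let a declarative schema drive all of the per-column transformations. The following is a minimal, self-contained sketch of `process_dataframe` in action; the column names, nested JSON field names, and values are illustrative only.

from datetime import datetime

import pandas as pd

from dfp.utils.column_info import BoolColumn
from dfp.utils.column_info import DataFrameInputSchema
from dfp.utils.column_info import DateTimeColumn
from dfp.utils.column_info import RenameColumn
from dfp.utils.column_info import process_dataframe

# Raw logs with a nested JSON column named "properties" (values are illustrative)
raw_df = pd.DataFrame({
    "time": ["2022-08-01T00:00:00Z", "2022-08-01T00:05:00Z"],
    "properties": [{"userPrincipalName": "alice", "success": "true"},
                   {"userPrincipalName": "bob", "success": "false"}],
})

schema = DataFrameInputSchema(
    # Step 1: flatten the nested column into "properties.*" columns
    json_columns=["properties"],
    # Step 2: declare the output columns and how each one is derived
    column_info=[
        DateTimeColumn(name="timestamp", dtype=datetime, input_name="time"),
        RenameColumn(name="username", dtype=str, input_name="properties.userPrincipalName"),
        BoolColumn(name="success", dtype=bool, input_name="properties.success",
                   true_value="true", false_value="false"),
    ])

out_df = process_dataframe(raw_df, schema)
# out_df now has exactly three columns: timestamp (datetime), username (str), success (bool)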
+ +import re +from datetime import datetime +from datetime import timezone + +import fsspec + +iso_date_regex = re.compile(r"(?P\d{4})-(?P\d{1,2})-(?P\d{1,2})" + r"T(?P\d{1,2}):(?P\d{1,2}):(?P\d{1,2})(?P\.\d{1,6})?Z") + + +def date_extractor(file_object: fsspec.core.OpenFile, filename_regex: re.Pattern): + + assert isinstance(file_object, fsspec.core.OpenFile) + + file_path = file_object.path + + # Match regex with the pathname since that can be more accurate + match = filename_regex.search(file_path) + + if (match): + # Convert the regex match + groups = match.groupdict() + + if ("microsecond" in groups): + groups["microsecond"] = int(float(groups["microsecond"]) * 1000000) + + groups = {key: int(value) for key, value in groups.items()} + + groups["tzinfo"] = timezone.utc + + ts_object = datetime(**groups) + else: + # Otherwise, fallback to the file modified (created?) time + ts_object = file_object.fs.modified(file_object.path) + + return ts_object diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/utils/logging_timer.py b/examples/digital_fingerprinting/production/morpheus/dfp/utils/logging_timer.py new file mode 100644 index 0000000000..f673dfe2a7 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp/utils/logging_timer.py @@ -0,0 +1,64 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import dataclasses +import typing +import warnings +from contextlib import contextmanager + + +@dataclasses.dataclass +class LogTimeInfo(): + log_fn: typing.Callable + msg: str + args: typing.Tuple + kwargs: typing.Dict + disabled: bool = False + + def disable(self): + self.disabled = True + + def set_log(self, msg: str, *args, **kwargs): + self.msg = msg + self.args = args + self.kwargs = kwargs + + def _do_log_message(self, duration_ms: float): + + if (self.disabled): + return + + if (self.msg is None): + warnings.warn("Must set log msg before end of context! Skipping log") + return + + # Call the log function + self.log_fn(self.msg.format(**{"duration": duration_ms}), *self.args, **self.kwargs) + + +@contextmanager +def log_time(log_fn, msg: str = None, *args, **kwargs): + + # Create an info object to allow users to set the message in the context block + info = LogTimeInfo(log_fn=log_fn, msg=msg, args=args, kwargs=kwargs) + + import time + + start_time = time.time() + + yield info + + duration = (time.time() - start_time) * 1000.0 + + info._do_log_message(duration) diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/utils/model_cache.py b/examples/digital_fingerprinting/production/morpheus/dfp/utils/model_cache.py new file mode 100644 index 0000000000..efffe169a4 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp/utils/model_cache.py @@ -0,0 +1,324 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
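The `log_time` helper defined above defers the log message until the block exits and substitutes the elapsed milliseconds for `{duration}`; any extra positional arguments are handed to the logging call unchanged. A small usage sketch follows (the timed work and logger name are illustrative).

import logging

from dfp.utils.logging_timer import log_time

logger = logging.getLogger("morpheus.example")


def load_rows():
    # Stand-in for any work worth timing
    return list(range(1000))


with log_time(logger.debug) as timer:
    rows = load_rows()

    # Only {duration} is substituted via format(); other values flow through the
    # normal logging-style arguments
    timer.set_log("Loaded %d rows in {duration:.2f} ms", len(rows))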
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import threading +import typing +from contextlib import contextmanager +from datetime import datetime + +import mlflow +from dfencoder import AutoEncoder +from mlflow.entities.model_registry import RegisteredModel +from mlflow.exceptions import MlflowException +from mlflow.store.entities.paged_list import PagedList +from mlflow.tracking.client import MlflowClient + +from .logging_timer import log_time + +logger = logging.getLogger("morpheus.{}".format(__name__)) + + +@contextmanager +def timed_acquire(lock: threading.Lock, timeout: float): + result = lock.acquire(timeout=timeout) + + if (not result): + # Did not get the lock. Raise + raise TimeoutError() + + # Got the lock + try: + yield + + finally: + lock.release() + + +class ModelCache: + + def __init__(self, reg_model_name: str, reg_model_version: str, model_uri: str) -> None: + + self._reg_model_name = reg_model_name + self._reg_model_version = reg_model_version + self._model_uri = model_uri + + self._last_checked: datetime = datetime.now() + self._last_used: datetime = self._last_checked + + self._lock = threading.Lock() + self._model: AutoEncoder = None + + @property + def reg_model_name(self): + return self._reg_model_name + + @property + def reg_model_version(self): + return self._reg_model_version + + @property + def model_uri(self): + return self._model_uri + + @property + def last_used(self): + return self._last_used + + @property + def last_checked(self): + return self._last_checked + + def load_model(self, client) -> AutoEncoder: + + now = datetime.now() + + # Ensure multiple people do not try to load at the same time + with self._lock: + + if (self._model is None): + + # Cache miss. 
Release the lock while we check + try: + with log_time( + logger.debug, + f"Downloaded model '{self.reg_model_name}:{self.reg_model_version}' in {{duration}} ms"): + self._model = mlflow.pytorch.load_model(model_uri=self._model_uri) + + except MlflowException: + logger.error("Error downloading model for URI: %s", self._model_uri, exc_info=True) + raise + + # Update the last time this was used + self._last_used = now + + return self._model + + +class UserModelMap: + + def __init__(self, manager: "ModelManager", user_id: str, fallback_user_ids: typing.List[str]): + + self._manager = manager + self._user_id = user_id + self._fallback_user_ids = fallback_user_ids + self._reg_model_name = manager.user_id_to_model(user_id) + self._last_checked = None + + self._lock = threading.RLock() + self._child_user_model_cache: UserModelMap = None + + def load_model_cache(self, client) -> ModelCache: + + now = datetime.now() + + # Lock to prevent additional access + try: + with timed_acquire(self._lock, timeout=1.0): + + # Check if we have checked before or if we need to check again + if (self._last_checked is None or (now - self._last_checked).seconds < self._manager.cache_timeout_sec): + + # Save the last checked time + self._last_checked = now + + # Try to load from the manager + model_cache = self._manager.load_model_cache(client=client, reg_model_name=self._reg_model_name) + + # If we have a hit, there is nothing else to do + if (model_cache is None and len(self._fallback_user_ids) > 0): + # Our model does not exist, use fallback + self._child_user_model_cache = self._manager.load_user_model_cache( + self._fallback_user_ids[0], fallback_user_ids=self._fallback_user_ids[1:]) + else: + return model_cache + + # See if we have a child cache and use that + if (self._child_user_model_cache is not None): + return self._child_user_model_cache.load_model_cache(client=client) + + # Otherwise load the model + model_cache = self._manager.load_model_cache(client=client, reg_model_name=self._reg_model_name) + + if (model_cache is None): + raise RuntimeError("Model was found but now no longer exists. Model: {}".format( + self._reg_model_name)) + + return model_cache + except TimeoutError: + logger.error("Deadlock detected while loading model cache. 
Please report this to the developers.") + raise RuntimeError("Deadlock detected while loading model cache") + + +class ModelManager: + + def __init__(self, model_name_formatter: str) -> None: + self._model_name_formatter = model_name_formatter + + self._user_model_cache: typing.Dict[str, UserModelMap] = {} + + self._model_cache: typing.Dict[str, ModelCache] = {} + self._model_cache_size_max = 100 + + self._cache_timeout_sec = 600 + + self._user_model_cache_lock = threading.RLock() + self._model_cache_lock = threading.RLock() + + self._existing_models: typing.Set[str] = set() + self._existing_models_updated = datetime(1970, 1, 1) + + # Force an update of the existing models + self._model_exists("") + + @property + def cache_timeout_sec(self): + return self._cache_timeout_sec + + def _model_exists(self, reg_model_name: str) -> bool: + + now = datetime.now() + + # See if the list of models needs to be updated + if ((now - self._existing_models_updated).seconds > self._cache_timeout_sec): + + try: + with timed_acquire(self._model_cache_lock, timeout=1.0): + + logger.debug("Updating list of available models...") + client = MlflowClient() + + results: PagedList[RegisteredModel] = PagedList([], token=None) + + # clear the set to hanfle the case where a model has been removed + self._existing_models.clear() + + # Loop over the registered models with the pagination + while ((results := client.list_registered_models(max_results=1000, page_token=results.token)) + is not None): + + self._existing_models.update(model.name for model in results) + + if (len(results.token) == 0): + break + + logger.debug("Updating list of available models... Done.") + + # Save the update time + self._existing_models_updated = now + + except TimeoutError: + logger.error("Deadlock detected checking for new models. Please report this to the developers.") + raise RuntimeError("Deadlock detected checking for new models") + except Exception: + logger.exception("Exception occurred when querying the list of available models", exc_info=True) + raise + + return reg_model_name in self._existing_models + + def user_id_to_model(self, user_id: str): + return self._model_name_formatter.format(user_id=user_id) + + def load_user_model(self, client, user_id: str, fallback_user_ids: typing.List[str] = []) -> ModelCache: + + # First get the UserModel + user_model_cache = self.load_user_model_cache(user_id=user_id, fallback_user_ids=fallback_user_ids) + + return user_model_cache.load_model_cache(client=client) + + def load_model_cache(self, client: MlflowClient, reg_model_name: str) -> ModelCache: + + now = datetime.now() + + try: + with timed_acquire(self._model_cache_lock, timeout=1.0): + + model_cache = self._model_cache.get(reg_model_name, None) + + # Make sure it hasnt been too long since we checked + if (model_cache is not None and (now - model_cache.last_checked).seconds < self._cache_timeout_sec): + + return model_cache + + # Cache miss. Try to check for a model + try: + if (not self._model_exists(reg_model_name)): + # Break early + return None + + latest_versions = client.get_latest_versions(reg_model_name) + + if (len(latest_versions) == 0): + # Databricks doesnt like the `get_latest_versions` method for some reason. 
Before failing, try + # to just get the model and then use latest versions + reg_model_obj = client.get_registered_model(reg_model_name) + + latest_versions = None if reg_model_obj is None else reg_model_obj.latest_versions + + if (len(latest_versions) == 0): + logger.warning( + ("Registered model with no versions detected. Consider deleting this registered model." + "Using fallback model. Model: %s, "), + reg_model_name) + return None + + # Default to the first returned one + latest_model_version = latest_versions[0] + + if (len(latest_versions) > 1): + logger.warning(("Multiple models in different stages detected. " + "Defaulting to first returned. Model: %s, Version: %s, Stage: %s"), + reg_model_name, + latest_model_version.version, + latest_model_version.current_stage) + + model_cache = ModelCache(reg_model_name=reg_model_name, + reg_model_version=latest_model_version.version, + model_uri=latest_model_version.source) + + except MlflowException as e: + if e.error_code == 'RESOURCE_DOES_NOT_EXIST': + # No user found + return None + + raise + + # Save the cache + self._model_cache[reg_model_name] = model_cache + + # Check if we need to push out a cache entry + if (len(self._model_cache) > self._model_cache_size_max): + time_sorted = sorted([(k, v) for k, v in self._model_cache.items()], key=lambda x: x[1].last_used) + to_delete = time_sorted[0][0] + self._model_cache.pop(to_delete) + + return model_cache + + except TimeoutError: + logger.error("Deadlock when trying to acquire model cache lock") + raise RuntimeError("Deadlock when trying to acquire model cache lock") + + def load_user_model_cache(self, user_id: str, fallback_user_ids: typing.List[str] = []) -> UserModelMap: + try: + with timed_acquire(self._user_model_cache_lock, timeout=1.0): + + if (user_id not in self._user_model_cache): + self._user_model_cache[user_id] = UserModelMap(manager=self, + user_id=user_id, + fallback_user_ids=fallback_user_ids) + + return self._user_model_cache[user_id] + except TimeoutError: + logger.error("Deadlock when trying to acquire user model cache lock") + raise RuntimeError("Deadlock when trying to acquire user model cache lock") diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/utils/user_model_manager.py b/examples/digital_fingerprinting/production/morpheus/dfp/utils/user_model_manager.py new file mode 100644 index 0000000000..dc766e7d54 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp/utils/user_model_manager.py @@ -0,0 +1,251 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
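`ModelManager` above maps a user ID to a registered MLflow model name through `model_name_formatter`, caches downloads, and walks the fallback list when no per-user model exists. A usage sketch, assuming a reachable MLflow tracking server and purely illustrative model and user names:

from mlflow.tracking.client import MlflowClient

from dfp.utils.model_cache import ModelManager

client = MlflowClient()

# "DFP-azure-{user_id}" must match the name used when the models were registered
manager = ModelManager(model_name_formatter="DFP-azure-{user_id}")

# Look up the user's model, falling back to the generic model if none exists
model_cache = manager.load_user_model(client,
                                      user_id="alice@example.com",
                                      fallback_user_ids=["generic_user"])

if (model_cache is not None):
    # Downloads the MLflow pytorch model on first use and reuses it afterwards
    model = model_cache.load_model(client)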
+ +import logging +import typing + +import numpy as np +import pandas as pd +import torch +from dfencoder import AutoEncoder +from tqdm import tqdm + +from morpheus.config import Config + +logger = logging.getLogger("morpheus.{}".format(__name__)) + + +class DFPDataLoader: + + def __init__(self, batch_frames, filter_func, max_rows_per_batch=50000): + self._aggregate_cache = None + self._batch_frames = batch_frames + self._current_index = 0 + self._filter_func = filter_func + self._frame_count = len(self._batch_frames) + self._max_rows_per_batch = max_rows_per_batch + self._sample_frame = None + + def reset(self): + self._current_index = 0 + + def get_sample_frame(self): + return self._sample_frame + + def get_next_frame(self): + if (self._current_index == self._frame_count): + return None + + if (self._aggregate_cache is not None): + self._current_index = self._frame_count + return self._aggregate_cache + + total_frames = 0 + aggregate_rows = 0 + aggregate_frame = pd.DataFrame() + while (True): + df_frame = self._filter_func(pd.read_pickle(self._batch_frames[self._current_index])) + + # Save the first row and the last row from every batch. Helps with statistics down the line + if (self._sample_frame is None): + self._sample_frame = df_frame.head(1) + + self._sample_frame = self._sample_frame.append(df_frame.tail(1)) + + rows = df_frame.shape[0] + + if (aggregate_rows + rows < self._max_rows_per_batch): + aggregate_frame = pd.concat([aggregate_frame, df_frame]) + aggregate_rows += rows + total_frames += 1 + + self._current_index = min((self._current_index + 1), self._frame_count) + else: # Adding another frame would exceed our memory limit, return + if (total_frames == self._frame_count): + logger.debug("Caching full training set.") + self._aggregate_cache = aggregate_frame + + return aggregate_frame + + if (self._current_index != self._frame_count): + continue + + # Epoch rolled, return what we have + if (total_frames == self._frame_count): + logger.debug("Caching full training set.") + self._aggregate_cache = aggregate_frame + + return aggregate_frame + + +class InsufficientDataError(RuntimeError): + pass + + +class UserModelManager(object): + + def __init__(self, + c: Config, + user_id: str, + save_model: bool, + epochs: int, + min_history: int, + max_history: int, + seed: int = None, + batch_files: typing.List = [], + model_class=AutoEncoder) -> None: + super().__init__() + + self._user_id = user_id + self._history: pd.DataFrame = None + self._min_history: int = min_history + self._max_history: int = max_history + self._seed: int = seed + self._feature_columns = c.ae.feature_columns + self._epochs = epochs + self._save_model = save_model + self._model_class = model_class + self._batch_files = batch_files + + self._model: AutoEncoder = None + + self._last_train_count = 0 + + @property + def model(self): + return self._model + + def train_from_batch(self, filter_func=lambda df: df): + if (not self._batch_files): + return None + + # If the seed is set, enforce that here + if (self._seed is not None): + torch.manual_seed(self._seed) + torch.cuda.manual_seed(self._seed) + np.random.seed(self._seed) + torch.backends.cudnn.deterministic = True + + model = self._model_class( + encoder_layers=[512, 500], # layers of the encoding part + decoder_layers=[512], # layers of the decoding part + activation='relu', # activation function + swap_p=0.2, # noise parameter + lr=0.001, # learning rate + lr_decay=.99, # learning decay + batch_size=512, + # logger='ipynb', + verbose=False, + optimizer='sgd', # 
SGD optimizer is selected(Stochastic gradient descent) + scaler='standard', # feature scaling method + min_cats=1, # cut off for minority categories + progress_bar=False, + device="cuda") + + # Loop each epoch + logger.debug("Training AE model for user: '%s'...", self._user_id) + loader = DFPDataLoader(self._batch_files, filter_func) + try: + for _ in tqdm(range(self._epochs), desc="Training"): + batches = 0 + while (True): + df_batch = loader.get_next_frame() + if (df_batch is None): + break + + if (batches == 0 and (df_batch.shape[0] < self._min_history)): + raise InsufficientDataError("Insuffient training data.") + + if (df_batch.shape[0] < 10): # If we've already trained on some data, make sure we can tts this. + break + + model.fit(df_batch) + batches += 1 + + loader.reset() + + if (self._save_model): + self._model = model + + logger.debug("Training AE model for user: '%s'... Complete.", self._user_id) + + return model, loader.get_sample_frame() + except InsufficientDataError: + logger.debug(f"Training AE model for user: '{self._user_id}... Skipped") + return None, None + except Exception: + logger.exception("Error during training for user: %s", self._user_id, exc_info=True) + return None, None + + def train(self, df: pd.DataFrame) -> AutoEncoder: + + # Determine how much history to save + if (self._history is not None): + if (self._max_history > 0): + to_drop = max(len(df) + len(self._history) - self._max_history, 0) + else: + to_drop = 0 + + history = self._history.iloc[to_drop:, :] + + combined_df = pd.concat([history, df]) + else: + combined_df = df + + # Save the history for next time + if (self._max_history > 0): + self._history = combined_df.iloc[max(0, len(combined_df) - self._max_history):, :] + else: + self._history = combined_df + + # Ensure we have enough data + if (len(combined_df) < self._last_train_count + self._min_history): + return None + + # If the seed is set, enforce that here + if (self._seed is not None): + torch.manual_seed(self._seed) + torch.cuda.manual_seed(self._seed) + np.random.seed(self._seed) + torch.backends.cudnn.deterministic = True + + model = self._model_class( + encoder_layers=[512, 500], # layers of the encoding part + decoder_layers=[512], # layers of the decoding part + activation='relu', # activation function + swap_p=0.2, # noise parameter + lr=0.001, # learning rate + lr_decay=.99, # learning decay + batch_size=4096, + # logger='ipynb', + verbose=False, + optimizer='sgd', # SGD optimizer is selected(Stochastic gradient descent) + scaler='standard', # feature scaling method + min_cats=1, # cut off for minority categories + progress_bar=False, + device="cuda") + + final_df = combined_df[combined_df.columns.intersection(self._feature_columns)] + + logger.debug("Training AE model for user: '%s'...", self._user_id) + model.fit(final_df, epochs=self._epochs) + logger.debug("Training AE model for user: '%s'... 
Complete.", self._user_id) + + # Save the train count to prevent retrains + self._last_train_count = len(final_df) + + if (self._save_model): + self._model = model + + return model diff --git a/examples/digital_fingerprinting/production/morpheus/dfp_azure_inference.ipynb b/examples/digital_fingerprinting/production/morpheus/dfp_azure_inference.ipynb new file mode 100644 index 0000000000..d8e6a94ab0 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp_azure_inference.ipynb @@ -0,0 +1,453 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2941e94f-db20-44a5-ab87-2cab499825f7", + "metadata": {}, + "source": [ + "# Digital Finger Printing (DFP) with Morpheus - Azure Inference\n", + "## Introduction\n", + "\n", + "In this notebook, we will be building and running a DFP pipeline that performs inference on Azure logs. The goal is to use the pretrained models generated in the Duo Training notebook to generate anomaly scores for each log. These anomaly scores can be used by security teams to detect abnormal behavior when it happens so the proper action can be taken.\n", + "\n", + "
\n", + "Note: For more information on DFP, the Morpheus pipeline, and setup steps to run this notebook, please see the coresponding DFP training materials.\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6c1cb50-74f2-445d-b865-8c22c3b3798b", + "metadata": {}, + "outputs": [], + "source": [ + "# Ensure that the morpheus directory is in the python path. This may not need to be run depending on the environment setup\n", + "import sys\n", + "import os\n", + "sys.path.insert(0, os.path.abspath(\"./morpheus\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "102ce011-3ca3-4f96-a72d-de28fad32003", + "metadata": {}, + "outputs": [], + "source": [ + "import functools\n", + "import logging\n", + "import os\n", + "import typing\n", + "from datetime import datetime\n", + "from functools import partial\n", + "\n", + "import click\n", + "import mlflow\n", + "from dfp.stages.dfp_file_batcher_stage import DFPFileBatcherStage\n", + "from dfp.stages.dfp_file_to_df import DFPFileToDataFrameStage\n", + "from dfp.stages.dfp_inference_stage import DFPInferenceStage\n", + "from dfp.stages.dfp_mlflow_model_writer import DFPMLFlowModelWriterStage\n", + "from dfp.stages.dfp_postprocessing_stage import DFPPostprocessingStage\n", + "from dfp.stages.dfp_preprocessing_stage import DFPPreprocessingStage\n", + "from dfp.stages.dfp_rolling_window_stage import DFPRollingWindowStage\n", + "from dfp.stages.dfp_split_users_stage import DFPSplitUsersStage\n", + "from dfp.stages.dfp_training import DFPTraining\n", + "from dfp.stages.multi_file_source import MultiFileSource\n", + "from dfp.utils.column_info import BoolColumn\n", + "from dfp.utils.column_info import ColumnInfo\n", + "from dfp.utils.column_info import CustomColumn\n", + "from dfp.utils.column_info import DataFrameInputSchema\n", + "from dfp.utils.column_info import DateTimeColumn\n", + "from dfp.utils.column_info import IncrementColumn\n", + "from dfp.utils.column_info import RenameColumn\n", + "from dfp.utils.column_info import StringCatColumn\n", + "from dfp.utils.column_info import create_increment_col\n", + "from dfp.utils.file_utils import date_extractor\n", + "from dfp.utils.file_utils import iso_date_regex\n", + "\n", + "from morpheus._lib.file_types import FileTypes\n", + "from morpheus.cli.utils import get_package_relative_file\n", + "from morpheus.cli.utils import load_labels_file\n", + "from morpheus.config import Config\n", + "from morpheus.config import ConfigAutoEncoder\n", + "from morpheus.config import CppConfig\n", + "from morpheus.pipeline import LinearPipeline\n", + "from morpheus.stages.general.monitor_stage import MonitorStage\n", + "from morpheus.stages.output.write_to_file_stage import WriteToFileStage\n", + "from morpheus.utils.logger import configure_logging\n", + "from morpheus.utils.logger import get_log_levels\n", + "from morpheus.utils.logger import parse_log_level\n", + "\n", + "# Left align all tables\n", + "from IPython.core.display import HTML\n", + "table_css = 'table {align:left;display:block}'\n", + "HTML(''.format(table_css))" + ] + }, + { + "cell_type": "markdown", + "id": "ca38c1b7-ce84-43e0-ac53-280562dc1642", + "metadata": {}, + "source": [ + "## High Level Configuration\n", + "\n", + "The following options significantly alter the functionality of the pipeline. These options are separated from the individual stage options since they may effect more than one stage. 
Additionally, the matching python script to this notebook, `dfp_azure_pipeline.py`, configures these options via command line arguments.\n", + "\n", + "### Options\n", + "\n", + "| Name | Type | Description |\n", + "| --- | --- | :-- |\n", + "| `train_users` | One of `[\"none\"]` | For inference, this option should always be `\"none\"` |\n", + "| `skip_users` | List of strings | Any user in this list will be dropped from the pipeline. Useful for debugging to remove automated accounts with many logs. |\n", + "| `cache_dir` | string | The location to store cached files. To aid with development and reduce bandwidth, the Morpheus pipeline will cache data from several stages of the pipeline. This option configures the location for those caches. |\n", + "| `input_files` | List of strings | List of files to process. Can specificy multiple arguments for multiple files. Also accepts glob (\\*) wildcards and schema prefixes such as `s3://`. For example, to make a local cache of an s3 bucket, use `filecache::s3://mybucket/*`. See `fsspec` documentation for list of possible options. |\n", + "| `model_name_formatter` | string | A format string to use when building the model name. The model name is constructed by calling `model_name_formatter.format(user_id=user_id)`. For example, with `model_name_formatter=\"my_model-{user_id}\"` and a user ID of `\"first:last\"` would result in the model name of `\"my_model-first:last\"`. This should match the value used in `DFPMLFlowModelWriterStage`. Available keyword arguments: `user_id`, `user_md5`. |\n", + "| `experiment_name_formatter` | string | A format string (without the `f`) that will be used when creating an experiment in ML Flow. Available keyword arguments: `user_id`, `user_md5`, `reg_model_name`. |\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ee00703-75c5-46fc-890c-86733da906c4", + "metadata": {}, + "outputs": [], + "source": [ + "# Global options\n", + "train_users = \"none\"\n", + "\n", + "# Enter any users to skip here\n", + "skip_users: typing.List[str] = []\n", + "\n", + "# Location where cache objects will be saved\n", + "cache_dir = \"./.cache/dfp\"\n", + "\n", + "# Input files to read from\n", + "input_files = [\n", + " \"/work/examples/data/dfp/duo/duotest_pt1.json\",\n", + " \"/work/examples/data/dfp/duo/duotest_pt2.json\",\n", + " \"/work/examples/data/dfp/duo/duotest_pt3.json\",\n", + " \"/work/examples/data/dfp/duo/duotest_pt4.json\"\n", + "]\n", + "\n", + "# The format to use for models\n", + "model_name_formatter = \"DFP-azure-{user_id}\"\n", + "\n", + "# === Derived Options ===\n", + "# To include the generic, we must be training all or generic\n", + "include_generic = train_users == \"all\" or train_users == \"generic\"\n", + "\n", + "# To include individual, we must be either training or inferring\n", + "include_individual = train_users != \"generic\"\n", + "\n", + "# None indicates we arent training anything\n", + "is_training = train_users != \"none\"" + ] + }, + { + "cell_type": "markdown", + "id": "1cfc24c9-c85e-4977-a348-692c8f0aceaa", + "metadata": {}, + "source": [ + "### Global Config Object\n", + "Before creating the pipeline, we need to setup logging and set the parameters for the Morpheus config object. This config object is responsible for the following:\n", + " - Indicating whether to use C++ or Python stages\n", + " - C++ stages are not supported for the DFP pipeline. This should always be `False`\n", + " - Setting the number of threads to use in the pipeline. 
Defaults to the thread count of the OS.\n", + " - Sets the feature column names that will be used in model training\n", + " - This option allows extra columns to be used in the pipeline that will not be part of the training algorithm.\n", + " - The final features that the model will be trained on will be an intersection of this list with the log columns.\n", + " - The column name that indicates the user's unique identifier\n", + " - It is required for DFP to have a user ID column\n", + " - The column name that indicates the timestamp for the log\n", + " - It is required for DFP to know when each log occurred" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "01abd537-9162-49dc-8e83-d9465592f1d5", + "metadata": {}, + "outputs": [], + "source": [ + "# Enable the Morpheus logger\n", + "configure_logging(log_level=logging.DEBUG)\n", + "\n", + "config = Config()\n", + "\n", + "CppConfig.set_should_use_cpp(False)\n", + "\n", + "config.num_threads = os.cpu_count()\n", + "\n", + "config.ae = ConfigAutoEncoder()\n", + "\n", + "config.ae.feature_columns = [\n", + " \"appDisplayName\", \"clientAppUsed\", \"deviceDetailbrowser\", \"deviceDetaildisplayName\", \"deviceDetailoperatingSystem\", \"statusfailureReason\", \"appincrement\", \"locincrement\", \"logcount\", \n", + "]\n", + "config.ae.userid_column_name = \"username\"\n", + "config.ae.timestamp_column_name = \"timestamp\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a73a4d53-32b6-4ab8-a5d7-c0104b31c69b", + "metadata": {}, + "outputs": [], + "source": [ + "# Specify the column names to ensure all data is uniform\n", + "source_column_info = [\n", + " DateTimeColumn(name=config.ae.timestamp_column_name, dtype=datetime, input_name=\"time\"),\n", + " RenameColumn(name=config.ae.userid_column_name, dtype=str, input_name=\"properties.userPrincipalName\"),\n", + " RenameColumn(name=\"appDisplayName\", dtype=str, input_name=\"properties.appDisplayName\"),\n", + " ColumnInfo(name=\"category\", dtype=str),\n", + " RenameColumn(name=\"clientAppUsed\", dtype=str, input_name=\"properties.clientAppUsed\"),\n", + " RenameColumn(name=\"deviceDetailbrowser\", dtype=str, input_name=\"properties.deviceDetail.browser\"),\n", + " RenameColumn(name=\"deviceDetaildisplayName\", dtype=str, input_name=\"properties.deviceDetail.displayName\"),\n", + " RenameColumn(name=\"deviceDetailoperatingSystem\",\n", + " dtype=str,\n", + " input_name=\"properties.deviceDetail.operatingSystem\"),\n", + " StringCatColumn(name=\"location\",\n", + " dtype=str,\n", + " input_columns=[\n", + " \"properties.location.city\",\n", + " \"properties.location.countryOrRegion\",\n", + " ],\n", + " sep=\", \"),\n", + " RenameColumn(name=\"statusfailureReason\", dtype=str, input_name=\"properties.status.failureReason\"),\n", + "]\n", + "\n", + "source_schema = DataFrameInputSchema(json_columns=[\"properties\"], column_info=source_column_info)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7a0cb0a-e65a-444a-a06c-a4525d543790", + "metadata": {}, + "outputs": [], + "source": [ + "# Preprocessing schema\n", + "preprocess_column_info = [\n", + " ColumnInfo(name=config.ae.timestamp_column_name, dtype=datetime),\n", + " ColumnInfo(name=config.ae.userid_column_name, dtype=str),\n", + " ColumnInfo(name=\"appDisplayName\", dtype=str),\n", + " ColumnInfo(name=\"clientAppUsed\", dtype=str),\n", + " ColumnInfo(name=\"deviceDetailbrowser\", dtype=str),\n", + " ColumnInfo(name=\"deviceDetaildisplayName\", dtype=str),\n", + " 
ColumnInfo(name=\"deviceDetailoperatingSystem\", dtype=str),\n", + " ColumnInfo(name=\"statusfailureReason\", dtype=str),\n", + "\n", + " # Derived columns\n", + " IncrementColumn(name=\"logcount\",\n", + " dtype=int,\n", + " input_name=config.ae.timestamp_column_name,\n", + " groupby_column=config.ae.userid_column_name),\n", + " CustomColumn(name=\"locincrement\",\n", + " dtype=int,\n", + " process_column_fn=partial(create_increment_col, column_name=\"location\")),\n", + " CustomColumn(name=\"appincrement\",\n", + " dtype=int,\n", + " process_column_fn=partial(create_increment_col, column_name=\"appDisplayName\")),\n", + "]\n", + "\n", + "preprocess_schema = DataFrameInputSchema(column_info=preprocess_column_info, preserve_columns=[\"_batch_id\"])\n" + ] + }, + { + "cell_type": "markdown", + "id": "bdfc59de-dea8-4e5f-98e3-98eba1e4621d", + "metadata": {}, + "source": [ + "## Pipeline Construction\n", + "From this point on we begin constructing the stages that will make up the pipeline. To make testing easier, constructing the pipeline object, adding the stages, and running the pipeline, is provided as a single cell. The below cell can be rerun multiple times as needed for debugging.\n", + "\n", + "### Source Stage (`MultiFileSource`)\n", + "\n", + "This pipeline read input logs from one or more input files. This source stage will construct a list of files to be processed and pass to downstream stages. It is capable of reading files from many different source types, both local and remote. This is possible by utilizing the `fsspec` library (similar to `pandas`). See the [`fsspec`](https://filesystem-spec.readthedocs.io/) documentation for more information on the supported file types. Once all of the logs have been read, the source completes. \n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `filenames` | List of strings | | Any files to read into the pipeline. All files will be combined into a single `DataFrame` |\n", + "\n", + "### File Batcher Stage (`DFPFileBatcherStage`)\n", + "\n", + "To improve performance, multiple small input files can be batched together into a single DataFrame for processing. This stage is responsible for determining the timestamp of input files, grouping input files into batches by time, and sending the batches to be processed into a single DataFrame. Batches of files that have been seen before will be loaded from cache resulting in increased performance. For example, when performaing a 60 day training run, 59 days can be cached with a period of `\"D\"` and retraining once per day.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `period` | `str` | `\"D\"` | The period to create batches. See `pandas` windowing frequency documentation for available options. |\n", + "| `date_conversion_func` | Function of `typing.Callable[[fsspec.core.OpenFile], datetime]` | | A callback which is responsible for determining the date for a specified file. |\n", + "\n", + "### File to DataFrame Stage (`DFPFileToDataFrameStage`)\n", + "\n", + "After files have been batched into groups, this stage is responsible for reading the files and converting into a DataFrame. The specified input schema converts the raw DataFrame into one suitable for caching and processing. 
Any columns that are not needed should be excluded from the schema.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `schema` | `DataFrameInputSchema` | | After the raw `DataFrame` is read from each file, this schema will be applied to ensure a consisten output from the source. |\n", + "| `file_type` | `FileTypes` | `FileTypes.Auto` | Allows overriding the file type. When set to `Auto`, the file extension will be used. Options are `CSV`, `JSON`, `Auto`. |\n", + "| `parser_kwargs` | `dict` | `{}` | This dictionary will be passed to the `DataFrame` parser class. Allows for customization of log parsing. |\n", + "| `cache_dir` | `str` | `./.cache/dfp` | The location to write cached input files to. |\n", + "\n", + "### Split Users Stage (`DFPSplitUsersStage`)\n", + "\n", + "Once the input logs have been read into a `DataFrame`, this stage is responsible for breaking that single `DataFrame` with many users into multiple `DataFrame`s for each user. This is also where the pipeline chooses whether to train individual users or the generic user (or both).\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `include_generic` | `bool` | | Whether or not to combine all user logs into a single `DataFrame` with the username 'generic_user' |\n", + "| `include_individual` | `bool` | | Whether or not to output individual `DataFrame` objects for each user |\n", + "| `skip_users` | List of `str` | `[]` | Any users to remove from the `DataFrame`. Useful for debugging to remove automated accounts with many logs. Mutually exclusive with `only_users`. |\n", + "| `only_users` | List of `str` | `[]` | Only allow these users in the final `DataFrame`. Useful for debugging to focus on specific users. Mutually exclusive with `skip_users`. |\n", + "\n", + "### Rolling Window Stage (`DFPRollingWindowStage`)\n", + "\n", + "The Rolling Window Stage performs several key pieces of functionality for DFP.\n", + "1. This stage keeps a moving window of logs on a per user basis\n", + " 1. These logs are saved to disk to reduce memory requirements between logs from the same user\n", + "1. It only emits logs when the window history requirements are met\n", + " 1. Until all of the window history requirements are met, no messages will be sent to the rest of the pipeline.\n", + " 1. See the below options for configuring the window history requirements\n", + "1. It repeats the necessary logs to properly calculate log dependent features.\n", + " 1. To support all column feature types, incoming log messages can be combined with existing history and sent to downstream stages.\n", + " 1. For example, to calculate a feature that increments a counter for the number of logs a particular user has generated in a single day, we must have the user's log history for the past 24 hours. To support this, this stage will combine new logs with existing history into a single `DataFrame`.\n", + " 1. It is the responsibility of downstream stages to distinguish between new logs and existing history.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `min_history` | `int` | `1` | The minimum number of logs a user must have before emitting any messages. Logs below this threshold will be saved to disk. |\n", + "| `min_increment` | `int` or `str` | `0` | Once the min history requirement is met, this stage must receive `min_increment` *new* logs before emmitting another message. 
Logs received before this threshold is met will be saved to disk. Can be specified as an integer count or a string duration. |\n",
+    "| `max_history` | `int` or `str` | `\"1d\"` | Once `min_history` and `min_increment` requirements have been met, this puts an upper bound on the maximum number of messages to forward into the pipeline and also the maximum amount of messages to retain in the history. Can be specified as an integer count or a string duration. |\n",
+    "| `cache_dir` | `str` | `./.cache/dfp` | The location to write log history to disk. |\n",
+    "\n",
+    "### Preprocessing Stage (`DFPPreprocessingStage`)\n",
+    "\n",
+    "This stage performs the final, row-dependent feature calculations as specified by the input schema object. Once calculated, this stage can forward on all received logs, or optionally only forward new logs, removing any history information.\n",
+    "\n",
+    "| Name | Type | Default | Description |\n",
+    "| --- | --- | --- | :-- |\n",
+    "| `input_schema` | `DataFrameInputSchema` | | The final, row-dependent schema to apply to the incoming columns |\n",
+    "| `only_new_batches` | `bool` | | Whether to forward all received logs, or just the new logs. |\n",
+    "\n",
+    "### Inference Stage (`DFPInference`)\n",
+    "\n",
+    "This stage performs several tasks to aid in performing inference. This stage will:\n",
+    "1. Download models as needed from MLflow\n",
+    "1. Cache previously downloaded models to improve performance\n",
+    "    1. Models in the cache will be periodically refreshed from MLflow at a configured rate\n",
+    "1. Perform inference using the downloaded model\n",
+    "\n",
+    "| Name | Type | Default | Description |\n",
+    "| --- | --- | --- | :-- |\n",
+    "| `model_name_formatter` | `str` | `\"\"` | A format string to use when building the model name. The model name is constructed by calling `model_name_formatter.format(user_id=user_id)`. For example, `model_name_formatter=\"my_model-{user_id}\"` with a user ID of `\"first:last\"` would result in the model name `\"my_model-first:last\"`. This should match the value used in `DFPMLFlowModelWriterStage` |\n",
+    "\n",
+    "### Post Processing Stage (`DFPPostprocessingStage`)\n",
+    "\n",
+    "This stage filters the output from the inference stage for any anomalous messages. Logs which exceed the specified Z-score will be passed on to the next stage. All remaining logs below the threshold will be dropped.\n",
+    "\n",
+    "| Name | Type | Default | Description |\n",
+    "| --- | --- | --- | :-- |\n",
+    "| `z_score_threshold` | `float` | `2.0` | The Z-score used to separate anomalous logs from normal logs. All normal logs will be filtered out and anomalous logs will be passed on. |\n",
+    "\n",
+    "### Write to File Stage (`WriteToFileStage`)\n",
+    "\n",
+    "This final stage will write all received messages to a single output file in either CSV or JSON format.\n",
+    "\n",
+    "| Name | Type | Default | Description |\n",
+    "| --- | --- | --- | :-- |\n",
+    "| `filename` | `str` | | The file to write anomalous log messages to. 
|\n", + "| `overwrite` | `bool` | `False` | If the file specified in `filename` already exists, it will be overwritten if this option is set to `True` |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "825390ad-ce64-4949-b324-33039ffdf264", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a linear pipeline object\n", + "pipeline = LinearPipeline(config)\n", + "\n", + "# Source stage\n", + "pipeline.set_source(MultiFileSource(config, filenames=input_files))\n", + "\n", + "# Batch files into buckets by time. Use the default ISO date extractor from the filename\n", + "pipeline.add_stage(\n", + " DFPFileBatcherStage(config,\n", + " period=\"D\",\n", + " date_conversion_func=functools.partial(date_extractor, filename_regex=iso_date_regex)))\n", + "\n", + "# Output is S3 Buckets. Convert to DataFrames. This caches downloaded S3 data\n", + "pipeline.add_stage(\n", + " DFPFileToDataFrameStage(config,\n", + " schema=source_schema,\n", + " file_type=FileTypes.JSON,\n", + " parser_kwargs={\n", + " \"lines\": False, \"orient\": \"records\"\n", + " },\n", + " cache_dir=cache_dir))\n", + "\n", + "\n", + "# This will split users or just use one single user\n", + "pipeline.add_stage(\n", + " DFPSplitUsersStage(config,\n", + " include_generic=include_generic,\n", + " include_individual=include_individual,\n", + " skip_users=skip_users))\n", + "\n", + "# Next, have a stage that will create rolling windows\n", + "pipeline.add_stage(\n", + " DFPRollingWindowStage(\n", + " config,\n", + " min_history=300 if is_training else 1,\n", + " min_increment=300 if is_training else 0,\n", + " # For inference, we only ever want 1 day max\n", + " max_history=\"60d\" if is_training else \"1d\",\n", + " cache_dir=cache_dir))\n", + "\n", + "# Output is UserMessageMeta -- Cached frame set\n", + "pipeline.add_stage(DFPPreprocessingStage(config, input_schema=preprocess_schema, only_new_batches=not is_training))\n", + "\n", + "# Perform inference on the preprocessed data\n", + "pipeline.add_stage(DFPInferenceStage(config, model_name_formatter=model_name_formatter))\n", + "\n", + "# Filter for only the anomalous logs\n", + "pipeline.add_stage(DFPPostprocessingStage(config, z_score_threshold=2.0))\n", + "\n", + "# Write all anomalies to a CSV file\n", + "pipeline.add_stage(WriteToFileStage(config, filename=\"dfp_detections_azure.csv\", overwrite=True))\n", + "\n", + "# Run the pipeline\n", + "await pipeline._do_run()\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:morpheus] *", + "language": "python", + "name": "conda-env-morpheus-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + }, + "vscode": { + "interpreter": { + "hash": "e26783b24f020aa0bcaa00e6ba122db5d0e3da2d892d80be664969895e06a7e1" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/digital_fingerprinting/production/morpheus/dfp_azure_pipeline.py b/examples/digital_fingerprinting/production/morpheus/dfp_azure_pipeline.py new file mode 100644 index 0000000000..4999bad60b --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp_azure_pipeline.py @@ -0,0 +1,293 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
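The `input_files` / `--input_file` values accept `fsspec` URL chaining, which is the same mechanism `MultiFileSource` uses to cache remote objects locally. A short illustration of that pattern outside of the pipeline; the bucket name and prefix are hypothetical:

import fsspec

# "filecache::" keeps a local copy of each matched object so repeated runs
# do not re-download from S3
files = fsspec.open_files("filecache::s3://my-dfp-bucket/AZUREAD/*.json",
                          filecache={"cache_storage": "./.cache/s3tmp"})

print("Matched {} log files".format(len(files)))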
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import logging +import os +import typing +from datetime import datetime +from functools import partial + +import click +import mlflow +from dfp.stages.dfp_file_batcher_stage import DFPFileBatcherStage +from dfp.stages.dfp_file_to_df import DFPFileToDataFrameStage +from dfp.stages.dfp_inference_stage import DFPInferenceStage +from dfp.stages.dfp_mlflow_model_writer import DFPMLFlowModelWriterStage +from dfp.stages.dfp_postprocessing_stage import DFPPostprocessingStage +from dfp.stages.dfp_preprocessing_stage import DFPPreprocessingStage +from dfp.stages.dfp_rolling_window_stage import DFPRollingWindowStage +from dfp.stages.dfp_split_users_stage import DFPSplitUsersStage +from dfp.stages.dfp_training import DFPTraining +from dfp.stages.multi_file_source import MultiFileSource +from dfp.utils.column_info import ColumnInfo +from dfp.utils.column_info import CustomColumn +from dfp.utils.column_info import DataFrameInputSchema +from dfp.utils.column_info import DateTimeColumn +from dfp.utils.column_info import IncrementColumn +from dfp.utils.column_info import RenameColumn +from dfp.utils.column_info import StringCatColumn +from dfp.utils.column_info import create_increment_col +from dfp.utils.file_utils import date_extractor +from dfp.utils.file_utils import iso_date_regex + +from morpheus._lib.file_types import FileTypes +from morpheus.cli.utils import get_package_relative_file +from morpheus.cli.utils import load_labels_file +from morpheus.config import Config +from morpheus.config import ConfigAutoEncoder +from morpheus.config import CppConfig +from morpheus.pipeline import LinearPipeline +from morpheus.stages.general.monitor_stage import MonitorStage +from morpheus.stages.output.write_to_file_stage import WriteToFileStage +from morpheus.utils.logger import configure_logging +from morpheus.utils.logger import get_log_levels +from morpheus.utils.logger import parse_log_level + + +@click.command() +@click.option( + "--train_users", + type=click.Choice(["all", "generic", "individual", "none"], case_sensitive=False), + help="Indicates whether or not to train per user or a generic model for all users", +) +@click.option( + "--skip_user", + multiple=True, + type=str, + help="User IDs to skip. Mutually exclusive with only_user", +) +@click.option( + "--only_user", + multiple=True, + type=str, + help="Only users specified by this option will be included. 
Mutually exclusive with skip_user", +) +@click.option( + "--duration", + type=str, + default="60d", + help="The duration to run starting from now", +) +@click.option( + "--cache_dir", + type=str, + default="./.cache/dfp", + show_envvar=True, + help="The location to cache data such as S3 downloads and pre-processed data", +) +@click.option("--log_level", + default=logging.getLevelName(Config().log_level), + type=click.Choice(get_log_levels(), case_sensitive=False), + callback=parse_log_level, + help="Specify the logging level to use.") +@click.option("--sample_rate_s", + type=int, + default=0, + show_envvar=True, + help="Minimum time step, in milliseconds, between object logs.") +@click.option( + "--input_file", + "-f", + type=str, + multiple=True, + help=("List of files to process. Can specificy multiple arguments for multiple files. " + "Also accepts glob (*) wildcards and schema prefixes such as `s3://`. " + "For example, to make a local cache of an s3 bucket, use `filecache::s3://mybucket/*`. " + "See fsspec documentation for list of possible options."), +) +@click.option('--tracking_uri', + type=str, + default="http://localhost:5000", + help=("The ML Flow tracking URI to connect to the tracking backend. If not speficied, MF Flow will use " + "'file:///mlruns' relative to the current directory")) +def run_pipeline(train_users, + skip_user: typing.Tuple[str], + only_user: typing.Tuple[str], + duration, + cache_dir, + log_level, + sample_rate_s, + **kwargs): + # To include the generic, we must be training all or generic + include_generic = train_users == "all" or train_users == "generic" + + # To include individual, we must be either training or inferring + include_individual = train_users != "generic" + + # None indicates we arent training anything + is_training = train_users != "none" + + skip_users = list(skip_user) + only_users = list(only_user) + + # Enable the Morpheus logger + configure_logging(log_level=log_level) + + if (len(skip_users) > 0 and len(only_users) > 0): + logging.error("Option --skip_user and --only_user are mutually exclusive. 
Exiting") + + logger = logging.getLogger("morpheus.{}".format(__name__)) + + logger.info("Running training pipeline with the following options: ") + logger.info("Train generic_user: %s", include_generic) + logger.info("Skipping users: %s", skip_users) + logger.info("Duration: %s", duration) + logger.info("Cache Dir: %s", cache_dir) + + if ("tracking_uri" in kwargs): + # Initialize ML Flow + mlflow.set_tracking_uri(kwargs["tracking_uri"]) + logger.info("Tracking URI: %s", mlflow.get_tracking_uri()) + + config = Config() + + CppConfig.set_should_use_cpp(False) + + config.num_threads = os.cpu_count() + + config.ae = ConfigAutoEncoder() + + config.ae.feature_columns = load_labels_file(get_package_relative_file("data/columns_ae_azure.txt")) + config.ae.userid_column_name = "username" + config.ae.timestamp_column_name = "timestamp" + + # Specify the column names to ensure all data is uniform + source_column_info = [ + DateTimeColumn(name=config.ae.timestamp_column_name, dtype=datetime, input_name="time"), + RenameColumn(name=config.ae.userid_column_name, dtype=str, input_name="properties.userPrincipalName"), + RenameColumn(name="appDisplayName", dtype=str, input_name="properties.appDisplayName"), + ColumnInfo(name="category", dtype=str), + RenameColumn(name="clientAppUsed", dtype=str, input_name="properties.clientAppUsed"), + RenameColumn(name="deviceDetailbrowser", dtype=str, input_name="properties.deviceDetail.browser"), + RenameColumn(name="deviceDetaildisplayName", dtype=str, input_name="properties.deviceDetail.displayName"), + RenameColumn(name="deviceDetailoperatingSystem", + dtype=str, + input_name="properties.deviceDetail.operatingSystem"), + StringCatColumn(name="location", + dtype=str, + input_columns=[ + "properties.location.city", + "properties.location.countryOrRegion", + ], + sep=", "), + RenameColumn(name="statusfailureReason", dtype=str, input_name="properties.status.failureReason"), + ] + + source_schema = DataFrameInputSchema(json_columns=["properties"], column_info=source_column_info) + + # Preprocessing schema + preprocess_column_info = [ + ColumnInfo(name=config.ae.timestamp_column_name, dtype=datetime), + ColumnInfo(name=config.ae.userid_column_name, dtype=str), + ColumnInfo(name="appDisplayName", dtype=str), + ColumnInfo(name="clientAppUsed", dtype=str), + ColumnInfo(name="deviceDetailbrowser", dtype=str), + ColumnInfo(name="deviceDetaildisplayName", dtype=str), + ColumnInfo(name="deviceDetailoperatingSystem", dtype=str), + ColumnInfo(name="statusfailureReason", dtype=str), + + # Derived columns + IncrementColumn(name="logcount", + dtype=int, + input_name=config.ae.timestamp_column_name, + groupby_column=config.ae.userid_column_name), + CustomColumn(name="locincrement", + dtype=int, + process_column_fn=partial(create_increment_col, column_name="location")), + CustomColumn(name="appincrement", + dtype=int, + process_column_fn=partial(create_increment_col, column_name="appDisplayName")), + ] + + preprocess_schema = DataFrameInputSchema(column_info=preprocess_column_info, preserve_columns=["_batch_id"]) + + # Create a linear pipeline object + pipeline = LinearPipeline(config) + + pipeline.set_source(MultiFileSource(config, filenames=list(kwargs["input_file"]))) + + # Batch files into buckets by time. Use the default ISO date extractor from the filename + pipeline.add_stage( + DFPFileBatcherStage(config, + period="D", + sampling_rate_s=sample_rate_s, + date_conversion_func=functools.partial(date_extractor, filename_regex=iso_date_regex))) + + # Output is S3 Buckets. 
Convert to DataFrames. This caches downloaded S3 data + pipeline.add_stage( + DFPFileToDataFrameStage(config, + schema=source_schema, + file_type=FileTypes.JSON, + parser_kwargs={ + "lines": False, "orient": "records" + }, + cache_dir=cache_dir)) + + pipeline.add_stage(MonitorStage(config, description="Input data rate")) + + # This will split users or just use one single user + pipeline.add_stage( + DFPSplitUsersStage(config, + include_generic=include_generic, + include_individual=include_individual, + skip_users=skip_users, + only_users=only_users)) + + # Next, have a stage that will create rolling windows + pipeline.add_stage( + DFPRollingWindowStage( + config, + min_history=300 if is_training else 1, + min_increment=300 if is_training else 0, + # For inference, we only ever want 1 day max + max_history="60d" if is_training else "1d", + cache_dir=cache_dir)) + + # Output is UserMessageMeta -- Cached frame set + pipeline.add_stage(DFPPreprocessingStage(config, input_schema=preprocess_schema, only_new_batches=not is_training)) + + model_name_formatter = "DFP-azure-{user_id}" + experiment_name_formatter = "dfp/azure/training/{reg_model_name}" + + if (is_training): + + # Finally, perform training which will output a model + pipeline.add_stage(DFPTraining(config)) + + pipeline.add_stage(MonitorStage(config, description="Training rate", smoothing=0.001)) + + # Write that model to MLFlow + pipeline.add_stage( + DFPMLFlowModelWriterStage(config, + model_name_formatter=model_name_formatter, + experiment_name_formatter=experiment_name_formatter)) + else: + # Perform inference on the preprocessed data + pipeline.add_stage(DFPInferenceStage(config, model_name_formatter=model_name_formatter)) + + pipeline.add_stage(MonitorStage(config, description="Inference rate", smoothing=0.001)) + + # Filter for only the anomalous logs + pipeline.add_stage(DFPPostprocessingStage(config, z_score_threshold=2.0)) + + # Write all anomalies to a CSV file + pipeline.add_stage(WriteToFileStage(config, filename="dfp_detections_azure.csv", overwrite=True)) + + # Run the pipeline + pipeline.run() + + +if __name__ == "__main__": + run_pipeline(obj={}, auto_envvar_prefix='DFP', show_default=True, prog_name="dfp") diff --git a/examples/digital_fingerprinting/production/morpheus/dfp_azure_training.ipynb b/examples/digital_fingerprinting/production/morpheus/dfp_azure_training.ipynb new file mode 100644 index 0000000000..30d47c276d --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp_azure_training.ipynb @@ -0,0 +1,444 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2941e94f-db20-44a5-ab87-2cab499825f7", + "metadata": {}, + "source": [ + "# Digital Finger Printing (DFP) with Morpheus - Azure Training\n", + "## Introduction\n", + "\n", + "In this notebook, we will be building and running a DFP pipeline that performs training on Azure logs. The goal is to train an autoencoder PyTorch model to recogize the patterns of users in the sample data. The model will then be used by a second Morpheus pipeline to generate anomaly scores for each individual log. These anomaly scores can be used by security teams to detect abnormal behavior when it happens so the proper action can be taken.\n", + "\n", + "
\n", + "Note: For more information on DFP, the Morpheus pipeline, and setup steps to run this notebook, please see the coresponding DFP training materials.\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6c1cb50-74f2-445d-b865-8c22c3b3798b", + "metadata": {}, + "outputs": [], + "source": [ + "# Ensure that the morpheus directory is in the python path. This may not need to be run depending on the environment setup\n", + "import sys\n", + "import os\n", + "sys.path.insert(0, os.path.abspath(\"./morpheus\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "102ce011-3ca3-4f96-a72d-de28fad32003", + "metadata": {}, + "outputs": [], + "source": [ + "import functools\n", + "import logging\n", + "import os\n", + "import typing\n", + "from datetime import datetime\n", + "from functools import partial\n", + "\n", + "import click\n", + "import mlflow\n", + "from dfp.stages.dfp_file_batcher_stage import DFPFileBatcherStage\n", + "from dfp.stages.dfp_file_to_df import DFPFileToDataFrameStage\n", + "from dfp.stages.dfp_inference_stage import DFPInferenceStage\n", + "from dfp.stages.dfp_mlflow_model_writer import DFPMLFlowModelWriterStage\n", + "from dfp.stages.dfp_postprocessing_stage import DFPPostprocessingStage\n", + "from dfp.stages.dfp_preprocessing_stage import DFPPreprocessingStage\n", + "from dfp.stages.dfp_rolling_window_stage import DFPRollingWindowStage\n", + "from dfp.stages.dfp_split_users_stage import DFPSplitUsersStage\n", + "from dfp.stages.dfp_training import DFPTraining\n", + "from dfp.stages.multi_file_source import MultiFileSource\n", + "from dfp.utils.column_info import BoolColumn\n", + "from dfp.utils.column_info import ColumnInfo\n", + "from dfp.utils.column_info import CustomColumn\n", + "from dfp.utils.column_info import DataFrameInputSchema\n", + "from dfp.utils.column_info import DateTimeColumn\n", + "from dfp.utils.column_info import IncrementColumn\n", + "from dfp.utils.column_info import RenameColumn\n", + "from dfp.utils.column_info import StringCatColumn\n", + "from dfp.utils.column_info import create_increment_col\n", + "from dfp.utils.file_utils import date_extractor\n", + "from dfp.utils.file_utils import iso_date_regex\n", + "\n", + "from morpheus._lib.file_types import FileTypes\n", + "from morpheus.cli.utils import get_package_relative_file\n", + "from morpheus.cli.utils import load_labels_file\n", + "from morpheus.config import Config\n", + "from morpheus.config import ConfigAutoEncoder\n", + "from morpheus.config import CppConfig\n", + "from morpheus.pipeline import LinearPipeline\n", + "from morpheus.stages.general.monitor_stage import MonitorStage\n", + "from morpheus.stages.output.write_to_file_stage import WriteToFileStage\n", + "from morpheus.utils.logger import configure_logging\n", + "from morpheus.utils.logger import get_log_levels\n", + "from morpheus.utils.logger import parse_log_level\n", + "\n", + "# Left align all tables\n", + "from IPython.core.display import HTML\n", + "table_css = 'table {align:left;display:block}'\n", + "HTML(''.format(table_css))" + ] + }, + { + "cell_type": "markdown", + "id": "ca38c1b7-ce84-43e0-ac53-280562dc1642", + "metadata": {}, + "source": [ + "## High Level Configuration\n", + "\n", + "The following options significantly alter the functionality of the pipeline. These options are separated from the individual stage options since they may effect more than one stage. 
Additionally, the matching python script to this notebook, `dfp_azure_pipeline.py`, configures these options via command line arguments.\n", + "\n", + "### Options\n", + "\n", + "| Name | Type | Description |\n", + "| --- | --- | :-- |\n", + "| `train_users` | One of `[\"all\", \"generic\", \"individual\"]` | This indicates which users to train for this pipeline:
  • `\"generic\"`: Combine all users into a single model with the username 'generic_user'. Skips individual users.
  • `\"individual\"`: Trains a separate model for each individual user. Skips 'generic_user'.
  • `\"all\"`: Combination of `\"generic\"` and `\"individual\"`. Both the 'generic_user' and individual users are trained in the same pipeline.
|\n", + "| `skip_users` | List of strings | Any user in this list will be dropped from the pipeline. Useful for debugging to remove automated accounts with many logs. |\n", + "| `cache_dir` | string | The location to store cached files. To aid with development and reduce bandwidth, the Morpheus pipeline will cache data from several stages of the pipeline. This option configures the location for those caches. |\n", + "| `input_files` | List of strings | List of files to process. Can specificy multiple arguments for multiple files. Also accepts glob (\\*) wildcards and schema prefixes such as `s3://`. For example, to make a local cache of an s3 bucket, use `filecache::s3://mybucket/*`. See `fsspec` documentation for list of possible options. |\n", + "| `model_name_formatter` | string | A format string to use when building the model name. The model name is constructed by calling `model_name_formatter.format(user_id=user_id)`. For example, with `model_name_formatter=\"my_model-{user_id}\"` and a user ID of `\"first:last\"` would result in the model name of `\"my_model-first:last\"`. This should match the value used in `DFPMLFlowModelWriterStage`. Available keyword arguments: `user_id`, `user_md5`. |\n", + "| `experiment_name_formatter` | string | A format string (without the `f`) that will be used when creating an experiment in ML Flow. Available keyword arguments: `user_id`, `user_md5`, `reg_model_name`. |\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ee00703-75c5-46fc-890c-86733da906c4", + "metadata": {}, + "outputs": [], + "source": [ + "# Global options\n", + "train_users = \"all\"\n", + "\n", + "# Enter any users to skip here\n", + "skip_users: typing.List[str] = []\n", + "\n", + "# Location where cache objects will be saved\n", + "cache_dir = \"./.cache/dfp\"\n", + "\n", + "# Input files to read from\n", + "input_files = [\n", + " \"/work/examples/data/dfp/duo/duotest_pt1.json\",\n", + " \"/work/examples/data/dfp/duo/duotest_pt2.json\",\n", + " \"/work/examples/data/dfp/duo/duotest_pt3.json\",\n", + " \"/work/examples/data/dfp/duo/duotest_pt4.json\"\n", + "]\n", + "\n", + "# The format to use for models\n", + "model_name_formatter = \"DFP-azure-{user_id}\"\n", + "\n", + "# The format to use for experiment names\n", + "experiment_name_formatter = \"dfp/azure/training/{reg_model_name}\"\n", + "\n", + "# === Derived Options ===\n", + "# To include the generic, we must be training all or generic\n", + "include_generic = train_users == \"all\" or train_users == \"generic\"\n", + "\n", + "# To include individual, we must be either training or inferring\n", + "include_individual = train_users != \"generic\"\n", + "\n", + "# None indicates we arent training anything\n", + "is_training = train_users != \"none\"" + ] + }, + { + "cell_type": "markdown", + "id": "1cfc24c9-c85e-4977-a348-692c8f0aceaa", + "metadata": {}, + "source": [ + "### Global Config Object\n", + "Before creating the pipeline, we need to setup logging and set the parameters for the Morpheus config object. This config object is responsible for the following:\n", + " - Indicating whether to use C++ or Python stages\n", + " - C++ stages are not supported for the DFP pipeline. This should always be `False`\n", + " - Setting the number of threads to use in the pipeline. 
Defaults to the thread count of the OS.\n", + " - Sets the feature column names that will be used in model training\n", + " - This option allows extra columns to be used in the pipeline that will not be part of the training algorithm.\n", + " - The final features that the model will be trained on will be an intersection of this list with the log columns.\n", + " - The column name that indicates the user's unique identifier\n", + " - It is required for DFP to have a user ID column\n", + " - The column name that indicates the timestamp for the log\n", + " - It is required for DFP to know when each log occurred" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "01abd537-9162-49dc-8e83-d9465592f1d5", + "metadata": {}, + "outputs": [], + "source": [ + "# Enable the Morpheus logger\n", + "configure_logging(log_level=logging.DEBUG)\n", + "\n", + "config = Config()\n", + "\n", + "CppConfig.set_should_use_cpp(False)\n", + "\n", + "config.num_threads = os.cpu_count()\n", + "\n", + "config.ae = ConfigAutoEncoder()\n", + "\n", + "config.ae.feature_columns = [\n", + " \"appDisplayName\", \"clientAppUsed\", \"deviceDetailbrowser\", \"deviceDetaildisplayName\", \"deviceDetailoperatingSystem\", \"statusfailureReason\", \"appincrement\", \"locincrement\", \"logcount\", \n", + "]\n", + "config.ae.userid_column_name = \"username\"\n", + "config.ae.timestamp_column_name = \"timestamp\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a73a4d53-32b6-4ab8-a5d7-c0104b31c69b", + "metadata": {}, + "outputs": [], + "source": [ + "# Specify the column names to ensure all data is uniform\n", + "source_column_info = [\n", + " DateTimeColumn(name=config.ae.timestamp_column_name, dtype=datetime, input_name=\"time\"),\n", + " RenameColumn(name=config.ae.userid_column_name, dtype=str, input_name=\"properties.userPrincipalName\"),\n", + " RenameColumn(name=\"appDisplayName\", dtype=str, input_name=\"properties.appDisplayName\"),\n", + " ColumnInfo(name=\"category\", dtype=str),\n", + " RenameColumn(name=\"clientAppUsed\", dtype=str, input_name=\"properties.clientAppUsed\"),\n", + " RenameColumn(name=\"deviceDetailbrowser\", dtype=str, input_name=\"properties.deviceDetail.browser\"),\n", + " RenameColumn(name=\"deviceDetaildisplayName\", dtype=str, input_name=\"properties.deviceDetail.displayName\"),\n", + " RenameColumn(name=\"deviceDetailoperatingSystem\",\n", + " dtype=str,\n", + " input_name=\"properties.deviceDetail.operatingSystem\"),\n", + " StringCatColumn(name=\"location\",\n", + " dtype=str,\n", + " input_columns=[\n", + " \"properties.location.city\",\n", + " \"properties.location.countryOrRegion\",\n", + " ],\n", + " sep=\", \"),\n", + " RenameColumn(name=\"statusfailureReason\", dtype=str, input_name=\"properties.status.failureReason\"),\n", + "]\n", + "\n", + "source_schema = DataFrameInputSchema(json_columns=[\"properties\"], column_info=source_column_info)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7a0cb0a-e65a-444a-a06c-a4525d543790", + "metadata": {}, + "outputs": [], + "source": [ + "# Preprocessing schema\n", + "preprocess_column_info = [\n", + " ColumnInfo(name=config.ae.timestamp_column_name, dtype=datetime),\n", + " ColumnInfo(name=config.ae.userid_column_name, dtype=str),\n", + " ColumnInfo(name=\"appDisplayName\", dtype=str),\n", + " ColumnInfo(name=\"clientAppUsed\", dtype=str),\n", + " ColumnInfo(name=\"deviceDetailbrowser\", dtype=str),\n", + " ColumnInfo(name=\"deviceDetaildisplayName\", dtype=str),\n", + " 
ColumnInfo(name=\"deviceDetailoperatingSystem\", dtype=str),\n", + " ColumnInfo(name=\"statusfailureReason\", dtype=str),\n", + "\n", + " # Derived columns\n", + " IncrementColumn(name=\"logcount\",\n", + " dtype=int,\n", + " input_name=config.ae.timestamp_column_name,\n", + " groupby_column=config.ae.userid_column_name),\n", + " CustomColumn(name=\"locincrement\",\n", + " dtype=int,\n", + " process_column_fn=partial(create_increment_col, column_name=\"location\")),\n", + " CustomColumn(name=\"appincrement\",\n", + " dtype=int,\n", + " process_column_fn=partial(create_increment_col, column_name=\"appDisplayName\")),\n", + "]\n", + "\n", + "preprocess_schema = DataFrameInputSchema(column_info=preprocess_column_info, preserve_columns=[\"_batch_id\"])\n" + ] + }, + { + "cell_type": "markdown", + "id": "bdfc59de-dea8-4e5f-98e3-98eba1e4621d", + "metadata": {}, + "source": [ + "## Pipeline Construction\n", + "From this point on we begin constructing the stages that will make up the pipeline. To make testing easier, constructing the pipeline object, adding the stages, and running the pipeline, is provided as a single cell. The below cell can be rerun multiple times as needed for debugging.\n", + "\n", + "### Source Stage (`MultiFileSource`)\n", + "\n", + "This pipeline read input logs from one or more input files. This source stage will construct a list of files to be processed and pass to downstream stages. It is capable of reading files from many different source types, both local and remote. This is possible by utilizing the `fsspec` library (similar to `pandas`). See the [`fsspec`](https://filesystem-spec.readthedocs.io/) documentation for more information on the supported file types. Once all of the logs have been read, the source completes. \n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `filenames` | List of strings | | Any files to read into the pipeline. All files will be combined into a single `DataFrame` |\n", + "\n", + "### File Batcher Stage (`DFPFileBatcherStage`)\n", + "\n", + "To improve performance, multiple small input files can be batched together into a single DataFrame for processing. This stage is responsible for determining the timestamp of input files, grouping input files into batches by time, and sending the batches to be processed into a single DataFrame. Batches of files that have been seen before will be loaded from cache resulting in increased performance. For example, when performaing a 60 day training run, 59 days can be cached with a period of `\"D\"` and retraining once per day.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `period` | `str` | `\"D\"` | The period to create batches. See `pandas` windowing frequency documentation for available options. |\n", + "| `date_conversion_func` | Function of `typing.Callable[[fsspec.core.OpenFile], datetime]` | | A callback which is responsible for determining the date for a specified file. |\n", + "\n", + "### File to DataFrame Stage (`DFPFileToDataFrameStage`)\n", + "\n", + "After files have been batched into groups, this stage is responsible for reading the files and converting into a DataFrame. The specified input schema converts the raw DataFrame into one suitable for caching and processing. 
Any columns that are not needed should be excluded from the schema.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `schema` | `DataFrameInputSchema` | | After the raw `DataFrame` is read from each file, this schema will be applied to ensure a consisten output from the source. |\n", + "| `file_type` | `FileTypes` | `FileTypes.Auto` | Allows overriding the file type. When set to `Auto`, the file extension will be used. Options are `CSV`, `JSON`, `Auto`. |\n", + "| `parser_kwargs` | `dict` | `{}` | This dictionary will be passed to the `DataFrame` parser class. Allows for customization of log parsing. |\n", + "| `cache_dir` | `str` | `./.cache/dfp` | The location to write cached input files to. |\n", + "\n", + "### Split Users Stage (`DFPSplitUsersStage`)\n", + "\n", + "Once the input logs have been read into a `DataFrame`, this stage is responsible for breaking that single `DataFrame` with many users into multiple `DataFrame`s for each user. This is also where the pipeline chooses whether to train individual users or the generic user (or both).\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `include_generic` | `bool` | | Whether or not to combine all user logs into a single `DataFrame` with the username 'generic_user' |\n", + "| `include_individual` | `bool` | | Whether or not to output individual `DataFrame` objects for each user |\n", + "| `skip_users` | List of `str` | `[]` | Any users to remove from the `DataFrame`. Useful for debugging to remove automated accounts with many logs. Mutually exclusive with `only_users`. |\n", + "| `only_users` | List of `str` | `[]` | Only allow these users in the final `DataFrame`. Useful for debugging to focus on specific users. Mutually exclusive with `skip_users`. |\n", + "\n", + "### Rolling Window Stage (`DFPRollingWindowStage`)\n", + "\n", + "The Rolling Window Stage performs several key pieces of functionality for DFP.\n", + "1. This stage keeps a moving window of logs on a per user basis\n", + " 1. These logs are saved to disk to reduce memory requirements between logs from the same user\n", + "1. It only emits logs when the window history requirements are met\n", + " 1. Until all of the window history requirements are met, no messages will be sent to the rest of the pipeline.\n", + " 1. See the below options for configuring the window history requirements\n", + "1. It repeats the necessary logs to properly calculate log dependent features.\n", + " 1. To support all column feature types, incoming log messages can be combined with existing history and sent to downstream stages.\n", + " 1. For example, to calculate a feature that increments a counter for the number of logs a particular user has generated in a single day, we must have the user's log history for the past 24 hours. To support this, this stage will combine new logs with existing history into a single `DataFrame`.\n", + " 1. It is the responsibility of downstream stages to distinguish between new logs and existing history.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `min_history` | `int` | `300` | The minimum number of logs a user must have before emitting any messages. Logs below this threshold will be saved to disk. |\n", + "| `min_increment` | `int` or `str` | `300` | Once the min history requirement is met, this stage must receive `min_increment` *new* logs before emmitting another message. 
Logs received before this threshold is met will be saved to disk. Can be specified as an integer count or a string duration. |\n", + "| `max_history` | `int` or `str` | `\"60d\"` | Once `min_history` and `min_increment` requirements have been met, this puts an upper bound on the maximum number of messages to forward into the pipeline and also the maximum amount of messages to retain in the history. Can be specified as an integer count or a string duration. |\n", + "| `cache_dir` | `str` | `./.cache/dfp` | The location to write log history to disk. |\n", + "\n", + "### Preprocessing Stage (`DFPPreprocessingStage`)\n", + "\n", + "This stage performs the final, row dependent, feature calculations as specified by the input schema object. Once calculated, this stage can forward on all received logs, or optionally can only forward on new logs, removing any history information.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `input_schema` | `DataFrameInputSchema` | | The final, row dependent, schema to apply to the incoming columns |\n", + "| `only_new_batches` | `bool` | | Whether or not to foward on all received logs, or just new logs. |\n", + "\n", + "### Training Stage (`DFPTraining`)\n", + "\n", + "This stage is responsible for performing the actual training calculations. Training will be performed on all received data. Resulting message will contain the input data paired with the trained model.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `model_kwargs` | `dict` | `{}` | The options to use when creating a new model instance. See `DFPAutoEncoder` for information on the available options. |\n", + "\n", + "### MLFlow Model Writer Stage (`DFPMLFlowModelWriterStage`)\n", + "\n", + "This stage is the last step in training. It will upload the trained model from the previous stage to MLFlow. The tracking URI for which MLFlow instance to use is configured using the static method `mlflow.set_tracking_uri()`.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `model_name_formatter` | `str` | `\"\"` | A format string to use when building the model name. The model name is constructed by calling `model_name_formatter.format(user_id=user_id)`. For example, with `model_name_formatter=\"my_model-{user_id}\"` and a user ID of `\"first:last\"` would result in the model name of `\"my_model-first:last\"` |\n", + "| `experiment_name` | `str` | | All models are created inside of an experiment to allow metrics to be saved with each model. This option specifies the experiment name. The final experiment name for each model will be in the form of `{experiment_name}/{model_name}` |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "825390ad-ce64-4949-b324-33039ffdf264", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a linear pipeline object\n", + "pipeline = LinearPipeline(config)\n", + "\n", + "# Source stage\n", + "pipeline.set_source(MultiFileSource(config, filenames=input_files))\n", + "\n", + "# Batch files into buckets by time. Use the default ISO date extractor from the filename\n", + "pipeline.add_stage(\n", + " DFPFileBatcherStage(config,\n", + " period=\"D\",\n", + " date_conversion_func=functools.partial(date_extractor, filename_regex=iso_date_regex)))\n", + "\n", + "# Output is S3 Buckets. Convert to DataFrames. 
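
# As described in the model writer section above, each registered model name is
# built with model_name_formatter.format(user_id=...), and the experiment name is
# derived from experiment_name_formatter. For example (user ID chosen arbitrarily):
#
#   "DFP-azure-{user_id}".format(user_id="generic_user")
#       -> "DFP-azure-generic_user"
#   "dfp/azure/training/{reg_model_name}".format(reg_model_name="DFP-azure-generic_user")
#       -> "dfp/azure/training/DFP-azure-generic_user"
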
This caches downloaded S3 data\n", + "pipeline.add_stage(\n", + " DFPFileToDataFrameStage(config,\n", + " schema=source_schema,\n", + " file_type=FileTypes.JSON,\n", + " parser_kwargs={\n", + " \"lines\": False, \"orient\": \"records\"\n", + " },\n", + " cache_dir=cache_dir))\n", + "\n", + "\n", + "# This will split users or just use one single user\n", + "pipeline.add_stage(\n", + " DFPSplitUsersStage(config,\n", + " include_generic=include_generic,\n", + " include_individual=include_individual,\n", + " skip_users=skip_users))\n", + "\n", + "# Next, have a stage that will create rolling windows\n", + "pipeline.add_stage(\n", + " DFPRollingWindowStage(\n", + " config,\n", + " min_history=300 if is_training else 1,\n", + " min_increment=300 if is_training else 0,\n", + " # For inference, we only ever want 1 day max\n", + " max_history=\"60d\" if is_training else \"1d\",\n", + " cache_dir=cache_dir))\n", + "\n", + "# Output is UserMessageMeta -- Cached frame set\n", + "pipeline.add_stage(DFPPreprocessingStage(config, input_schema=preprocess_schema, only_new_batches=not is_training))\n", + "\n", + "# Finally, perform training which will output a model\n", + "pipeline.add_stage(DFPTraining(config))\n", + "\n", + "# Write that model to MLFlow\n", + "pipeline.add_stage(\n", + " DFPMLFlowModelWriterStage(config,\n", + " model_name_formatter=model_name_formatter,\n", + " experiment_name_formatter=experiment_name_formatter))\n", + "\n", + "# Run the pipeline\n", + "await pipeline._do_run()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.8.13 ('morpheus_dev')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + }, + "vscode": { + "interpreter": { + "hash": "f7a30172b4be85fcd6fc3717815fa43e2969e39e7c3ddd169e51bb2fb4d7b2e9" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/digital_fingerprinting/production/morpheus/dfp_duo_inference.ipynb b/examples/digital_fingerprinting/production/morpheus/dfp_duo_inference.ipynb new file mode 100644 index 0000000000..2dda95e41f --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp_duo_inference.ipynb @@ -0,0 +1,557 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2941e94f-db20-44a5-ab87-2cab499825f7", + "metadata": {}, + "source": [ + "# Digital Finger Printing (DFP) with Morpheus - DUO Inference\n", + "## Introduction\n", + "\n", + "In this notebook, we will be building and running a DFP pipeline that performs inference on Duo authentication logs. The goal is to use the pretrained models generated in the Duo Training notebook to generate anomaly scores for each log. These anomaly scores can be used by security teams to detect abnormal behavior when it happens so the proper action can be taken.\n", + "\n", + "
\n", + "Note: For more information on DFP, the Morpheus pipeline, and setup steps to run this notebook, please see the coresponding DFP training materials.\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b6c1cb50-74f2-445d-b865-8c22c3b3798b", + "metadata": {}, + "outputs": [], + "source": [ + "# Ensure that the morpheus directory is in the python path. This may not need to be run depending on the environment setup\n", + "import sys\n", + "import os\n", + "sys.path.insert(0, os.path.abspath(\"./morpheus\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "102ce011-3ca3-4f96-a72d-de28fad32003", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import functools\n", + "import logging\n", + "import os\n", + "import typing\n", + "from datetime import datetime\n", + "from functools import partial\n", + "\n", + "import click\n", + "import mlflow\n", + "from dfp.stages.dfp_file_batcher_stage import DFPFileBatcherStage\n", + "from dfp.stages.dfp_file_to_df import DFPFileToDataFrameStage\n", + "from dfp.stages.dfp_inference_stage import DFPInferenceStage\n", + "from dfp.stages.dfp_mlflow_model_writer import DFPMLFlowModelWriterStage\n", + "from dfp.stages.dfp_postprocessing_stage import DFPPostprocessingStage\n", + "from dfp.stages.dfp_preprocessing_stage import DFPPreprocessingStage\n", + "from dfp.stages.dfp_rolling_window_stage import DFPRollingWindowStage\n", + "from dfp.stages.dfp_split_users_stage import DFPSplitUsersStage\n", + "from dfp.stages.dfp_training import DFPTraining\n", + "from dfp.stages.multi_file_source import MultiFileSource\n", + "from dfp.utils.column_info import BoolColumn\n", + "from dfp.utils.column_info import ColumnInfo\n", + "from dfp.utils.column_info import CustomColumn\n", + "from dfp.utils.column_info import DataFrameInputSchema\n", + "from dfp.utils.column_info import DateTimeColumn\n", + "from dfp.utils.column_info import IncrementColumn\n", + "from dfp.utils.column_info import RenameColumn\n", + "from dfp.utils.column_info import StringCatColumn\n", + "from dfp.utils.column_info import create_increment_col\n", + "from dfp.utils.file_utils import date_extractor\n", + "from dfp.utils.file_utils import iso_date_regex\n", + "\n", + "from morpheus._lib.file_types import FileTypes\n", + "from morpheus.cli.utils import get_package_relative_file\n", + "from morpheus.cli.utils import load_labels_file\n", + "from morpheus.config import Config\n", + "from morpheus.config import ConfigAutoEncoder\n", + "from morpheus.config import CppConfig\n", + "from morpheus.pipeline import LinearPipeline\n", + "from morpheus.stages.general.monitor_stage import MonitorStage\n", + "from morpheus.stages.output.write_to_file_stage import WriteToFileStage\n", + "from morpheus.utils.logger import configure_logging\n", + "from morpheus.utils.logger import get_log_levels\n", + "from morpheus.utils.logger import parse_log_level\n", + "\n", + "# Left align all tables\n", + "from IPython.core.display import HTML\n", + "table_css = 'table {align:left;display:block}'\n", + "HTML(''.format(table_css))" + ] + }, + { + "cell_type": "markdown", + "id": "ca38c1b7-ce84-43e0-ac53-280562dc1642", + "metadata": {}, + "source": [ + "## High Level Configuration\n", + "\n", + "The following options significantly alter the functionality of the pipeline. These options are separated from the individual stage options since they may effect more than one stage. 
Additionally, the matching python script to this notebook, `dfp_duo_pipeline.py`, configures these options via command line arguments.\n", + "\n", + "### Options\n", + "\n", + "| Name | Type | Description |\n", + "| --- | --- | :-- |\n", + "| `train_users` | One of `[\"none\"]` | For inference, this option should always be `\"none\"` |\n", + "| `skip_users` | List of strings | Any user in this list will be dropped from the pipeline. Useful for debugging to remove automated accounts with many logs. |\n", + "| `cache_dir` | string | The location to store cached files. To aid with development and reduce bandwidth, the Morpheus pipeline will cache data from several stages of the pipeline. This option configures the location for those caches. |\n", + "| `input_files` | List of strings | List of files to process. Can specificy multiple arguments for multiple files. Also accepts glob (\\*) wildcards and schema prefixes such as `s3://`. For example, to make a local cache of an s3 bucket, use `filecache::s3://mybucket/*`. See `fsspec` documentation for list of possible options. |\n", + "| `model_name_formatter` | string | A format string to use when building the model name. The model name is constructed by calling `model_name_formatter.format(user_id=user_id)`. For example, with `model_name_formatter=\"my_model-{user_id}\"` and a user ID of `\"first:last\"` would result in the model name of `\"my_model-first:last\"`. This should match the value used in `DFPMLFlowModelWriterStage`. Available keyword arguments: `user_id`, `user_md5`. |\n", + "| `experiment_name_formatter` | string | A format string (without the `f`) that will be used when creating an experiment in ML Flow. Available keyword arguments: `user_id`, `user_md5`, `reg_model_name`. |\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "9ee00703-75c5-46fc-890c-86733da906c4", + "metadata": {}, + "outputs": [], + "source": [ + "# Global options\n", + "train_users = \"none\"\n", + "\n", + "# Enter any users to skip here\n", + "skip_users: typing.List[str] = []\n", + "\n", + "# Location where cache objects will be saved\n", + "cache_dir = \"./.cache/dfp\"\n", + "\n", + "# Input files to read from\n", + "input_files = [\n", + " \"/work/examples/data/dfp/duo/duotest_pt1.json\",\n", + " \"/work/examples/data/dfp/duo/duotest_pt2.json\",\n", + " \"/work/examples/data/dfp/duo/duotest_pt3.json\",\n", + " \"/work/examples/data/dfp/duo/duotest_pt4.json\"\n", + "]\n", + "\n", + "# The format to use for models\n", + "model_name_formatter = \"DFP-duo-{user_id}\"\n", + "\n", + "# === Derived Options ===\n", + "# To include the generic, we must be training all or generic\n", + "include_generic = train_users == \"all\" or train_users == \"generic\"\n", + "\n", + "# To include individual, we must be either training or inferring\n", + "include_individual = train_users != \"generic\"\n", + "\n", + "# None indicates we arent training anything\n", + "is_training = train_users != \"none\"" + ] + }, + { + "cell_type": "markdown", + "id": "1cfc24c9-c85e-4977-a348-692c8f0aceaa", + "metadata": {}, + "source": [ + "### Global Config Object\n", + "Before creating the pipeline, we need to setup logging and set the parameters for the Morpheus config object. This config object is responsible for the following:\n", + " - Indicating whether to use C++ or Python stages\n", + " - C++ stages are not supported for the DFP pipeline. This should always be `False`\n", + " - Setting the number of threads to use in the pipeline. 
Defaults to the thread count of the OS.\n", + " - Sets the feature column names that will be used in model training\n", + " - This option allows extra columns to be used in the pipeline that will not be part of the training algorithm.\n", + " - The final features that the model will be trained on will be an intersection of this list with the log columns.\n", + " - The column name that indicates the user's unique identifier\n", + " - It is required for DFP to have a user ID column\n", + " - The column name that indicates the timestamp for the log\n", + " - It is required for DFP to know when each log occurred" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "01abd537-9162-49dc-8e83-d9465592f1d5", + "metadata": {}, + "outputs": [], + "source": [ + "# Enable the Morpheus logger\n", + "configure_logging(log_level=logging.DEBUG)\n", + "\n", + "config = Config()\n", + "\n", + "CppConfig.set_should_use_cpp(False)\n", + "\n", + "config.num_threads = os.cpu_count()\n", + "\n", + "config.ae = ConfigAutoEncoder()\n", + "\n", + "config.ae.feature_columns = [\n", + " 'accessdevicebrowser', 'accessdeviceos', 'device', 'result', 'reason', 'logcount', \"locincrement\"\n", + "]\n", + "config.ae.userid_column_name = \"username\"\n", + "config.ae.timestamp_column_name = \"timestamp\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a73a4d53-32b6-4ab8-a5d7-c0104b31c69b", + "metadata": {}, + "outputs": [], + "source": [ + "# Specify the column names to ensure all data is uniform\n", + "source_column_info = [\n", + " DateTimeColumn(name=config.ae.timestamp_column_name, dtype=datetime, input_name=\"timestamp\"),\n", + " RenameColumn(name=config.ae.userid_column_name, dtype=str, input_name=\"user.name\"),\n", + " RenameColumn(name=\"accessdevicebrowser\", dtype=str, input_name=\"access_device.browser\"),\n", + " RenameColumn(name=\"accessdeviceos\", dtype=str, input_name=\"access_device.os\"),\n", + " StringCatColumn(name=\"location\",\n", + " dtype=str,\n", + " input_columns=[\n", + " \"access_device.location.city\",\n", + " \"access_device.location.state\",\n", + " \"access_device.location.country\"\n", + " ],\n", + " sep=\", \"),\n", + " RenameColumn(name=\"authdevicename\", dtype=str, input_name=\"auth_device.name\"),\n", + " BoolColumn(name=\"result\",\n", + " dtype=bool,\n", + " input_name=\"result\",\n", + " true_values=[\"success\", \"SUCCESS\"],\n", + " false_values=[\"denied\", \"DENIED\", \"FRAUD\"]),\n", + " ColumnInfo(name=\"reason\", dtype=str),\n", + " # CustomColumn(name=\"user.groups\", dtype=str, process_column_fn=partial(column_listjoin, col_name=\"user.groups\"))\n", + "]\n", + "\n", + "source_schema = DataFrameInputSchema(json_columns=[\"access_device\", \"application\", \"auth_device\", \"user\"],\n", + " column_info=source_column_info)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f7a0cb0a-e65a-444a-a06c-a4525d543790", + "metadata": {}, + "outputs": [], + "source": [ + "# Preprocessing schema\n", + "preprocess_column_info = [\n", + " ColumnInfo(name=config.ae.timestamp_column_name, dtype=datetime),\n", + " ColumnInfo(name=config.ae.userid_column_name, dtype=str),\n", + " ColumnInfo(name=\"accessdevicebrowser\", dtype=str),\n", + " ColumnInfo(name=\"accessdeviceos\", dtype=str),\n", + " ColumnInfo(name=\"authdevicename\", dtype=str),\n", + " ColumnInfo(name=\"result\", dtype=bool),\n", + " ColumnInfo(name=\"reason\", dtype=str),\n", + " # Derived columns\n", + " IncrementColumn(name=\"logcount\",\n", + " dtype=int,\n", + " 
input_name=config.ae.timestamp_column_name,\n", + " groupby_column=config.ae.userid_column_name),\n", + " CustomColumn(name=\"locincrement\",\n", + " dtype=int,\n", + " process_column_fn=partial(create_increment_col, column_name=\"location\")),\n", + "]\n", + "\n", + "preprocess_schema = DataFrameInputSchema(column_info=preprocess_column_info, preserve_columns=[\"_batch_id\"])\n" + ] + }, + { + "cell_type": "markdown", + "id": "bdfc59de-dea8-4e5f-98e3-98eba1e4621d", + "metadata": {}, + "source": [ + "## Pipeline Construction\n", + "From this point on we begin constructing the stages that will make up the pipeline. To make testing easier, constructing the pipeline object, adding the stages, and running the pipeline, is provided as a single cell. The below cell can be rerun multiple times as needed for debugging.\n", + "\n", + "### Source Stage (`MultiFileSource`)\n", + "\n", + "This pipeline read input logs from one or more input files. This source stage will read all specified log files, combine them into a single `DataFrame`, and pass it into the pipeline. Once all of the logs have been read, the source completes. \n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `filenames` | List of strings | | Any files to read into the pipeline. All files will be combined into a single `DataFrame` |\n", + "\n", + "### File Batcher Stage (`DFPFileBatcherStage`)\n", + "\n", + "To improve performance, multiple small input files can be batched together into a single DataFrame for processing. This stage is responsible for determining the timestamp of input files, grouping input files into batches by time, and sending the batches to be processed into a single DataFrame. Batches of files that have been seen before will be loaded from cache resulting in increased performance. For example, when performaing a 60 day training run, 59 days can be cached with a period of `\"D\"` and retraining once per day.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `period` | `str` | `\"D\"` | The period to create batches. See `pandas` windowing frequency documentation for available options. |\n", + "| `date_conversion_func` | Function of `typing.Callable[[fsspec.core.OpenFile], datetime]` | | A callback which is responsible for determining the date for a specified file. |\n", + "\n", + "### File to DataFrame Stage (`DFPFileToDataFrameStage`)\n", + "\n", + "After files have been batched into groups, this stage is responsible for reading the files and converting into a DataFrame. The specified input schema converts the raw DataFrame into one suitable for caching and processing. Any columns that are not needed should be excluded from the schema.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `schema` | `DataFrameInputSchema` | | After the raw `DataFrame` is read from each file, this schema will be applied to ensure a consisten output from the source. |\n", + "| `file_type` | `FileTypes` | `FileTypes.Auto` | Allows overriding the file type. When set to `Auto`, the file extension will be used. Options are `CSV`, `JSON`, `Auto`. |\n", + "| `parser_kwargs` | `dict` | `{}` | This dictionary will be passed to the `DataFrame` parser class. Allows for customization of log parsing. |\n", + "| `cache_dir` | `str` | `./.cache/dfp` | The location to write cached input files to. 
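
The `fsspec`-style file specifications accepted by `MultiFileSource` can be previewed outside of the pipeline. A minimal sketch, with the S3 bucket name as a placeholder:

```python
import fsspec

# Local glob: expands to every matching JSON file.
local_files = fsspec.open_files("/work/examples/data/dfp/duo/duotest_pt*.json")
for open_file in local_files:
    print(open_file.path)

# Remote glob with a local cache layer, similar to the `filecache::s3://mybucket/*`
# form mentioned above (bucket name is a placeholder; credentials/options omitted):
# remote_files = fsspec.open_files("filecache::s3://mybucket/*.json",
#                                  filecache={"cache_storage": "./.cache/dfp"})
```
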
|\n", + "\n", + "### Split Users Stage (`DFPSplitUsersStage`)\n", + "\n", + "Once the input logs have been read into a `DataFrame`, this stage is responsible for breaking that single `DataFrame` with many users into multiple `DataFrame`s for each user. This is also where the pipeline chooses whether to train individual users or the generic user (or both).\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `include_generic` | `bool` | | Whether or not to combine all user logs into a single `DataFrame` with the username 'generic_user' |\n", + "| `include_individual` | `bool` | | Whether or not to output individual `DataFrame` objects for each user |\n", + "| `skip_users` | List of `str` | `[]` | Any users to remove from the `DataFrame`. Useful for debugging to remove automated accounts with many logs. Mutually exclusive with `only_users`. |\n", + "| `only_users` | List of `str` | `[]` | Only allow these users in the final `DataFrame`. Useful for debugging to focus on specific users. Mutually exclusive with `skip_users`. |\n", + "\n", + "### Rolling Window Stage (`DFPRollingWindowStage`)\n", + "\n", + "The Rolling Window Stage performs several key pieces of functionality for DFP.\n", + "1. This stage keeps a moving window of logs on a per user basis\n", + " 1. These logs are saved to disk to reduce memory requirements between logs from the same user\n", + "1. It only emits logs when the window history requirements are met\n", + " 1. Until all of the window history requirements are met, no messages will be sent to the rest of the pipeline.\n", + " 1. See the below options for configuring the window history requirements\n", + "1. It repeats the necessary logs to properly calculate log dependent features.\n", + " 1. To support all column feature types, incoming log messages can be combined with existing history and sent to downstream stages.\n", + " 1. For example, to calculate a feature that increments a counter for the number of logs a particular user has generated in a single day, we must have the user's log history for the past 24 hours. To support this, this stage will combine new logs with existing history into a single `DataFrame`.\n", + " 1. It is the responsibility of downstream stages to distinguish between new logs and existing history.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `min_history` | `int` | `1` | The minimum number of logs a user must have before emitting any messages. Logs below this threshold will be saved to disk. |\n", + "| `min_increment` | `int` or `str` | `0` | Once the min history requirement is met, this stage must receive `min_increment` *new* logs before emmitting another message. Logs received before this threshold is met will be saved to disk. Can be specified as an integer count or a string duration. |\n", + "| `max_history` | `int` or `str` | `\"1d\"` | Once `min_history` and `min_increment` requirements have been met, this puts an upper bound on the maximum number of messages to forward into the pipeline and also the maximum amount of messages to retain in the history. Can be specified as an integer count or a string duration. |\n", + "| `cache_dir` | `str` | `./.cache/dfp` | The location to write log history to disk. |\n", + "\n", + "### Preprocessing Stage (`DFPPreprocessingStage`)\n", + "\n", + "This stage performs the final, row dependent, feature calculations as specified by the input schema object. 
Once calculated, this stage can forward on all received logs, or optionally can only forward on new logs, removing any history information.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `input_schema` | `DataFrameInputSchema` | | The final, row dependent, schema to apply to the incoming columns |\n", + "| `only_new_batches` | `bool` | | Whether or not to forward on all received logs, or just new logs. |\n", + "\n", + "### Inference Stage (`DFPInference`)\n", + "\n", + "This stage performs several tasks to aid in performing inference. This stage will:\n", + "1. Download models as needed from MLFlow\n", + "1. Cache previously downloaded models to improve performance\n", + " 1. Models in the cache will be periodically refreshed from MLFlow at a configured rate\n", + "1. Perform inference using the downloaded model\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `model_name_formatter` | `str` | `\"\"` | A format string to use when building the model name. The model name is constructed by calling `model_name_formatter.format(user_id=user_id)`. For example, with `model_name_formatter=\"my_model-{user_id}\"`, a user ID of `\"first:last\"` would result in the model name `\"my_model-first:last\"`. This should match the value used in `DFPMLFlowModelWriterStage` |\n", + "\n", + "### Post Processing Stage (`DFPPostprocessingStage`)\n", + "\n", + "This stage filters the output from the inference stage for any anomalous messages. Logs which exceed the specified Z-Score will be passed on to the next stage. All remaining logs below the threshold will be dropped.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `z_score_threshold` | `float` | `2.0` | The Z-Score used to separate anomalous logs from normal logs. All normal logs will be filtered out and anomalous logs will be passed on. |\n", + "\n", + "### Write to File Stage (`WriteToFileStage`)\n", + "\n", + "This final stage will write all received messages to a single output file in either CSV or JSON format.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `filename` | `str` | | The file to write anomalous log messages to. 
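
Conceptually, the inference stage resolves a per-user model name and fetches that model from the MLflow registry. A minimal sketch using the public MLflow API, with the tracking URI and model version as placeholders; the actual stage adds the caching and periodic refresh described above and may load the model differently:

```python
import mlflow

mlflow.set_tracking_uri("http://localhost:5000")  # placeholder tracking server

model_name = "DFP-duo-{user_id}".format(user_id="generic_user")  # -> "DFP-duo-generic_user"

# Fetch a specific registered version of the per-user model from the registry.
model = mlflow.pyfunc.load_model(model_uri=f"models:/{model_name}/1")
```
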
|\n", + "| `overwrite` | `bool` | `False` | If the file specified in `filename` already exists, it will be overwritten if this option is set to `True` |" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "825390ad-ce64-4949-b324-33039ffdf264", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "====Registering Pipeline====\u001b[0m\n", + "====Registering Pipeline Complete!====\u001b[0m\n", + "====Starting Pipeline====\u001b[0m\n", + "====Pipeline Started====\u001b[0m\n", + "====Building Pipeline====\u001b[0m\n", + "Added source: , input_name='access_device.browser'), RenameColumn(name='accessdeviceos', dtype=, input_name='access_device.os'), RenameColumn(name='locationcity', dtype=, input_name='auth_device.location.city'), RenameColumn(name='device', dtype=, input_name='auth_device.name'), BoolColumn(name='result', dtype=, input_name='result', value_map={'success': True, 'SUCCESS': True, 'denied': False, 'DENIED': False, 'FRAUD': False}), RenameColumn(name='reason', dtype=, input_name='reason'), RenameColumn(name='username', dtype=, input_name='user.name'), RenameColumn(name='timestamp', dtype=, input_name='timestamp')], preserve_columns=None), filenames=['/work/examples/data/dfp/duo/duotest_pt1.json', '/work/examples/data/dfp/duo/duotest_pt2.json', '/work/examples/data/dfp/duo/duotest_pt3.json', '/work/examples/data/dfp/duo/duotest_pt4.json'], file_type=FileTypes.Auto, parser_kwargs={'lines': False, 'orient': 'records'})>\n", + " └─> cudf.DataFrame\u001b[0m\n", + "Added stage: \n", + " └─ cudf.DataFrame -> dfp.DFPMessageMeta\u001b[0m\n", + "Added stage: \n", + " └─ dfp.DFPMessageMeta -> dfp.MultiDFPMessage\u001b[0m\n", + "Added stage: , input_name='accessdevicebrowser'), RenameColumn(name='accessdeviceos', dtype=, input_name='accessdeviceos'), RenameColumn(name='device', dtype=, input_name='device'), RenameColumn(name='result', dtype=, input_name='result'), RenameColumn(name='reason', dtype=, input_name='reason'), CustomColumn(name='logcount', dtype=, process_column_fn=), CustomColumn(name='locincrement', dtype=, process_column_fn=), RenameColumn(name='username', dtype=, input_name='username'), RenameColumn(name='timestamp', dtype=, input_name='timestamp')], preserve_columns=re.compile('(_batch_id)')), return_format=data, only_new_batches=True)>\n", + " └─ dfp.MultiDFPMessage -> dfp.MultiDFPMessage\u001b[0m\n", + "Added stage: \n", + " └─ dfp.MultiDFPMessage -> morpheus.MultiAEMessage\u001b[0m\n", + "Added stage: \n", + " └─ morpheus.MultiAEMessage -> dfp.DFPMessageMeta\u001b[0m\n", + "Added stage: \n", + " └─ dfp.DFPMessageMeta -> dfp.DFPMessageMeta\u001b[0m\n", + "====Building Pipeline Complete!====\u001b[0m\n", + "\u001b[2mStarting! Time: 1661583516.3895624\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "W20220827 06:58:36.361177 496 thread.cpp:138] unable to set memory policy - if using docker use: --cap-add=sys_nice to allow membind\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Sending 4000 rows\n", + "\u001b[2mBatch split users complete. Input: 4000 rows from 2021-09-14 00:52:14 to 2022-01-24 15:23:41. Output: 10 users, rows/user min: 176, max: 700, avg: 400.00. Duration: 5.55 ms\u001b[0m\n", + "\u001b[2mRolling window complete for badguy in 16.03 ms. Input: 490 rows from 2021-09-14 00:52:14 to 2022-01-24 09:30:26. 
Output: 490 rows from 2021-09-14 00:52:14 to 2022-01-24 09:30:26\u001b[0m\n", + "\u001b[2mPreprocessed 490 data for logs in 2021-09-14 00:52:14 to 2022-01-24 09:30:26 in 34.24859046936035 ms\u001b[0m\n", + "\u001b[2mRolling window complete for maliciousactor in 37.56 ms. Input: 700 rows from 2021-09-14 16:58:15 to 2022-01-24 15:23:41. Output: 700 rows from 2021-09-14 16:58:15 to 2022-01-24 15:23:41\u001b[0m\n", + "\u001b[2mRolling window complete for usera in 24.63 ms. Input: 215 rows from 2021-10-07 10:53:23 to 2022-01-24 15:21:18. Output: 215 rows from 2021-10-07 10:53:23 to 2022-01-24 15:21:18\u001b[0m\n", + "\u001b[2mRolling window complete for userb in 28.06 ms. Input: 457 rows from 2021-10-07 04:02:07 to 2022-01-24 15:22:10. Output: 457 rows from 2021-10-07 04:02:07 to 2022-01-24 15:22:10\u001b[0m\n", + "\u001b[2mPreprocessed 700 data for logs in 2021-09-14 16:58:15 to 2022-01-24 15:23:41 in 52.93703079223633 ms\u001b[0m\n", + "\u001b[2mPreprocessed 215 data for logs in 2021-10-07 10:53:23 to 2022-01-24 15:21:18 in 25.563955307006836 ms\u001b[0m\n", + "\u001b[2mRolling window complete for userc in 45.67 ms. Input: 500 rows from 2021-10-06 20:29:50 to 2022-01-24 15:23:00. Output: 500 rows from 2021-10-06 20:29:50 to 2022-01-24 15:23:00\u001b[0m\n", + "\u001b[2mPreprocessed 457 data for logs in 2021-10-07 04:02:07 to 2022-01-24 15:22:10 in 42.35053062438965 ms\u001b[0m\n", + "\u001b[2mRolling window complete for userd in 39.30 ms. Input: 537 rows from 2021-10-07 02:25:27 to 2022-01-24 12:26:34. Output: 537 rows from 2021-10-07 02:25:27 to 2022-01-24 12:26:34\u001b[0m\n", + "\u001b[2mRolling window complete for usere in 25.54 ms. Input: 176 rows from 2021-12-15 12:12:25 to 2022-01-24 07:56:46. Output: 176 rows from 2021-12-15 12:12:25 to 2022-01-24 07:56:46\u001b[0m\n", + "\u001b[2mPreprocessed 500 data for logs in 2021-10-06 20:29:50 to 2022-01-24 15:23:00 in 49.77822303771973 ms\u001b[0m\n", + "\u001b[2mRolling window complete for userf in 35.04 ms. Input: 304 rows from 2021-10-07 10:55:23 to 2022-01-24 14:08:00. Output: 304 rows from 2021-10-07 10:55:23 to 2022-01-24 14:08:00\u001b[0m\n", + "\u001b[2mRolling window complete for userg in 36.67 ms. Input: 275 rows from 2021-10-07 10:55:01 to 2022-01-24 15:23:25. Output: 275 rows from 2021-10-07 10:55:01 to 2022-01-24 15:23:25\u001b[0m\n", + "\u001b[2mPreprocessed 537 data for logs in 2021-10-07 02:25:27 to 2022-01-24 12:26:34 in 71.11096382141113 ms\u001b[0m\n", + "\u001b[2mRolling window complete for userh in 25.03 ms. Input: 346 rows from 2021-10-07 10:54:07 to 2022-01-24 15:22:10. Output: 346 rows from 2021-10-07 10:54:07 to 2022-01-24 15:22:10\u001b[0m\n", + "\u001b[2mPreprocessed 176 data for logs in 2021-12-15 12:12:25 to 2022-01-24 07:56:46 in 23.290157318115234 ms\u001b[0m\n", + "\u001b[2mPreprocessed 304 data for logs in 2021-10-07 10:55:23 to 2022-01-24 14:08:00 in 27.817487716674805 ms\u001b[0m\n", + "\u001b[2mPreprocessed 275 data for logs in 2021-10-07 10:55:01 to 2022-01-24 15:23:25 in 35.3856086730957 ms\u001b[0m\n", + "\u001b[2mPreprocessed 346 data for logs in 2021-10-07 10:54:07 to 2022-01-24 15:22:10 in 26.59463882446289 ms\u001b[0m\n", + "\u001b[2mCompleted inference for user badguy. Model load: 4450.737953186035 ms, Model infer: 17.392873764038086 ms. Start: 2021-09-14 00:52:14, End: 2022-01-24 09:30:26\u001b[0m\n", + "\u001b[2mCompleted postprocessing for user badguy in 6.493568420410156 ms. Event count: 12. 
Start: 2021-09-14 00:52:14, End: 2022-01-24 09:30:26\u001b[0m\n", + "\u001b[2mCompleted inference for user maliciousactor. Model load: 168.58458518981934 ms, Model infer: 14.253854751586914 ms. Start: 2021-09-14 16:58:15, End: 2022-01-24 15:23:41\u001b[0m\n", + "\u001b[2mCompleted postprocessing for user maliciousactor in 6.427764892578125 ms. Event count: 0. Start: 2021-09-14 16:58:15, End: 2022-01-24 15:23:41\u001b[0m\n", + "\u001b[2mCompleted inference for user usera. Model load: 162.7488136291504 ms, Model infer: 13.43989372253418 ms. Start: 2021-10-07 10:53:23, End: 2022-01-24 15:21:18\u001b[0m\n", + "\u001b[2mCompleted postprocessing for user usera in 4.646778106689453 ms. Event count: 0. Start: 2021-10-07 10:53:23, End: 2022-01-24 15:21:18\u001b[0m\n", + "\u001b[2mCompleted inference for user userb. Model load: 159.15226936340332 ms, Model infer: 14.02425765991211 ms. Start: 2021-10-07 04:02:07, End: 2022-01-24 15:22:10\u001b[0m\n", + "\u001b[2mCompleted postprocessing for user userb in 6.743431091308594 ms. Event count: 1. Start: 2021-10-07 04:02:07, End: 2022-01-24 15:22:10\u001b[0m\n", + "\u001b[2mCompleted inference for user userc. Model load: 162.69850730895996 ms, Model infer: 14.590740203857422 ms. Start: 2021-10-06 20:29:50, End: 2022-01-24 15:23:00\u001b[0m\n", + "\u001b[2mCompleted postprocessing for user userc in 6.506443023681641 ms. Event count: 3. Start: 2021-10-06 20:29:50, End: 2022-01-24 15:23:00\u001b[0m\n", + "\u001b[2mCompleted inference for user userd. Model load: 195.6191062927246 ms, Model infer: 14.337778091430664 ms. Start: 2021-10-07 02:25:27, End: 2022-01-24 12:26:34\u001b[0m\n", + "\u001b[2mCompleted postprocessing for user userd in 14.258861541748047 ms. Event count: 24. Start: 2021-10-07 02:25:27, End: 2022-01-24 12:26:34\u001b[0m\n", + "\u001b[2mCompleted inference for user usere. Model load: 0.6737709045410156 ms, Model infer: 24.237871170043945 ms. Start: 2021-12-15 12:12:25, End: 2022-01-24 07:56:46\u001b[0m\n", + "\u001b[2mCompleted postprocessing for user usere in 4.172325134277344 ms. Event count: 0. Start: 2021-12-15 12:12:25, End: 2022-01-24 07:56:46\u001b[0m\n", + "\u001b[2mCompleted inference for user userf. Model load: 163.68651390075684 ms, Model infer: 14.430522918701172 ms. Start: 2021-10-07 10:55:23, End: 2022-01-24 14:08:00\u001b[0m\n", + "\u001b[2mCompleted postprocessing for user userf in 13.100862503051758 ms. Event count: 9. Start: 2021-10-07 10:55:23, End: 2022-01-24 14:08:00\u001b[0m\n", + "\u001b[2mCompleted inference for user userg. Model load: 1.8045902252197266 ms, Model infer: 23.801803588867188 ms. Start: 2021-10-07 10:55:01, End: 2022-01-24 15:23:25\u001b[0m\n", + "\u001b[2mCompleted postprocessing for user userg in 5.267620086669922 ms. Event count: 0. Start: 2021-10-07 10:55:01, End: 2022-01-24 15:23:25\u001b[0m\n", + "\u001b[2mCompleted inference for user userh. Model load: 191.9386386871338 ms, Model infer: 15.669107437133789 ms. Start: 2021-10-07 10:54:07, End: 2022-01-24 15:22:10\u001b[0m\n", + "\u001b[2mCompleted postprocessing for user userh in 8.036375045776367 ms. Event count: 45. Start: 2021-10-07 10:54:07, End: 2022-01-24 15:22:10\u001b[0m\n", + "====Pipeline Complete====\u001b[0m\n" + ] + } + ], + "source": [ + "# Create a linear pipeline object\n", + "pipeline = LinearPipeline(config)\n", + "\n", + "# Source stage\n", + "pipeline.set_source(MultiFileSource(config, filenames=input_files))\n", + "\n", + "# Batch files into buckets by time. 
Use the default ISO date extractor from the filename\n", + "pipeline.add_stage(\n", + " DFPFileBatcherStage(config,\n", + " period=\"D\",\n", + " date_conversion_func=functools.partial(date_extractor, filename_regex=iso_date_regex)))\n", + "\n", + "# Output is S3 Buckets. Convert to DataFrames. This caches downloaded S3 data\n", + "pipeline.add_stage(\n", + " DFPFileToDataFrameStage(config,\n", + " schema=source_schema,\n", + " file_type=FileTypes.JSON,\n", + " parser_kwargs={\n", + " \"lines\": False, \"orient\": \"records\"\n", + " },\n", + " cache_dir=cache_dir))\n", + "\n", + "\n", + "# This will split users or just use one single user\n", + "pipeline.add_stage(\n", + " DFPSplitUsersStage(config,\n", + " include_generic=include_generic,\n", + " include_individual=include_individual,\n", + " skip_users=skip_users))\n", + "\n", + "# Next, have a stage that will create rolling windows\n", + "pipeline.add_stage(\n", + " DFPRollingWindowStage(\n", + " config,\n", + " min_history=300 if is_training else 1,\n", + " min_increment=300 if is_training else 0,\n", + " # For inference, we only ever want 1 day max\n", + " max_history=\"60d\" if is_training else \"1d\",\n", + " cache_dir=cache_dir))\n", + "\n", + "# Output is UserMessageMeta -- Cached frame set\n", + "pipeline.add_stage(DFPPreprocessingStage(config, input_schema=preprocess_schema, only_new_batches=not is_training))\n", + "\n", + "# Perform inference on the preprocessed data\n", + "pipeline.add_stage(DFPInferenceStage(config, model_name_formatter=model_name_formatter))\n", + "\n", + "# Filter for only the anomalous logs\n", + "pipeline.add_stage(DFPPostprocessingStage(config, z_score_threshold=2.0))\n", + "\n", + "# Write all anomalies to a CSV file\n", + "pipeline.add_stage(WriteToFileStage(config, filename=\"dfp_detections_duo.csv\", overwrite=True))\n", + "\n", + "# Run the pipeline\n", + "await pipeline._do_run()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75c0cf6b-8255-4d90-b67c-151518c7423b", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:morpheus] *", + "language": "python", + "name": "conda-env-morpheus-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + }, + "vscode": { + "interpreter": { + "hash": "e26783b24f020aa0bcaa00e6ba122db5d0e3da2d892d80be664969895e06a7e1" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/digital_fingerprinting/production/morpheus/dfp_duo_pipeline.py b/examples/digital_fingerprinting/production/morpheus/dfp_duo_pipeline.py new file mode 100644 index 0000000000..59a5f0d298 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp_duo_pipeline.py @@ -0,0 +1,290 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import logging +import os +import typing +from datetime import datetime +from functools import partial + +import click +import mlflow +from dfp.stages.dfp_file_batcher_stage import DFPFileBatcherStage +from dfp.stages.dfp_file_to_df import DFPFileToDataFrameStage +from dfp.stages.dfp_inference_stage import DFPInferenceStage +from dfp.stages.dfp_mlflow_model_writer import DFPMLFlowModelWriterStage +from dfp.stages.dfp_postprocessing_stage import DFPPostprocessingStage +from dfp.stages.dfp_preprocessing_stage import DFPPreprocessingStage +from dfp.stages.dfp_rolling_window_stage import DFPRollingWindowStage +from dfp.stages.dfp_split_users_stage import DFPSplitUsersStage +from dfp.stages.dfp_training import DFPTraining +from dfp.stages.multi_file_source import MultiFileSource +from dfp.utils.column_info import BoolColumn +from dfp.utils.column_info import ColumnInfo +from dfp.utils.column_info import CustomColumn +from dfp.utils.column_info import DataFrameInputSchema +from dfp.utils.column_info import DateTimeColumn +from dfp.utils.column_info import IncrementColumn +from dfp.utils.column_info import RenameColumn +from dfp.utils.column_info import StringCatColumn +from dfp.utils.column_info import create_increment_col +from dfp.utils.file_utils import date_extractor +from dfp.utils.file_utils import iso_date_regex + +from morpheus._lib.file_types import FileTypes +from morpheus.cli.utils import get_package_relative_file +from morpheus.cli.utils import load_labels_file +from morpheus.config import Config +from morpheus.config import ConfigAutoEncoder +from morpheus.config import CppConfig +from morpheus.pipeline import LinearPipeline +from morpheus.stages.general.monitor_stage import MonitorStage +from morpheus.stages.output.write_to_file_stage import WriteToFileStage +from morpheus.utils.logger import configure_logging +from morpheus.utils.logger import get_log_levels +from morpheus.utils.logger import parse_log_level + + +@click.command() +@click.option( + "--train_users", + type=click.Choice(["all", "generic", "individual", "none"], case_sensitive=False), + help="Indicates whether or not to train per user or a generic model for all users", +) +@click.option( + "--skip_user", + multiple=True, + type=str, + help="User IDs to skip. Mutually exclusive with only_user", +) +@click.option( + "--only_user", + multiple=True, + type=str, + help="Only users specified by this option will be included. Mutually exclusive with skip_user", +) +@click.option( + "--duration", + type=str, + default="60d", + help="The duration to run starting from now", +) +@click.option( + "--cache_dir", + type=str, + default="./.cache/dfp", + show_envvar=True, + help="The location to cache data such as S3 downloads and pre-processed data", +) +@click.option("--log_level", + default=logging.getLevelName(Config().log_level), + type=click.Choice(get_log_levels(), case_sensitive=False), + callback=parse_log_level, + help="Specify the logging level to use.") +@click.option("--sample_rate_s", + type=int, + default=0, + show_envvar=True, + help="Minimum time step, in milliseconds, between object logs.") +@click.option( + "--input_file", + "-f", + type=str, + multiple=True, + help=("List of files to process. Can specificy multiple arguments for multiple files. " + "Also accepts glob (*) wildcards and schema prefixes such as `s3://`. 
" + "For example, to make a local cache of an s3 bucket, use `filecache::s3://mybucket/*`. " + "See fsspec documentation for list of possible options."), +) +@click.option('--tracking_uri', + type=str, + default="http://localhost:5000", + help=("The ML Flow tracking URI to connect to the tracking backend. If not speficied, MF Flow will use " + "'file:///mlruns' relative to the current directory")) +def run_pipeline(train_users, + skip_user: typing.Tuple[str], + only_user: typing.Tuple[str], + duration, + cache_dir, + log_level, + sample_rate_s, + **kwargs): + # To include the generic, we must be training all or generic + include_generic = train_users == "all" or train_users == "generic" + + # To include individual, we must be either training or inferring + include_individual = train_users != "generic" + + # None indicates we arent training anything + is_training = train_users != "none" + + skip_users = list(skip_user) + only_users = list(only_user) + + # Enable the Morpheus logger + configure_logging(log_level=log_level) + + if (len(skip_users) > 0 and len(only_users) > 0): + logging.error("Option --skip_user and --only_user are mutually exclusive. Exiting") + + logger = logging.getLogger("morpheus.{}".format(__name__)) + + logger.info("Running training pipeline with the following options: ") + logger.info("Train generic_user: %s", include_generic) + logger.info("Skipping users: %s", skip_users) + logger.info("Duration: %s", duration) + logger.info("Cache Dir: %s", cache_dir) + + if ("tracking_uri" in kwargs): + # Initialize ML Flow + mlflow.set_tracking_uri(kwargs["tracking_uri"]) + logger.info("Tracking URI: %s", mlflow.get_tracking_uri()) + + config = Config() + + CppConfig.set_should_use_cpp(False) + + config.num_threads = os.cpu_count() + + config.ae = ConfigAutoEncoder() + + config.ae.feature_columns = load_labels_file(get_package_relative_file("data/columns_ae_duo.txt")) + config.ae.userid_column_name = "username" + config.ae.timestamp_column_name = "timestamp" + + # Specify the column names to ensure all data is uniform + source_column_info = [ + DateTimeColumn(name=config.ae.timestamp_column_name, dtype=datetime, input_name="timestamp"), + RenameColumn(name=config.ae.userid_column_name, dtype=str, input_name="user.name"), + RenameColumn(name="accessdevicebrowser", dtype=str, input_name="access_device.browser"), + RenameColumn(name="accessdeviceos", dtype=str, input_name="access_device.os"), + StringCatColumn(name="location", + dtype=str, + input_columns=[ + "access_device.location.city", + "access_device.location.state", + "access_device.location.country" + ], + sep=", "), + RenameColumn(name="authdevicename", dtype=str, input_name="auth_device.name"), + BoolColumn(name="result", + dtype=bool, + input_name="result", + true_values=["success", "SUCCESS"], + false_values=["denied", "DENIED", "FRAUD"]), + ColumnInfo(name="reason", dtype=str), + # CustomColumn(name="user.groups", dtype=str, process_column_fn=partial(column_listjoin, + # col_name="user.groups")) + ] + + source_schema = DataFrameInputSchema(json_columns=["access_device", "application", "auth_device", "user"], + column_info=source_column_info) + + # Preprocessing schema + preprocess_column_info = [ + ColumnInfo(name=config.ae.timestamp_column_name, dtype=datetime), + ColumnInfo(name=config.ae.userid_column_name, dtype=str), + ColumnInfo(name="accessdevicebrowser", dtype=str), + ColumnInfo(name="accessdeviceos", dtype=str), + ColumnInfo(name="authdevicename", dtype=str), + ColumnInfo(name="result", dtype=bool), + 
ColumnInfo(name="reason", dtype=str), + # Derived columns + IncrementColumn(name="logcount", + dtype=int, + input_name=config.ae.timestamp_column_name, + groupby_column=config.ae.userid_column_name), + CustomColumn(name="locincrement", + dtype=int, + process_column_fn=partial(create_increment_col, column_name="location")), + ] + + preprocess_schema = DataFrameInputSchema(column_info=preprocess_column_info, preserve_columns=["_batch_id"]) + + # Create a linear pipeline object + pipeline = LinearPipeline(config) + + pipeline.set_source(MultiFileSource(config, filenames=list(kwargs["input_file"]))) + + # Batch files into buckets by time. Use the default ISO date extractor from the filename + pipeline.add_stage( + DFPFileBatcherStage(config, + period="D", + sampling_rate_s=sample_rate_s, + date_conversion_func=functools.partial(date_extractor, filename_regex=iso_date_regex))) + + # Output is S3 Buckets. Convert to DataFrames. This caches downloaded S3 data + pipeline.add_stage( + DFPFileToDataFrameStage(config, + schema=source_schema, + file_type=FileTypes.JSON, + parser_kwargs={ + "lines": False, "orient": "records" + }, + cache_dir=cache_dir)) + + pipeline.add_stage(MonitorStage(config, description="Input data rate")) + + # This will split users or just use one single user + pipeline.add_stage( + DFPSplitUsersStage(config, + include_generic=include_generic, + include_individual=include_individual, + skip_users=skip_users, + only_users=only_users)) + + # Next, have a stage that will create rolling windows + pipeline.add_stage( + DFPRollingWindowStage( + config, + min_history=300 if is_training else 1, + min_increment=300 if is_training else 0, + # For inference, we only ever want 1 day max + max_history="60d" if is_training else "1d", + cache_dir=cache_dir)) + + # Output is UserMessageMeta -- Cached frame set + pipeline.add_stage(DFPPreprocessingStage(config, input_schema=preprocess_schema, only_new_batches=not is_training)) + + model_name_formatter = "DFP-duo-{user_id}" + experiment_name_formatter = "dfp/duo/training/{reg_model_name}" + + if (is_training): + + # Finally, perform training which will output a model + pipeline.add_stage(DFPTraining(config)) + + pipeline.add_stage(MonitorStage(config, description="Training rate", smoothing=0.001)) + + # Write that model to MLFlow + pipeline.add_stage( + DFPMLFlowModelWriterStage(config, + model_name_formatter=model_name_formatter, + experiment_name_formatter=experiment_name_formatter)) + else: + pipeline.add_stage(DFPInferenceStage(config, model_name_formatter=model_name_formatter)) + + pipeline.add_stage(MonitorStage(config, description="Inference rate", smoothing=0.001)) + + pipeline.add_stage(DFPPostprocessingStage(config, z_score_threshold=2.0)) + + pipeline.add_stage(WriteToFileStage(config, filename="dfp_detections_duo.csv", overwrite=True)) + + # Run the pipeline + pipeline.run() + + +if __name__ == "__main__": + run_pipeline(obj={}, auto_envvar_prefix='DFP', show_default=True, prog_name="dfp") diff --git a/examples/digital_fingerprinting/production/morpheus/dfp_duo_training.ipynb b/examples/digital_fingerprinting/production/morpheus/dfp_duo_training.ipynb new file mode 100644 index 0000000000..ee9cc10326 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp_duo_training.ipynb @@ -0,0 +1,642 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2941e94f-db20-44a5-ab87-2cab499825f7", + "metadata": {}, + "source": [ + "# Digital Finger Printing (DFP) with Morpheus - DUO Training\n", + "## Introduction\n", + "\n", 
+ "In this notebook, we will be building and running a DFP pipeline that performs training on Duo authentication logs. The goal is to train an autoencoder PyTorch model to recogize the patterns of users in the sample data. The model will then be used by a second Morpheus pipeline to generate anomaly scores for each individual log. These anomaly scores can be used by security teams to detect abnormal behavior when it happens so the proper action can be taken.\n", + "\n", + "
\n", + "Note: For more information on DFP, the Morpheus pipeline, and setup steps to run this notebook, please see the coresponding DFP training materials.\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b6c1cb50-74f2-445d-b865-8c22c3b3798b", + "metadata": {}, + "outputs": [], + "source": [ + "# Ensure that the morpheus directory is in the python path. This may not need to be run depending on the environment setup\n", + "import sys\n", + "import os\n", + "sys.path.insert(0, os.path.abspath(\"./morpheus\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "102ce011-3ca3-4f96-a72d-de28fad32003", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import functools\n", + "import logging\n", + "import os\n", + "import typing\n", + "from datetime import datetime\n", + "from functools import partial\n", + "\n", + "import click\n", + "import mlflow\n", + "from dfp.stages.dfp_file_batcher_stage import DFPFileBatcherStage\n", + "from dfp.stages.dfp_file_to_df import DFPFileToDataFrameStage\n", + "from dfp.stages.dfp_inference_stage import DFPInferenceStage\n", + "from dfp.stages.dfp_mlflow_model_writer import DFPMLFlowModelWriterStage\n", + "from dfp.stages.dfp_postprocessing_stage import DFPPostprocessingStage\n", + "from dfp.stages.dfp_preprocessing_stage import DFPPreprocessingStage\n", + "from dfp.stages.dfp_rolling_window_stage import DFPRollingWindowStage\n", + "from dfp.stages.dfp_split_users_stage import DFPSplitUsersStage\n", + "from dfp.stages.dfp_training import DFPTraining\n", + "from dfp.stages.multi_file_source import MultiFileSource\n", + "from dfp.utils.column_info import BoolColumn\n", + "from dfp.utils.column_info import ColumnInfo\n", + "from dfp.utils.column_info import CustomColumn\n", + "from dfp.utils.column_info import DataFrameInputSchema\n", + "from dfp.utils.column_info import DateTimeColumn\n", + "from dfp.utils.column_info import IncrementColumn\n", + "from dfp.utils.column_info import RenameColumn\n", + "from dfp.utils.column_info import StringCatColumn\n", + "from dfp.utils.column_info import create_increment_col\n", + "from dfp.utils.file_utils import date_extractor\n", + "from dfp.utils.file_utils import iso_date_regex\n", + "\n", + "from morpheus._lib.file_types import FileTypes\n", + "from morpheus.cli.utils import get_package_relative_file\n", + "from morpheus.cli.utils import load_labels_file\n", + "from morpheus.config import Config\n", + "from morpheus.config import ConfigAutoEncoder\n", + "from morpheus.config import CppConfig\n", + "from morpheus.pipeline import LinearPipeline\n", + "from morpheus.stages.general.monitor_stage import MonitorStage\n", + "from morpheus.stages.output.write_to_file_stage import WriteToFileStage\n", + "from morpheus.utils.logger import configure_logging\n", + "from morpheus.utils.logger import get_log_levels\n", + "from morpheus.utils.logger import parse_log_level\n", + "\n", + "# Left align all tables\n", + "from IPython.core.display import HTML\n", + "table_css = 'table {align:left;display:block}'\n", + "HTML(''.format(table_css))" + ] + }, + { + "cell_type": "markdown", + "id": "ca38c1b7-ce84-43e0-ac53-280562dc1642", + "metadata": {}, + "source": [ + "## High Level Configuration\n", + "\n", + "The following options significantly alter the functionality of the pipeline. These options are separated from the individual stage options since they may effect more than one stage. 
Additionally, the matching Python script for this notebook, `dfp_pipeline_duo.py`, configures these options via command-line arguments.\n", + "\n", + "### Options\n", + "\n", + "| Name | Type | Description |\n", + "| --- | --- | :-- |\n", + "| `train_users` | One of `[\"all\", \"generic\", \"individual\"]` | This indicates which users to train for this pipeline:
  • `\"generic\"`: Combine all users into a single model with the username 'generic_user'. Skips individual users.
  • `\"individual\"`: Trains a separate model for each individual user. Skips 'generic_user'.
  • `\"all\"`: Combination of `\"generic\"` and `\"individual\"`. Both the 'generic_user' and individual users are trained in the same pipeline.
|\n", + "| `skip_users` | List of strings | Any user in this list will be dropped from the pipeline. Useful for debugging to remove automated accounts with many logs. |\n", + "| `cache_dir` | string | The location to store cached files. To aid with development and reduce bandwidth, the Morpheus pipeline will cache data from several stages of the pipeline. This option configures the location for those caches. |\n", + "| `input_files` | List of strings | List of files to process. Can specificy multiple arguments for multiple files. Also accepts glob (\\*) wildcards and schema prefixes such as `s3://`. For example, to make a local cache of an s3 bucket, use `filecache::s3://mybucket/*`. See `fsspec` documentation for list of possible options. |\n", + "| `model_name_formatter` | string | A format string to use when building the model name. The model name is constructed by calling `model_name_formatter.format(user_id=user_id)`. For example, with `model_name_formatter=\"my_model-{user_id}\"` and a user ID of `\"first:last\"` would result in the model name of `\"my_model-first:last\"`. This should match the value used in `DFPMLFlowModelWriterStage`. Available keyword arguments: `user_id`, `user_md5`. |\n", + "| `experiment_name_formatter` | string | A format string (without the `f`) that will be used when creating an experiment in ML Flow. Available keyword arguments: `user_id`, `user_md5`, `reg_model_name`. |\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "9ee00703-75c5-46fc-890c-86733da906c4", + "metadata": {}, + "outputs": [], + "source": [ + "# Global options\n", + "train_users = \"all\"\n", + "\n", + "# Enter any users to skip here\n", + "skip_users: typing.List[str] = []\n", + "\n", + "# Location where cache objects will be saved\n", + "cache_dir = \"./.cache/dfp\"\n", + "\n", + "# Input files to read from\n", + "input_files = [\n", + " \"/work/examples/data/dfp/duo/duotest_pt1.json\",\n", + " \"/work/examples/data/dfp/duo/duotest_pt2.json\",\n", + " \"/work/examples/data/dfp/duo/duotest_pt3.json\",\n", + " \"/work/examples/data/dfp/duo/duotest_pt4.json\"\n", + "]\n", + "\n", + "# The format to use for models\n", + "model_name_formatter = \"DFP-duo-{user_id}\"\n", + "\n", + "# The format to use for experiment names\n", + "experiment_name_formatter = \"dfp/duo/training/{reg_model_name}\"\n", + "\n", + "# === Derived Options ===\n", + "# To include the generic, we must be training all or generic\n", + "include_generic = train_users == \"all\" or train_users == \"generic\"\n", + "\n", + "# To include individual, we must be either training or inferring\n", + "include_individual = train_users != \"generic\"\n", + "\n", + "# None indicates we arent training anything\n", + "is_training = train_users != \"none\"" + ] + }, + { + "cell_type": "markdown", + "id": "1cfc24c9-c85e-4977-a348-692c8f0aceaa", + "metadata": {}, + "source": [ + "### Global Config Object\n", + "Before creating the pipeline, we need to setup logging and set the parameters for the Morpheus config object. This config object is responsible for the following:\n", + " - Indicating whether to use C++ or Python stages\n", + " - C++ stages are not supported for the DFP pipeline. This should always be `False`\n", + " - Setting the number of threads to use in the pipeline. 
Defaults to the thread count of the OS.\n", + " - Sets the feature column names that will be used in model training\n", + " - This option allows extra columns to be used in the pipeline that will not be part of the training algorithm.\n", + " - The final features that the model will be trained on will be an intersection of this list with the log columns.\n", + " - The column name that indicates the user's unique identifier\n", + " - It is required for DFP to have a user ID column\n", + " - The column name that indicates the timestamp for the log\n", + " - It is required for DFP to know when each log occurred" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "01abd537-9162-49dc-8e83-d9465592f1d5", + "metadata": {}, + "outputs": [], + "source": [ + "# Enable the Morpheus logger\n", + "configure_logging(log_level=logging.DEBUG)\n", + "\n", + "config = Config()\n", + "\n", + "CppConfig.set_should_use_cpp(False)\n", + "\n", + "config.num_threads = os.cpu_count()\n", + "\n", + "config.ae = ConfigAutoEncoder()\n", + "\n", + "config.ae.feature_columns = [\n", + " 'accessdevicebrowser', 'accessdeviceos', 'authdevicename', 'result', 'reason', 'logcount', \"locincrement\"\n", + "]\n", + "config.ae.userid_column_name = \"username\"\n", + "config.ae.timestamp_column_name = \"timestamp\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a73a4d53-32b6-4ab8-a5d7-c0104b31c69b", + "metadata": {}, + "outputs": [], + "source": [ + "# Specify the column names to ensure all data is uniform\n", + "source_column_info = [\n", + " DateTimeColumn(name=config.ae.timestamp_column_name, dtype=datetime, input_name=\"timestamp\"),\n", + " RenameColumn(name=config.ae.userid_column_name, dtype=str, input_name=\"user.name\"),\n", + " RenameColumn(name=\"accessdevicebrowser\", dtype=str, input_name=\"access_device.browser\"),\n", + " RenameColumn(name=\"accessdeviceos\", dtype=str, input_name=\"access_device.os\"),\n", + " StringCatColumn(name=\"location\",\n", + " dtype=str,\n", + " input_columns=[\n", + " \"access_device.location.city\",\n", + " \"access_device.location.state\",\n", + " \"access_device.location.country\"\n", + " ],\n", + " sep=\", \"),\n", + " RenameColumn(name=\"authdevicename\", dtype=str, input_name=\"auth_device.name\"),\n", + " BoolColumn(name=\"result\",\n", + " dtype=bool,\n", + " input_name=\"result\",\n", + " true_values=[\"success\", \"SUCCESS\"],\n", + " false_values=[\"denied\", \"DENIED\", \"FRAUD\"]),\n", + " ColumnInfo(name=\"reason\", dtype=str),\n", + " # CustomColumn(name=\"user.groups\", dtype=str, process_column_fn=partial(column_listjoin, col_name=\"user.groups\"))\n", + "]\n", + "\n", + "source_schema = DataFrameInputSchema(json_columns=[\"access_device\", \"application\", \"auth_device\", \"user\"],\n", + " column_info=source_column_info)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f7a0cb0a-e65a-444a-a06c-a4525d543790", + "metadata": {}, + "outputs": [], + "source": [ + "# Preprocessing schema\n", + "preprocess_column_info = [\n", + " ColumnInfo(name=config.ae.timestamp_column_name, dtype=datetime),\n", + " ColumnInfo(name=config.ae.userid_column_name, dtype=str),\n", + " ColumnInfo(name=\"accessdevicebrowser\", dtype=str),\n", + " ColumnInfo(name=\"accessdeviceos\", dtype=str),\n", + " ColumnInfo(name=\"authdevicename\", dtype=str),\n", + " ColumnInfo(name=\"result\", dtype=bool),\n", + " ColumnInfo(name=\"reason\", dtype=str),\n", + " # Derived columns\n", + " IncrementColumn(name=\"logcount\",\n", + " 
dtype=int,\n", + " input_name=config.ae.timestamp_column_name,\n", + " groupby_column=config.ae.userid_column_name),\n", + " CustomColumn(name=\"locincrement\",\n", + " dtype=int,\n", + " process_column_fn=partial(create_increment_col, column_name=\"location\")),\n", + "]\n", + "\n", + "preprocess_schema = DataFrameInputSchema(column_info=preprocess_column_info, preserve_columns=[\"_batch_id\"])\n" + ] + }, + { + "cell_type": "markdown", + "id": "bdfc59de-dea8-4e5f-98e3-98eba1e4621d", + "metadata": {}, + "source": [ + "## Pipeline Construction\n", + "From this point on we begin constructing the stages that will make up the pipeline. To make testing easier, constructing the pipeline object, adding the stages, and running the pipeline, is provided as a single cell. The below cell can be rerun multiple times as needed for debugging.\n", + "\n", + "### Source Stage (`MultiFileSource`)\n", + "\n", + "This pipeline read input logs from one or more input files. This source stage will construct a list of files to be processed and pass to downstream stages. It is capable of reading files from many different source types, both local and remote. This is possible by utilizing the `fsspec` library (similar to `pandas`). See the [`fsspec`](https://filesystem-spec.readthedocs.io/) documentation for more information on the supported file types. Once all of the logs have been read, the source completes. \n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `filenames` | List of strings | | Any files to read into the pipeline. All files will be combined into a single `DataFrame` |\n", + "\n", + "### File Batcher Stage (`DFPFileBatcherStage`)\n", + "\n", + "To improve performance, multiple small input files can be batched together into a single DataFrame for processing. This stage is responsible for determining the timestamp of input files, grouping input files into batches by time, and sending the batches to be processed into a single DataFrame. Batches of files that have been seen before will be loaded from cache resulting in increased performance. For example, when performaing a 60 day training run, 59 days can be cached with a period of `\"D\"` and retraining once per day.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `period` | `str` | `\"D\"` | The period to create batches. See `pandas` windowing frequency documentation for available options. |\n", + "| `date_conversion_func` | Function of `typing.Callable[[fsspec.core.OpenFile], datetime]` | | A callback which is responsible for determining the date for a specified file. |\n", + "\n", + "### File to DataFrame Stage (`DFPFileToDataFrameStage`)\n", + "\n", + "After files have been batched into groups, this stage is responsible for reading the files and converting into a DataFrame. The specified input schema converts the raw DataFrame into one suitable for caching and processing. Any columns that are not needed should be excluded from the schema.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `schema` | `DataFrameInputSchema` | | After the raw `DataFrame` is read from each file, this schema will be applied to ensure a consisten output from the source. |\n", + "| `file_type` | `FileTypes` | `FileTypes.Auto` | Allows overriding the file type. When set to `Auto`, the file extension will be used. Options are `CSV`, `JSON`, `Auto`. 
|\n", + "| `parser_kwargs` | `dict` | `{}` | This dictionary will be passed to the `DataFrame` parser class. Allows for customization of log parsing. |\n", + "| `cache_dir` | `str` | `./.cache/dfp` | The location to write cached input files to. |\n", + "\n", + "### Split Users Stage (`DFPSplitUsersStage`)\n", + "\n", + "Once the input logs have been read into a `DataFrame`, this stage is responsible for breaking that single `DataFrame` with many users into multiple `DataFrame`s for each user. This is also where the pipeline chooses whether to train individual users or the generic user (or both).\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `include_generic` | `bool` | | Whether or not to combine all user logs into a single `DataFrame` with the username 'generic_user' |\n", + "| `include_individual` | `bool` | | Whether or not to output individual `DataFrame` objects for each user |\n", + "| `skip_users` | List of `str` | `[]` | Any users to remove from the `DataFrame`. Useful for debugging to remove automated accounts with many logs. Mutually exclusive with `only_users`. |\n", + "| `only_users` | List of `str` | `[]` | Only allow these users in the final `DataFrame`. Useful for debugging to focus on specific users. Mutually exclusive with `skip_users`. |\n", + "\n", + "### Rolling Window Stage (`DFPRollingWindowStage`)\n", + "\n", + "The Rolling Window Stage performs several key pieces of functionality for DFP.\n", + "1. This stage keeps a moving window of logs on a per user basis\n", + " 1. These logs are saved to disk to reduce memory requirements between logs from the same user\n", + "1. It only emits logs when the window history requirements are met\n", + " 1. Until all of the window history requirements are met, no messages will be sent to the rest of the pipeline.\n", + " 1. See the below options for configuring the window history requirements\n", + "1. It repeats the necessary logs to properly calculate log dependent features.\n", + " 1. To support all column feature types, incoming log messages can be combined with existing history and sent to downstream stages.\n", + " 1. For example, to calculate a feature that increments a counter for the number of logs a particular user has generated in a single day, we must have the user's log history for the past 24 hours. To support this, this stage will combine new logs with existing history into a single `DataFrame`.\n", + " 1. It is the responsibility of downstream stages to distinguish between new logs and existing history.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `min_history` | `int` | `300` | The minimum number of logs a user must have before emitting any messages. Logs below this threshold will be saved to disk. |\n", + "| `min_increment` | `int` or `str` | `300` | Once the min history requirement is met, this stage must receive `min_increment` *new* logs before emmitting another message. Logs received before this threshold is met will be saved to disk. Can be specified as an integer count or a string duration. |\n", + "| `max_history` | `int` or `str` | `\"60d\"` | Once `min_history` and `min_increment` requirements have been met, this puts an upper bound on the maximum number of messages to forward into the pipeline and also the maximum amount of messages to retain in the history. Can be specified as an integer count or a string duration. 
|\n", + "| `cache_dir` | `str` | `./.cache/dfp` | The location to write log history to disk. |\n", + "\n", + "### Preprocessing Stage (`DFPPreprocessingStage`)\n", + "\n", + "This stage performs the final, row dependent, feature calculations as specified by the input schema object. Once calculated, this stage can forward on all received logs, or optionally can only forward on new logs, removing any history information.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `input_schema` | `DataFrameInputSchema` | | The final, row dependent, schema to apply to the incoming columns |\n", + "| `only_new_batches` | `bool` | | Whether or not to foward on all received logs, or just new logs. |\n", + "\n", + "### Training Stage (`DFPTraining`)\n", + "\n", + "This stage is responsible for performing the actual training calculations. Training will be performed on all received data. Resulting message will contain the input data paired with the trained model.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `model_kwargs` | `dict` | `{}` | The options to use when creating a new model instance. See `DFPAutoEncoder` for information on the available options. |\n", + "\n", + "### MLFlow Model Writer Stage (`DFPMLFlowModelWriterStage`)\n", + "\n", + "This stage is the last step in training. It will upload the trained model from the previous stage to MLFlow. The tracking URI for which MLFlow instance to use is configured using the static method `mlflow.set_tracking_uri()`.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `model_name_formatter` | `str` | `\"\"` | A format string to use when building the model name. The model name is constructed by calling `model_name_formatter.format(user_id=user_id)`. For example, with `model_name_formatter=\"my_model-{user_id}\"` and a user ID of `\"first:last\"` would result in the model name of `\"my_model-first:last\"` |\n", + "| `experiment_name` | `str` | | All models are created inside of an experiment to allow metrics to be saved with each model. This option specifies the experiment name. 
The final experiment name for each model will be in the form of `{experiment_name}/{model_name}` |" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "825390ad-ce64-4949-b324-33039ffdf264", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "====Registering Pipeline====\u001b[0m\n", + "====Registering Pipeline Complete!====\u001b[0m\n", + "====Starting Pipeline====\u001b[0m\n", + "====Pipeline Started====\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "W20220827 06:37:12.021986 136 thread.cpp:138] unable to set memory policy - if using docker use: --cap-add=sys_nice to allow membind\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "====Building Pipeline====\u001b[0m\n", + "Added source: , input_name='access_device.browser'), RenameColumn(name='accessdeviceos', dtype=, input_name='access_device.os'), RenameColumn(name='locationcity', dtype=, input_name='auth_device.location.city'), RenameColumn(name='device', dtype=, input_name='auth_device.name'), BoolColumn(name='result', dtype=, input_name='result', value_map={'success': True, 'SUCCESS': True, 'denied': False, 'DENIED': False, 'FRAUD': False}), RenameColumn(name='reason', dtype=, input_name='reason'), RenameColumn(name='username', dtype=, input_name='user.name'), RenameColumn(name='timestamp', dtype=, input_name='timestamp')], preserve_columns=None), filenames=['/work/examples/data/dfp/duo/duotest_pt1.json', '/work/examples/data/dfp/duo/duotest_pt2.json', '/work/examples/data/dfp/duo/duotest_pt3.json', '/work/examples/data/dfp/duo/duotest_pt4.json'], file_type=FileTypes.Auto, parser_kwargs={'lines': False, 'orient': 'records'})>\n", + " └─> cudf.DataFrame\u001b[0m\n", + "Added stage: \n", + " └─ cudf.DataFrame -> dfp.DFPMessageMeta\u001b[0m\n", + "Added stage: \n", + " └─ dfp.DFPMessageMeta -> dfp.MultiDFPMessage\u001b[0m\n", + "Added stage: , input_name='accessdevicebrowser'), RenameColumn(name='accessdeviceos', dtype=, input_name='accessdeviceos'), RenameColumn(name='device', dtype=, input_name='device'), RenameColumn(name='result', dtype=, input_name='result'), RenameColumn(name='reason', dtype=, input_name='reason'), CustomColumn(name='logcount', dtype=, process_column_fn=), CustomColumn(name='locincrement', dtype=, process_column_fn=), RenameColumn(name='username', dtype=, input_name='username'), RenameColumn(name='timestamp', dtype=, input_name='timestamp')], preserve_columns=re.compile('(_batch_id)')), return_format=data, only_new_batches=False)>\n", + " └─ dfp.MultiDFPMessage -> dfp.MultiDFPMessage\u001b[0m\n", + "Added stage: \n", + " └─ dfp.MultiDFPMessage -> morpheus.MultiAEMessage\u001b[0m\n", + "Added stage: \n", + " └─ morpheus.MultiAEMessage -> morpheus.MultiAEMessage\u001b[0m\n", + "====Building Pipeline Complete!====\u001b[0m\n", + "\u001b[2mStarting! Time: 1661582232.0283692\u001b[0m\n", + "Sending 4000 rows\n", + "\u001b[2mBatch split users complete. Input: 4000 rows from 2021-09-14 00:52:14 to 2022-01-24 15:23:41. Output: 11 users, rows/user min: 176, max: 4000, avg: 727.27. Duration: 8.41 ms\u001b[0m\n", + "\u001b[2mRolling window complete for badguy in 20.79 ms. Input: 490 rows from 2021-09-14 00:52:14 to 2022-01-24 09:30:26. 
Output: 490 rows from 2021-09-14 00:52:14 to 2022-01-24 09:30:26\u001b[0m\n", + "\u001b[2mPreprocessed 490 data for logs in 2021-09-14 00:52:14 to 2022-01-24 09:30:26 in 26.08013153076172 ms\u001b[0m\n", + "\u001b[2mTraining AE model for user: 'badguy'...\u001b[0m\n", + "\u001b[2mRolling window complete for generic_user in 56.47 ms. Input: 4000 rows from 2021-09-14 00:52:14 to 2022-01-24 15:23:41. Output: 4000 rows from 2021-09-14 00:52:14 to 2022-01-24 15:23:41\u001b[0m\n", + "\u001b[2mRolling window complete for maliciousactor in 4967.55 ms. Input: 700 rows from 2021-09-14 16:58:15 to 2022-01-24 15:23:41. Output: 700 rows from 2021-09-14 16:58:15 to 2022-01-24 15:23:41\u001b[0m\n", + "\u001b[2mRolling window complete for userb in 26.85 ms. Input: 457 rows from 2021-10-07 04:02:07 to 2022-01-24 15:22:10. Output: 457 rows from 2021-10-07 04:02:07 to 2022-01-24 15:22:10\u001b[0m\n", + "\u001b[2mRolling window complete for userc in 34.86 ms. Input: 500 rows from 2021-10-06 20:29:50 to 2022-01-24 15:23:00. Output: 500 rows from 2021-10-06 20:29:50 to 2022-01-24 15:23:00\u001b[0m\n", + "\u001b[2mRolling window complete for userd in 22.95 ms. Input: 537 rows from 2021-10-07 02:25:27 to 2022-01-24 12:26:34. Output: 537 rows from 2021-10-07 02:25:27 to 2022-01-24 12:26:34\u001b[0m\n", + "\u001b[2mPreprocessed 4000 data for logs in 2021-09-14 00:52:14 to 2022-01-24 15:23:41 in 5057.710647583008 ms\u001b[0m\n", + "\u001b[2mRolling window complete for userf in 25.65 ms. Input: 304 rows from 2021-10-07 10:55:23 to 2022-01-24 14:08:00. Output: 304 rows from 2021-10-07 10:55:23 to 2022-01-24 14:08:00\u001b[0m\n", + "\u001b[2mPreprocessed 700 data for logs in 2021-09-14 16:58:15 to 2022-01-24 15:23:41 in 49.933433532714844 ms\u001b[0m\n", + "\u001b[2mPreprocessed 457 data for logs in 2021-10-07 04:02:07 to 2022-01-24 15:22:10 in 38.03825378417969 ms\u001b[0m\n", + "\u001b[2mRolling window complete for userh in 35.21 ms. Input: 346 rows from 2021-10-07 10:54:07 to 2022-01-24 15:22:10. Output: 346 rows from 2021-10-07 10:54:07 to 2022-01-24 15:22:10\u001b[0m\n", + "\u001b[2mPreprocessed 500 data for logs in 2021-10-06 20:29:50 to 2022-01-24 15:23:00 in 22.253036499023438 ms\u001b[0m\n", + "\u001b[2mPreprocessed 537 data for logs in 2021-10-07 02:25:27 to 2022-01-24 12:26:34 in 19.069433212280273 ms\u001b[0m\n", + "\u001b[2mPreprocessed 304 data for logs in 2021-10-07 10:55:23 to 2022-01-24 14:08:00 in 18.454790115356445 ms\u001b[0m\n", + "\u001b[2mPreprocessed 346 data for logs in 2021-10-07 10:54:07 to 2022-01-24 15:22:10 in 18.633365631103516 ms\u001b[0m\n", + "\u001b[2mTraining AE model for user: 'badguy'... Complete.\u001b[0m\n", + "\u001b[2mTraining AE model for user: 'generic_user'...\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022/08/27 06:37:20 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: AE-duo-badguy, version 1\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2mML Flow model upload complete. User: badguy, Version: 1\u001b[0m\n", + "\u001b[2mTraining AE model for user: 'generic_user'... Complete.\u001b[0m\n", + "\u001b[2mTraining AE model for user: 'maliciousactor'...\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022/08/27 06:37:20 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. 
Model name: AE-duo-generic_user, version 4\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2mML Flow model upload complete. User: generic_user, Version: 4\u001b[0m\n", + "\u001b[2mTraining AE model for user: 'maliciousactor'... Complete.\u001b[0m\n", + "\u001b[2mTraining AE model for user: 'userb'...\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022/08/27 06:37:21 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: AE-duo-maliciousactor, version 1\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2mML Flow model upload complete. User: maliciousactor, Version: 1\u001b[0m\n", + "\u001b[2mTraining AE model for user: 'userb'... Complete.\u001b[0m\n", + "\u001b[2mTraining AE model for user: 'userc'...\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022/08/27 06:37:23 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: AE-duo-userb, version 1\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2mML Flow model upload complete. User: userb, Version: 1\u001b[0m\n", + "\u001b[2mTraining AE model for user: 'userc'... Complete.\u001b[0m\n", + "\u001b[2mTraining AE model for user: 'userd'...\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022/08/27 06:37:23 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: AE-duo-userc, version 1\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2mML Flow model upload complete. User: userc, Version: 1\u001b[0m\n", + "\u001b[2mTraining AE model for user: 'userd'... Complete.\u001b[0m\n", + "\u001b[2mTraining AE model for user: 'userf'...\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022/08/27 06:37:24 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: AE-duo-userd, version 1\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2mML Flow model upload complete. User: userd, Version: 1\u001b[0m\n", + "\u001b[2mTraining AE model for user: 'userf'... Complete.\u001b[0m\n", + "\u001b[2mTraining AE model for user: 'userh'...\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022/08/27 06:37:26 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: AE-duo-userf, version 1\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2mML Flow model upload complete. User: userf, Version: 1\u001b[0m\n", + "\u001b[2mTraining AE model for user: 'userh'... Complete.\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022/08/27 06:37:26 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: AE-duo-userh, version 1\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2mML Flow model upload complete. 
User: userh, Version: 1\u001b[0m\n", + "====Pipeline Complete====\u001b[0m\n" + ] + } + ], + "source": [ + "# Create a linear pipeline object\n", + "pipeline = LinearPipeline(config)\n", + "\n", + "# Source stage\n", + "pipeline.set_source(MultiFileSource(config, filenames=input_files))\n", + "\n", + "# Batch files into buckets by time. Use the default ISO date extractor from the filename\n", + "pipeline.add_stage(\n", + " DFPFileBatcherStage(config,\n", + " period=\"D\",\n", + " date_conversion_func=functools.partial(date_extractor, filename_regex=iso_date_regex)))\n", + "\n", + "# Output is S3 Buckets. Convert to DataFrames. This caches downloaded S3 data\n", + "pipeline.add_stage(\n", + " DFPFileToDataFrameStage(config,\n", + " schema=source_schema,\n", + " file_type=FileTypes.JSON,\n", + " parser_kwargs={\n", + " \"lines\": False, \"orient\": \"records\"\n", + " },\n", + " cache_dir=cache_dir))\n", + "\n", + "\n", + "# This will split users or just use one single user\n", + "pipeline.add_stage(\n", + " DFPSplitUsersStage(config,\n", + " include_generic=include_generic,\n", + " include_individual=include_individual,\n", + " skip_users=skip_users))\n", + "\n", + "# Next, have a stage that will create rolling windows\n", + "pipeline.add_stage(\n", + " DFPRollingWindowStage(\n", + " config,\n", + " min_history=300 if is_training else 1,\n", + " min_increment=300 if is_training else 0,\n", + " # For inference, we only ever want 1 day max\n", + " max_history=\"60d\" if is_training else \"1d\",\n", + " cache_dir=cache_dir))\n", + "\n", + "# Output is UserMessageMeta -- Cached frame set\n", + "pipeline.add_stage(DFPPreprocessingStage(config, input_schema=preprocess_schema, only_new_batches=not is_training))\n", + "\n", + "# Finally, perform training which will output a model\n", + "pipeline.add_stage(DFPTraining(config))\n", + "\n", + "# Write that model to MLFlow\n", + "pipeline.add_stage(\n", + " DFPMLFlowModelWriterStage(config,\n", + " model_name_formatter=model_name_formatter,\n", + " experiment_name_formatter=experiment_name_formatter))\n", + "\n", + "# Run the pipeline\n", + "await pipeline._do_run()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:morpheus] *", + "language": "python", + "name": "conda-env-morpheus-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + }, + "vscode": { + "interpreter": { + "hash": "e26783b24f020aa0bcaa00e6ba122db5d0e3da2d892d80be664969895e06a7e1" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/digital_fingerprinting/production/morpheus/launch.sh b/examples/digital_fingerprinting/production/morpheus/launch.sh new file mode 100755 index 0000000000..643d475259 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/launch.sh @@ -0,0 +1,21 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Run the tool to get AWS credentials +source ./get_aws_credentials.sh + +# Run the training forwarding any args +python dfp_pipeline_duo.py "$@" diff --git a/examples/digital_fingerprinting/starter/README.md b/examples/digital_fingerprinting/starter/README.md new file mode 100644 index 0000000000..cc12234388 --- /dev/null +++ b/examples/digital_fingerprinting/starter/README.md @@ -0,0 +1,293 @@ + + +# "Starter" Digital Fingerprinting Pipeline + +We show here how to set up and run the DFP pipeline for three log types: CloudTrail, Duo and Azure. Each of these log types uses a built-in source stage that handles that specific data format. New source stages can be added to allow the DFP pipeline to process different log types. All stages after the source stages are identical across all log types but can be configured differently via pipeline or stage configuration options. + +## Environment Setup + +Follow the instructions [here](https://github.com/nv-morpheus/Morpheus/blob/branch-22.09/CONTRIBUTING.md) to set up your development environment in either a Docker container or conda environment. + +## Morpheus CLI + +DFP pipelines can be constructed and run using the Morpheus CLI command `morpheus run pipeline-ae ...` + +Use `--help` to display information about the autoencoder pipeline command line options: + +``` +morpheus run pipeline-ae --help + +Usage: morpheus run pipeline-ae [OPTIONS] COMMAND1 [ARGS]... [COMMAND2 + [ARGS]...]... + + Configure and run the pipeline. To configure the pipeline, list the stages + in the order that data should flow. The output of each stage will become the + input for the next stage. For example, to read, classify and write to a + file, the following stages could be used + + pipeline from-file --filename=my_dataset.json deserialize preprocess inf-triton --model_name=my_model + --server_url=localhost:8001 filter --threshold=0.5 to-file --filename=classifications.json + + Pipelines must follow a few rules: + 1. Data must originate in a source stage. Current options are `from-file` or `from-kafka` + 2. A `deserialize` stage must be placed between the source stages and the rest of the pipeline + 3. Only one inference stage can be used. Zero is also fine + 4. The following stages must come after an inference stage: `add-class`, `filter`, `gen-viz` + +Options: + --columns_file FILE [default: /my_data/gitrepos/efajardo-nv/Morp + heus/morpheus/data/columns_ae_cloudtrail.txt + ] + --labels_file FILE Specifies a file to read labels from in + order to convert class IDs into labels. A + label file is a simple text file where each + line corresponds to a label. If unspecified, + only a single output label is created for + FIL + --userid_column_name TEXT Which column to use as the User ID. + [default: userIdentityaccountId; required] + --userid_filter TEXT Specifying this value will filter all + incoming data to only use rows with matching + User IDs. 
Which column is used for the User
+                                 ID is specified by `userid_column_name`
+  --feature_scaler TEXT          Autoencoder feature scaler  [default:
+                                 standard]
+  --use_generic_model BOOLEAN    Whether to use a generic model when user does
+                                 not have minimum number of training rows
+                                 [default: False]
+  --viz_file FILE                Save a visualization of the pipeline at the
+                                 specified location
+  --help                         Show this message and exit.
+
+Commands:
+  add-class        Add detected classifications to each message
+  add-scores       Add probability scores to each message
+  buffer           (Deprecated) Buffer results
+  delay            (Deprecated) Delay results for a certain duration
+  filter           Filter message by a classification threshold
+  from-azure       Load messages from an Azure directory
+  from-cloudtrail  Load messages from a CloudTrail directory
+  from-duo         Load messages from a Duo directory
+  gen-viz          (Deprecated) Write out visualization data frames
+  inf-pytorch      Perform inference with PyTorch
+  inf-triton       Perform inference with Triton
+  monitor          Display throughput numbers at a specific point in the
+                   pipeline
+  preprocess       Convert messages to tokens
+  serialize        Include & exclude columns from messages
+  timeseries       Perform time series anomaly detection and add prediction.
+  to-file          Write all messages to a file
+  to-kafka         Write all messages to a Kafka cluster
+  train-ae         Deserialize source data from JSON
+  validate         Validates pipeline output against an expected output
+```
+The commands above correspond to the Morpheus stages that can be used to construct your DFP pipeline. Options are available to configure the pipeline and individual stages.
+The following table shows the mapping between the main Morpheus CLI commands and the underlying Morpheus Python stage classes:
+
+| CLI Command    | Stage Class               | Python File |
+| ---------------| -------------------------| ---------------------------------------------------------
+| from-azure     | AzureSourceStage          | morpheus/stages/input/azure_source_stage.py
+| from-cloudtrail| CloudTrailSourceStage     | morpheus/stages/input/cloud_trail_source_stage.py
+| from-duo       | DuoSourceStage            | morpheus/stages/input/duo_source_stage.py
+| train-ae       | TrainAEStage              | morpheus/stages/preprocess/train_ae_stage.py
+| preprocess     | PreprocessAEStage         | morpheus/stages/preprocess/preprocess_ae_stage.py
+| inf-pytorch    | AutoEncoderInferenceStage | morpheus/stages/inference/auto_encoder_inference_stage.py
+| add-scores     | AddScoresStage            | morpheus/stages/postprocess/add_scores_stage.py
+| serialize      | SerializeStage            | morpheus/stages/postprocess/serialize_stage.py
+| to-file        | WriteToFileStage          | morpheus/stages/output/write_to_file_stage.py
+
+
+## Morpheus DFP Stages
+
+**Source stages** - These include `AzureSourceStage`, `CloudTrailSourceStage` and `DuoSourceStage`. They are responsible for reading the log file(s) that match the provided `--input_glob` (e.g. `/duo_logs/*.json`). Data is grouped by user so that each batch processed by the pipeline only contains rows belonging to a single user. Feature engineering also happens in this stage. All DFP source stages must extend `AutoencoderSourceStage` and implement the `files_to_dfs_per_user` abstract method. Feature columns can be managed by overriding the `derive_features` method; otherwise, all columns from the input data pass through unchanged to the next stage. A minimal skeleton of a custom source stage is sketched below.
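+
+To support a new log type, you would add a source stage along these lines. This is only a rough sketch: the `MyLogSourceStage` name is hypothetical, the JSON-lines reading is an assumption about the new log format, and the import path, method signatures and the `name`/`supports_cpp_node` overrides are modeled on the built-in source stages. Check `AutoencoderSourceStage` for the authoritative interface.
+
+```python
+import typing
+
+import pandas as pd
+
+from morpheus.stages.input.autoencoder_source_stage import AutoencoderSourceStage
+
+
+class MyLogSourceStage(AutoencoderSourceStage):
+    """Hypothetical source stage for a new log type (sketch only)."""
+
+    @property
+    def name(self) -> str:
+        return "from-my-log"
+
+    def supports_cpp_node(self):
+        # Pure-Python stage
+        return False
+
+    @staticmethod
+    def files_to_dfs_per_user(x: typing.List[str],
+                              userid_column_name: str,
+                              feature_columns: typing.List[str],
+                              userid_filter: str = None,
+                              repeat_count: int = 1) -> typing.Dict[str, pd.DataFrame]:
+        # Read each matched log file (JSON lines assumed here) into one DataFrame
+        df = pd.concat([pd.read_json(f, lines=True) for f in x], ignore_index=True)
+
+        # Optionally restrict to a single user, mirroring --userid_filter
+        if userid_filter is not None:
+            df = df[df[userid_column_name] == userid_filter]
+
+        # Group rows by user ID so each batch only contains one user's data
+        return {user_id: user_df for user_id, user_df in df.groupby(userid_column_name)}
+
+    @staticmethod
+    def derive_features(df: pd.DataFrame, feature_columns: typing.List[str]) -> pd.DataFrame:
+        # Feature engineering goes here; without an override, all input columns
+        # pass through to the next stage unchanged
+        return df
+```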
+
+**Preprocessing stages**
+
+`TrainAEStage` can either train user models using data matching a provided `--train_data_glob` or load pre-trained models from file using `--pretrained_filename`. When using `--train_data_glob`, user models can be saved using the `--models_output_filename` option. The `--source_stage_class` must also be used with `--train_data_glob` so that the training stage knows how to read the training data. The autoencoder implementation from this [fork](https://github.com/efajardo-nv/dfencoder/tree/morpheus-22.08) is used for user model training. The following are the available CLI options for the `TrainAEStage` (train-ae):
+
+| Option                | Description
+| ----------------------| ---------------------------------------------------------
+| pretrained_filename   | File path to pickled user models saved from a previous training run using `--models_output_filename`.
+| train_data_glob       | Glob path to training data.
+| source_stage_class    | Source stage class so that the training stage knows how to read/parse the training data.
+| train_epochs          | Number of training epochs. Default is 25.
+| min_train_rows        | Minimum number of training rows required to train a user model. Default is 300.
+| train_max_history     | Maximum number of training rows per user. Default is 1000.
+| seed                  | When not None, ensures random number generators are seeded with `seed` to control reproducibility of the user model.
+| sort_glob             | If true, the list of files matching `input_glob` will be processed in sorted order. Default is False.
+| models_output_filename| Can be used with `--train_data_glob` to save trained user models to the provided file path. Models can be loaded later using `--pretrained_filename`.
+
+The `PreprocessAEStage` is responsible for creating a Morpheus message that contains everything needed by the inference stage. For DFP inference, this stage must pass a `MultiInferenceAEMessage` to the inference stage. Each message will correspond to a single user and include the input feature columns, the user's model and the training data anomaly scores.
+
+**Inference stage** - `AutoEncoderInferenceStage` calculates anomaly scores (i.e. reconstruction loss) and z-scores for each user input dataset.
+
+**Postprocessing stage** - The DFP pipeline uses the `AddScoresStage` to attach the anomaly scores and z-scores produced by the inference stage to each message, under the matching class labels.
+
+**Serialize stage** - `SerializeStage` converts the `MultiResponseProbsMessage` from the previous stage to a `MessageMeta` so it is suitable for output (i.e. writing to a file or Kafka).
+
+**Write stage** - `WriteToFileStage` writes the input data, along with the inference results, to an output file path.
+
+
+## CloudTrail DFP Pipeline
+
+Run the following in your Morpheus container to start the CloudTrail DFP pipeline:
+
+```
+morpheus --log_level=DEBUG \
+run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=False \
+pipeline-ae \
+--columns_file=morpheus/data/columns_ae_cloudtrail.txt \
+--userid_column_name=userIdentitysessionContextsessionIssueruserName \
+--userid_filter=user123 \
+--feature_scaler=standard \
+from-cloudtrail \
+--input_glob=models/datasets/validation-data/dfp-cloudtrail-*-input.csv \
+--max_files=200 \
+train-ae \
+--train_data_glob=models/datasets/training-data/dfp-cloudtrail-*.csv \
+--source_stage_class=morpheus.stages.input.cloud_trail_source_stage.CloudTrailSourceStage \
+--seed=42 \
+preprocess \
+inf-pytorch \
+add-scores \
+serialize \
+to-file --filename=./cloudtrail-dfp-detections.csv --overwrite
+```
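+
+Once the run completes, the detections file can be inspected with pandas as a quick sanity check. The column names below are an assumption based on the default AE class labels (`reconstruct_loss`, `zscore`); adjust them if your configuration differs.
+
+```python
+import pandas as pd
+
+# Load the detections written by the to-file stage
+df = pd.read_csv("./cloudtrail-dfp-detections.csv")
+
+# Summarize the anomaly scores (column names assume the default class labels)
+print(df[["reconstruct_loss", "zscore"]].describe())
+
+# Show the rows with the highest anomaly z-scores
+print(df.sort_values("zscore", ascending=False).head(10))
+```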
+
+## Duo DFP Pipeline
+
+The following pipeline first trains user models from the files in `models/datasets/training-data/duo` and saves them to file. It then uses these models to run inference on the validation data in `models/datasets/validation-data/duo`. Inference results are written to `duo-detections.csv`.
+```
+morpheus --log_level=DEBUG \
+run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=False \
+pipeline-ae \
+--columns_file=morpheus/data/columns_ae_duo.txt \
+--userid_column_name=username \
+--feature_scaler=standard \
+from-duo \
+--input_glob=models/datasets/validation-data/duo/*.json \
+--max_files=200 \
+monitor --description='Input rate' \
+train-ae \
+--train_data_glob=models/datasets/training-data/duo/*.json \
+--source_stage_class=morpheus.stages.input.duo_source_stage.DuoSourceStage \
+--seed=42 \
+--train_epochs=1 \
+--models_output_filename=models/dfp-models/duo_ae_user_models.pkl \
+preprocess \
+inf-pytorch \
+monitor --description='Inference rate' --unit inf \
+add-scores \
+serialize \
+to-file --filename=./duo-detections.csv --overwrite
+```
+
+The following example shows how to load the pre-trained user models from the file (`models/dfp-models/duo_ae_user_models.pkl`) created in the previous example. The pipeline then uses these models to run inference on the validation data in `models/datasets/validation-data/duo`. Inference results are written to `duo-detections.csv`.
+```
+morpheus --log_level=DEBUG \
+run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=False \
+pipeline-ae \
+--columns_file=morpheus/data/columns_ae_duo.txt \
+--userid_column_name=username \
+--feature_scaler=standard \
+from-duo \
+--input_glob=models/datasets/validation-data/duo/*.json \
+--max_files=200 \
+monitor --description='Input rate' \
+train-ae \
+--pretrained_filename=models/dfp-models/duo_ae_user_models.pkl \
+preprocess \
+inf-pytorch \
+monitor --description='Inference rate' --unit inf \
+add-scores \
+serialize \
+to-file --filename=./duo-detections.csv --overwrite
+```
+
+## Azure DFP Pipeline
+
+The following pipeline first trains user models from the files in `models/datasets/training-data/azure` and saves them to file. It then uses these models to run inference on the validation data in `models/datasets/validation-data/azure`. Inference results are written to `azure-detections.csv`.
+```
+morpheus --log_level=DEBUG \
+run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=False \
+pipeline-ae \
+--columns_file=morpheus/data/columns_ae_azure.txt \
+--userid_column_name=userPrincipalName \
+--feature_scaler=standard \
+from-azure \
+--input_glob=models/datasets/validation-data/azure/*.json \
+--max_files=200 \
+train-ae \
+--train_data_glob=models/datasets/training-data/azure/*.json \
+--source_stage_class=morpheus.stages.input.azure_source_stage.AzureSourceStage \
+--seed=42 \
+--models_output_filename=models/dfp-models/azure_ae_user_models.pkl \
+preprocess \
+inf-pytorch \
+monitor --description='Inference rate' --unit inf \
+add-scores \
+serialize \
+to-file --filename=./azure-detections.csv --overwrite
+```
+
+The following example shows how to load the pre-trained user models from the file (`models/dfp-models/azure_ae_user_models.pkl`) created in the previous example. The pipeline then uses these models to run inference on the validation data in `models/datasets/validation-data/azure`. Inference results are written to `azure-detections.csv`.
+
+```
+morpheus --log_level=DEBUG \
+run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=False \
+pipeline-ae \
+--columns_file=morpheus/data/columns_ae_azure.txt \
+--userid_column_name=userPrincipalName \
+--feature_scaler=standard \
+from-azure \
+--input_glob=models/datasets/validation-data/azure/*.json \
+--max_files=200 \
+train-ae \
+--pretrained_filename=models/dfp-models/azure_ae_user_models.pkl \
+preprocess \
+inf-pytorch \
+monitor --description='Inference rate' --unit inf \
+add-scores \
+serialize \
+to-file --filename=./azure-detections.csv --overwrite
+```
+
+
+## Using Morpheus Python API
+
+The DFP pipelines can also be constructed and run via the Morpheus Python API. An [example](./run_cloudtrail_dfp.py) is included for the CloudTrail DFP pipeline. The following commands run the example.
+
+This command trains user models from the files in `models/datasets/training-data/dfp-cloudtrail-*.csv` and saves them to file. The pipeline then uses these models to run inference on the CloudTrail validation data in `models/datasets/validation-data/dfp-cloudtrail-*-input.csv`. Inference results are written to `cloudtrail-dfp-results.csv`.
+```
+python ./examples/digital_fingerprinting/starter/run_cloudtrail_dfp.py \
+    --columns_file=morpheus/data/columns_ae_cloudtrail.txt \
+    --input_glob=models/datasets/validation-data/dfp-cloudtrail-*-input.csv \
+    --train_data_glob=models/datasets/training-data/dfp-*.csv \
+    --models_output_filename=models/dfp-models/cloudtrail_ae_user_models.pkl \
+    --output_file ./cloudtrail-dfp-results.csv
+```
+
+Here we load the pre-trained user models from the file (`models/dfp-models/cloudtrail_ae_user_models.pkl`) created in the previous example. The pipeline then uses these models to run inference on the validation data in `models/datasets/validation-data/dfp-cloudtrail-*-input.csv`. Inference results are written to `cloudtrail-dfp-results.csv`.
+```
+python ./examples/digital_fingerprinting/starter/run_cloudtrail_dfp.py \
+    --columns_file=morpheus/data/columns_ae_cloudtrail.txt \
+    --input_glob=models/datasets/validation-data/dfp-cloudtrail-*-input.csv \
+    --pretrained_filename=models/dfp-models/cloudtrail_ae_user_models.pkl \
+    --output_file=./cloudtrail-dfp-results.csv
+```
diff --git a/examples/digital_fingerprinting/starter/run_cloudtrail_dfp.py b/examples/digital_fingerprinting/starter/run_cloudtrail_dfp.py
new file mode 100644
index 0000000000..783a586c37
--- /dev/null
+++ b/examples/digital_fingerprinting/starter/run_cloudtrail_dfp.py
@@ -0,0 +1,156 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
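+
+# This example builds the starter CloudTrail DFP pipeline with the Morpheus
+# Python API (see examples/digital_fingerprinting/starter/README.md).
+# Stage order: CloudTrailSourceStage -> TrainAEStage -> PreprocessAEStage ->
+# AutoEncoderInferenceStage -> AddScoresStage -> SerializeStage ->
+# WriteToFileStage -> MonitorStage.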
+ +import logging +import os + +import click + +from morpheus.config import AEFeatureScalar +from morpheus.config import Config +from morpheus.config import ConfigAutoEncoder +from morpheus.config import CppConfig +from morpheus.config import PipelineModes +from morpheus.pipeline import LinearPipeline +from morpheus.stages.general.monitor_stage import MonitorStage +from morpheus.stages.inference.auto_encoder_inference_stage import AutoEncoderInferenceStage +from morpheus.stages.input.cloud_trail_source_stage import CloudTrailSourceStage +from morpheus.stages.output.write_to_file_stage import WriteToFileStage +from morpheus.stages.postprocess.add_scores_stage import AddScoresStage +from morpheus.stages.postprocess.serialize_stage import SerializeStage +from morpheus.stages.preprocess.preprocess_ae_stage import PreprocessAEStage +from morpheus.stages.preprocess.train_ae_stage import TrainAEStage +from morpheus.utils.logger import configure_logging + + +@click.command() +@click.option( + "--num_threads", + default=os.cpu_count(), + type=click.IntRange(min=1), + help="Number of internal pipeline threads to use", +) +@click.option( + "--pipeline_batch_size", + default=1024, + type=click.IntRange(min=1), + help=("Internal batch size for the pipeline. Can be much larger than the model batch size. " + "Also used for Kafka consumers"), +) +@click.option( + "--model_max_batch_size", + default=1024, + type=click.IntRange(min=1), + help="Max batch size to use for the model", +) +@click.option( + "--columns_file", + type=click.Path(exists=True, readable=True), + required=True, + help="Feature columns file", +) +@click.option( + "--input_glob", + type=str, + required=True, + help="Inference input glob", +) +@click.option( + "--train_data_glob", + type=str, + required=False, + help="Train data glob", +) +@click.option( + "--pretrained_filename", + type=click.Path(exists=True, readable=True), + required=False, + help="File with pre-trained user models", +) +@click.option( + "--models_output_filename", + help="The path to the file where the inference output will be saved.", +) +@click.option( + "--output_file", + default="./cloudtrail-detections.csv", + help="The path to the file where the inference output will be saved.", +) +def run_pipeline(num_threads, + pipeline_batch_size, + model_max_batch_size, + columns_file, + input_glob, + train_data_glob, + pretrained_filename, + models_output_filename, + output_file): + + configure_logging(log_level=logging.DEBUG) + + CppConfig.set_should_use_cpp(False) + + config = Config() + config.mode = PipelineModes.AE + config.ae = ConfigAutoEncoder() + config.ae.userid_column_name = "userIdentityaccountId" + config.ae.feature_scaler = AEFeatureScalar.STANDARD + + with open(columns_file, "r") as lf: + config.ae.feature_columns = [x.strip() for x in lf.readlines()] + + config.num_threads = num_threads + config.pipeline_batch_size = pipeline_batch_size + config.model_max_batch_size = model_max_batch_size + config.class_labels = ["reconstruct_loss", "zscore"] + + # Create a pipeline object + pipeline = LinearPipeline(config) + + # Add a source stage + pipeline.set_source(CloudTrailSourceStage(config, input_glob=input_glob)) + + # Add a training stage + pipeline.add_stage( + TrainAEStage(config, + pretrained_filename=pretrained_filename, + train_data_glob=train_data_glob, + source_stage_class="morpheus.stages.input.cloud_trail_source_stage.CloudTrailSourceStage", + models_output_filename=models_output_filename, + seed=42, + sort_glob=True)) + + # Add a preprocessing stage + 
pipeline.add_stage(PreprocessAEStage(config)) + + # Add a inference stage + pipeline.add_stage(AutoEncoderInferenceStage(config)) + + # Add anomaly scores and z-scores to each message + pipeline.add_stage(AddScoresStage(config)) + + # Add serialize stage + pipeline.add_stage(SerializeStage(config)) + + # Add a write file stage + pipeline.add_stage(WriteToFileStage(config, filename=output_file, overwrite=True)) + + pipeline.add_stage(MonitorStage(config, description="Postprocessing rate")) + + # Run the pipeline + pipeline.run() + + +if __name__ == "__main__": + run_pipeline() diff --git a/examples/digital_fingerprinting/starter/run_duo_dfp.py b/examples/digital_fingerprinting/starter/run_duo_dfp.py new file mode 100644 index 0000000000..783a586c37 --- /dev/null +++ b/examples/digital_fingerprinting/starter/run_duo_dfp.py @@ -0,0 +1,156 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os + +import click + +from morpheus.config import AEFeatureScalar +from morpheus.config import Config +from morpheus.config import ConfigAutoEncoder +from morpheus.config import CppConfig +from morpheus.config import PipelineModes +from morpheus.pipeline import LinearPipeline +from morpheus.stages.general.monitor_stage import MonitorStage +from morpheus.stages.inference.auto_encoder_inference_stage import AutoEncoderInferenceStage +from morpheus.stages.input.cloud_trail_source_stage import CloudTrailSourceStage +from morpheus.stages.output.write_to_file_stage import WriteToFileStage +from morpheus.stages.postprocess.add_scores_stage import AddScoresStage +from morpheus.stages.postprocess.serialize_stage import SerializeStage +from morpheus.stages.preprocess.preprocess_ae_stage import PreprocessAEStage +from morpheus.stages.preprocess.train_ae_stage import TrainAEStage +from morpheus.utils.logger import configure_logging + + +@click.command() +@click.option( + "--num_threads", + default=os.cpu_count(), + type=click.IntRange(min=1), + help="Number of internal pipeline threads to use", +) +@click.option( + "--pipeline_batch_size", + default=1024, + type=click.IntRange(min=1), + help=("Internal batch size for the pipeline. Can be much larger than the model batch size. 
" + "Also used for Kafka consumers"), +) +@click.option( + "--model_max_batch_size", + default=1024, + type=click.IntRange(min=1), + help="Max batch size to use for the model", +) +@click.option( + "--columns_file", + type=click.Path(exists=True, readable=True), + required=True, + help="Feature columns file", +) +@click.option( + "--input_glob", + type=str, + required=True, + help="Inference input glob", +) +@click.option( + "--train_data_glob", + type=str, + required=False, + help="Train data glob", +) +@click.option( + "--pretrained_filename", + type=click.Path(exists=True, readable=True), + required=False, + help="File with pre-trained user models", +) +@click.option( + "--models_output_filename", + help="The path to the file where the inference output will be saved.", +) +@click.option( + "--output_file", + default="./cloudtrail-detections.csv", + help="The path to the file where the inference output will be saved.", +) +def run_pipeline(num_threads, + pipeline_batch_size, + model_max_batch_size, + columns_file, + input_glob, + train_data_glob, + pretrained_filename, + models_output_filename, + output_file): + + configure_logging(log_level=logging.DEBUG) + + CppConfig.set_should_use_cpp(False) + + config = Config() + config.mode = PipelineModes.AE + config.ae = ConfigAutoEncoder() + config.ae.userid_column_name = "userIdentityaccountId" + config.ae.feature_scaler = AEFeatureScalar.STANDARD + + with open(columns_file, "r") as lf: + config.ae.feature_columns = [x.strip() for x in lf.readlines()] + + config.num_threads = num_threads + config.pipeline_batch_size = pipeline_batch_size + config.model_max_batch_size = model_max_batch_size + config.class_labels = ["reconstruct_loss", "zscore"] + + # Create a pipeline object + pipeline = LinearPipeline(config) + + # Add a source stage + pipeline.set_source(CloudTrailSourceStage(config, input_glob=input_glob)) + + # Add a training stage + pipeline.add_stage( + TrainAEStage(config, + pretrained_filename=pretrained_filename, + train_data_glob=train_data_glob, + source_stage_class="morpheus.stages.input.cloud_trail_source_stage.CloudTrailSourceStage", + models_output_filename=models_output_filename, + seed=42, + sort_glob=True)) + + # Add a preprocessing stage + pipeline.add_stage(PreprocessAEStage(config)) + + # Add a inference stage + pipeline.add_stage(AutoEncoderInferenceStage(config)) + + # Add anomaly scores and z-scores to each message + pipeline.add_stage(AddScoresStage(config)) + + # Add serialize stage + pipeline.add_stage(SerializeStage(config)) + + # Add a write file stage + pipeline.add_stage(WriteToFileStage(config, filename=output_file, overwrite=True)) + + pipeline.add_stage(MonitorStage(config, description="Postprocessing rate")) + + # Run the pipeline + pipeline.run() + + +if __name__ == "__main__": + run_pipeline() diff --git a/morpheus/cli/commands.py b/morpheus/cli/commands.py index c3050c2e7c..46c7d6ca26 100644 --- a/morpheus/cli/commands.py +++ b/morpheus/cli/commands.py @@ -27,6 +27,7 @@ from morpheus.cli.utils import get_enum_values from morpheus.cli.utils import get_log_levels from morpheus.cli.utils import get_pipeline_from_ctx +from morpheus.cli.utils import load_labels_file from morpheus.cli.utils import parse_enum from morpheus.cli.utils import parse_log_level from morpheus.cli.utils import prepare_command @@ -340,9 +341,8 @@ def pipeline_nlp(ctx: click.Context, **kwargs): if len(labels): config.class_labels = list(labels) else: - with open(kwargs["labels_file"], "r") as lf: - config.class_labels = [x.strip() for x in 
lf.readlines()] - logger.debug("Loaded labels file. Current labels: [%s]", str(config.class_labels)) + config.class_labels = load_labels_file(kwargs["labels_file"]) + logger.debug("Loaded labels file. Current labels: [%s]", str(config.class_labels)) from morpheus.pipeline import LinearPipeline @@ -409,16 +409,14 @@ def pipeline_fil(ctx: click.Context, **kwargs): labels_file = kwargs.get("labels_file") if (labels_file is not None): - with open(labels_file, "r") as lf: - config.class_labels = [x.strip() for x in lf.readlines()] - logger.debug("Loaded labels file. Current labels: [%s]", str(config.class_labels)) + config.class_labels = load_labels_file(labels_file) + logger.debug("Loaded labels file. Current labels: [%s]", str(config.class_labels)) else: config.class_labels = list(kwargs['label']) if ("columns_file" in kwargs and kwargs["columns_file"] is not None): - with open(kwargs["columns_file"], "r") as lf: - config.fil.feature_columns = [x.strip() for x in lf.readlines()] - logger.debug("Loaded columns. Current columns: [%s]", str(config.fil.feature_columns)) + config.fil.feature_columns = load_labels_file(kwargs["columns_file"]) + logger.debug("Loaded columns. Current columns: [%s]", str(config.fil.feature_columns)) else: raise ValueError('Unable to find columns file') @@ -432,8 +430,7 @@ def pipeline_fil(ctx: click.Context, **kwargs): @click.group(chain=True, short_help="Run the inference pipeline with an AutoEncoder model", no_args_is_help=True, - cls=PluginGroup, - pipeline_mode=PipelineModes.AE) + cls=PluginGroup) @click.option('--columns_file', required=True, type=MorpheusRelativePath(dir_okay=False, exists=True, file_okay=True, resolve_path=True), @@ -502,17 +499,15 @@ def pipeline_ae(ctx: click.Context, **kwargs): config.ae.use_generic_model = kwargs["use_generic_model"] if ("columns_file" in kwargs and kwargs["columns_file"] is not None): - with open(kwargs["columns_file"], "r") as lf: - config.ae.feature_columns = [x.strip() for x in lf.readlines()] - logger.debug("Loaded columns. Current columns: [%s]", str(config.ae.feature_columns)) + config.ae.feature_columns = load_labels_file(kwargs["columns_file"]) + logger.debug("Loaded columns. Current columns: [%s]", str(config.ae.feature_columns)) else: # Use a default single label config.class_labels = ["reconstruct_loss", "zscore"] if ("labels_file" in kwargs and kwargs["labels_file"] is not None): - with open(kwargs["labels_file"], "r") as lf: - config.class_labels = [x.strip() for x in lf.readlines()] - logger.debug("Loaded labels file. Current labels: [%s]", str(config.class_labels)) + config.class_labels = load_labels_file(kwargs["labels_file"]) + logger.debug("Loaded labels file. Current labels: [%s]", str(config.class_labels)) else: # Use a default single label config.class_labels = ["reconstruct_loss", "zscore"] @@ -582,9 +577,8 @@ def pipeline_other(ctx: click.Context, **kwargs): labels_file = kwargs.get("labels_file") if (labels_file is not None): - with open(labels_file, "r") as lf: - config.class_labels = [x.strip() for x in lf.readlines()] - logger.debug("Loaded labels file. Current labels: [%s]", str(config.class_labels)) + config.class_labels = load_labels_file(labels_file) + logger.debug("Loaded labels file. 
Current labels: [%s]", str(config.class_labels)) else: labels = kwargs["label"] if len(labels): diff --git a/morpheus/cli/register_stage.py b/morpheus/cli/register_stage.py index 7d273a9ef7..45e3821c61 100644 --- a/morpheus/cli/register_stage.py +++ b/morpheus/cli/register_stage.py @@ -381,14 +381,6 @@ def command_callback(ctx: click.Context, **kwargs): # Not registered, add to global registry GlobalStageRegistry.get().add_stage_info(stage_info) - import sys - import weakref - - def unregister_command(): - GlobalStageRegistry.get().remove_stage_info(stage_info) - - weakref.finalize(sys.modules[stage_class.__module__], unregister_command) - return stage_class return register_stage_inner diff --git a/morpheus/cli/utils.py b/morpheus/cli/utils.py index 889f7524bb..d36654ae7f 100644 --- a/morpheus/cli/utils.py +++ b/morpheus/cli/utils.py @@ -184,6 +184,32 @@ def parse_enum(_: click.Context, _2: click.Parameter, value: str, enum_class: ty return result +def load_labels_file(labels_file: str) -> typing.List[str]: + with open(labels_file, "r") as lf: + return [x.strip() for x in lf.readlines()] + + +def get_package_relative_file(filename: str): + # First check if the path is relative + if (not os.path.isabs(filename)): + + # See if the file exists. + does_exist = os.path.exists(filename) + + if (not does_exist): + # If it doesnt exist, then try to make it relative to the morpheus library root + morpheus_root = os.path.dirname(morpheus.__file__) + + value_abs_to_root = os.path.join(morpheus_root, filename) + + # If the file relative to our package exists, use that instead + if (os.path.exists(value_abs_to_root)): + + return value_abs_to_root + + return filename + + class MorpheusRelativePath(click.Path): """ A specialization of the `click.Path` class that falls back to using package relative paths if the file cannot be @@ -201,26 +227,13 @@ def convert(self, param: typing.Optional["click.Parameter"], ctx: typing.Optional["click.Context"]) -> typing.Any: - # First check if the path is relative - if (not os.path.isabs(value)): - - # See if the file exists. - does_exist = os.path.exists(value) - - if (not does_exist): - # If it doesnt exist, then try to make it relative to the morpheus library root - morpheus_root = os.path.dirname(morpheus.__file__) - - value_abs_to_root = os.path.join(morpheus_root, value) - - # If the file relative to our package exists, use that instead - if (os.path.exists(value_abs_to_root)): - logger.debug(("Parameter, '%s', with relative path, '%s', does not exist. " - "Using package relative location: '%s'"), - param.name, - value, - value_abs_to_root) + package_relative = get_package_relative_file(value) - return super().convert(value_abs_to_root, param, ctx) + if (package_relative != value): + logger.debug(("Parameter, '%s', with relative path, '%s', does not exist. 
" + "Using package relative location: '%s'"), + param.name, + value, + package_relative) - return super().convert(value, param, ctx) + return super().convert(package_relative, param, ctx) diff --git a/morpheus/config.py b/morpheus/config.py index 431e172f3e..54dc6ff00f 100644 --- a/morpheus/config.py +++ b/morpheus/config.py @@ -121,9 +121,11 @@ class ConfigAutoEncoder(ConfigBase): """ feature_columns: typing.List[str] = None userid_column_name: str = "userIdentityaccountId" + timestamp_column_name: str = "timestamp" userid_filter: str = None feature_scaler: AEFeatureScalar = AEFeatureScalar.STANDARD use_generic_model: bool = False + fallback_username: str = "generic_user" @dataclasses.dataclass diff --git a/morpheus/data/columns_ae_azure.txt b/morpheus/data/columns_ae_azure.txt index 071c9d2169..79120a5f2e 100644 --- a/morpheus/data/columns_ae_azure.txt +++ b/morpheus/data/columns_ae_azure.txt @@ -1,11 +1,9 @@ -locationcountryOrRegion appDisplayName -locationcity clientAppUsed -deviceDetaildisplayName deviceDetailbrowser +deviceDetaildisplayName deviceDetailoperatingSystem statusfailureReason -locincrement appincrement -logcount \ No newline at end of file +locincrement +logcount diff --git a/morpheus/data/columns_ae_duo.txt b/morpheus/data/columns_ae_duo.txt index 091e8d36a5..b653de69ba 100644 --- a/morpheus/data/columns_ae_duo.txt +++ b/morpheus/data/columns_ae_duo.txt @@ -1,8 +1,7 @@ accessdevicebrowser accessdeviceos -accessdevicelocationcity authdevicename -result reason +result locincrement logcount diff --git a/morpheus/messages/multi_ae_message.py b/morpheus/messages/multi_ae_message.py index 41f233d963..4b7068ed7d 100644 --- a/morpheus/messages/multi_ae_message.py +++ b/morpheus/messages/multi_ae_message.py @@ -27,8 +27,8 @@ class MultiAEMessage(MultiMessage): model: AutoEncoder # train_loss_scores: cp.ndarray - train_scores_mean: float - train_scores_std: float + train_scores_mean: float = 0.0 + train_scores_std: float = 1.0 def get_slice(self, start, stop): """ diff --git a/morpheus/utils/logger.py b/morpheus/utils/logger.py index c4ae9fa8b0..de8532aff1 100644 --- a/morpheus/utils/logger.py +++ b/morpheus/utils/logger.py @@ -180,3 +180,19 @@ def deprecated_stage_warning(logger, cls, name): "It has no effect and acts as a pass through stage."), cls.__name__, name) + + +def get_log_levels(): + log_levels = list(logging._nameToLevel.keys()) + + if ("NOTSET" in log_levels): + log_levels.remove("NOTSET") + + return log_levels + + +def parse_log_level(ctx, param, value): + x = logging._nameToLevel.get(value.upper(), None) + if x is None: + raise click.BadParameter('Must be one of {}. Passed: {}'.format(", ".join(logging._nameToLevel.keys()), value)) + return x diff --git a/setup.py b/setup.py index eabfa0fa2f..3ef4b83133 100644 --- a/setup.py +++ b/setup.py @@ -45,10 +45,7 @@ "docker", "dfencoder @ git+https://github.com/nv-morpheus/dfencoder.git@branch-22.09#egg=dfencoder", "grpcio-channelz", - "mlflow>=1.23", "networkx", - "pandas>=1.3", - "pluggy", "torch==1.10.2+cu113", "tqdm", "tritonclient[all]",