diff --git a/.gitattributes b/.gitattributes index 4d22373a5b..f3bd2f1096 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,3 +1,4 @@ +docs/source/developer_guide/guides/img/** filter=lfs diff=lfs merge=lfs -text examples/data/** filter=lfs diff=lfs merge=lfs -text morpheus/_version.py export-subst tests/mock_triton_server/payloads/** filter=lfs diff=lfs merge=lfs -text diff --git a/.pylintrc b/.pylintrc index 428b85fabe..691cefcd74 100644 --- a/.pylintrc +++ b/.pylintrc @@ -1,27 +1,71 @@ -[MASTER] +[MAIN] + +# Analyse import fallback blocks. This can be used to support both Python 2 and +# 3 compatible code, which means that the block might have code that exists +# only in one or another interpreter, leading to false positives when analysed. +analyse-fallback-blocks=no + +# Load and enable all available extensions. Use --list-extensions to see a list +# all available extensions. +#enable-all-extensions= + +# In error mode, messages with a category besides ERROR or FATAL are +# suppressed, and no reports are done by default. Error mode is compatible with +# disabling specific errors. +#errors-only= + +# Always return a 0 (non-error) status code, even if lint errors are found. +# This is primarily useful in continuous integration scripts. +#exit-zero= # A comma-separated list of package or module names from where C extensions may # be loaded. Extensions are loading into the active Python interpreter and may # run arbitrary code. extension-pkg-allow-list=srf,morpheus._lib +# A comma-separated list of package or module names from where C extensions may +# be loaded. Extensions are loading into the active Python interpreter and may +# run arbitrary code. (This is an alternative name to extension-pkg-allow-list +# for backward compatibility.) +extension-pkg-whitelist= + +# Return non-zero exit code if any of these messages/categories are detected, +# even if score is above --fail-under value. Syntax same as enable. Messages +# specified are enabled, while categories only check already-enabled messages. +fail-on= + # Specify a score threshold to be exceeded before program exits with error. fail-under=10 -# Add files or directories to the blacklist. They should be base names, not -# paths. +# Interpret the stdin as a python script, whose filename needs to be passed as +# the module_or_package argument. +#from-stdin= + +# Files or directories to be skipped. They should be base names, not paths. ignore=CVS -# Add files or directories matching the regex patterns to the blacklist. The -# regex matches against base names, not paths. -ignore-patterns= +# Add files or directories matching the regex patterns to the ignore-list. The +# regex matches against paths and can be in Posix or Windows format. +ignore-paths= + +# Files or directories matching the regex patterns are skipped. The regex +# matches against base names, not paths. The default value ignores Emacs file +# locks +ignore-patterns=^\.# + +# List of module names for which member attributes should not be checked +# (useful for modules/projects where namespaces are manipulated during runtime +# and thus existing member attributes cannot be deduced by static analysis). It +# supports qualified module names, as well as Unix pattern matching. +ignored-modules= # Python code to execute, usually for sys.path manipulation such as # pygtk.require(). #init-hook= # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the -# number of processors available to use. 
+# number of processors available to use, and will cap the count on Windows to +# avoid hangs. jobs=1 # Control the amount of potential inferred values when inferring a single @@ -31,11 +75,18 @@ limit-inference-results=100 # List of plugins (as comma separated values of python module names) to load, # usually to register additional checkers. -load-plugins= +load-plugins=pylint.extensions.docparams # Pickle collected data for later comparisons. persistent=yes +# Minimum Python version to use for version dependent checks. Will default to +# the version used to run pylint. +py-version=3.8 + +# Discover python modules and packages in the file system subtree. +recursive=no + # When enabled, pylint would attempt to guess common misconfiguration and emit # user-friendly hints instead of false-positive error messages. suggestion-mode=yes @@ -44,127 +95,22 @@ suggestion-mode=yes # active Python interpreter and may run arbitrary code. unsafe-load-any-extension=no - -[MESSAGES CONTROL] - -# Only show warnings with the listed confidence levels. Leave empty to show -# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED. -confidence= - -# Disable the message, report, category or checker with the given id(s). You -# can either give multiple identifiers separated by comma (,) or put this -# option multiple times (only on the command line, not in the configuration -# file where it should appear only once). You can also use "--disable=all" to -# disable everything first and then reenable specific checks. For example, if -# you want to run only the similarities checker, you can use "--disable=all -# --enable=similarities". If you want to run only the classes checker, but have -# no Warning level messages displayed, use "--disable=all --enable=classes -# --disable=W". 
-disable=print-statement, - parameter-unpacking, - unpacking-in-except, - old-raise-syntax, - backtick, - long-suffix, - old-ne-operator, - old-octal-literal, - import-star-module-level, - non-ascii-bytes-literal, - raw-checker-failed, - bad-inline-option, - locally-disabled, - file-ignored, - suppressed-message, - useless-suppression, - deprecated-pragma, - use-symbolic-message-instead, - apply-builtin, - basestring-builtin, - buffer-builtin, - cmp-builtin, - coerce-builtin, - execfile-builtin, - file-builtin, - long-builtin, - raw_input-builtin, - reduce-builtin, - standarderror-builtin, - unicode-builtin, - xrange-builtin, - coerce-method, - delslice-method, - getslice-method, - setslice-method, - no-absolute-import, - old-division, - dict-iter-method, - dict-view-method, - next-method-called, - metaclass-assignment, - indexing-exception, - raising-string, - reload-builtin, - oct-method, - hex-method, - nonzero-method, - cmp-method, - input-builtin, - round-builtin, - intern-builtin, - unichr-builtin, - map-builtin-not-iterating, - zip-builtin-not-iterating, - range-builtin-not-iterating, - filter-builtin-not-iterating, - using-cmp-argument, - eq-without-hash, - div-method, - idiv-method, - rdiv-method, - exception-message-attribute, - invalid-str-codec, - sys-max-int, - bad-python3-import, - deprecated-string-function, - deprecated-str-translate-call, - deprecated-itertools-function, - deprecated-types-field, - next-method-defined, - dict-items-not-iterating, - dict-keys-not-iterating, - dict-values-not-iterating, - deprecated-operator-function, - deprecated-urllib-function, - xreadlines-attribute, - deprecated-sys-function, - exception-escape, - comprehension-escape, - superfluous-parens, - missing-module-docstring, - missing-class-docstring, - missing-function-docstring, - protected-access, - unspecified-encoding - -# Enable the message, report, category or checker with the given id(s). You can -# either give multiple identifier separated by comma (,) or put this option -# multiple time (only on the command line, not in the configuration file where -# it should appear only once). See also the "--disable" option for examples. -enable=c-extension-no-member +# In verbose mode, extra non-checker-related info will be displayed. +#verbose= [REPORTS] # Python expression which should return a score less than or equal to 10. You -# have access to the variables 'error', 'warning', 'refactor', and 'convention' -# which contain the number of messages in each category, as well as 'statement' -# which is the total number of statements analyzed. This score is used by the -# global evaluation report (RP0004). -evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) +# have access to the variables 'fatal', 'error', 'warning', 'refactor', +# 'convention', and 'info' which contain the number of messages in each +# category, as well as 'statement' which is the total number of statements +# analyzed. This score is used by the global evaluation report (RP0004). +evaluation=max(0, 0 if fatal else 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)) # Template used to display messages. This is a python new-style format string # used to format the message information. See doc for all details. -#msg-template= +msg-template= # Set the output format. Available formats are text, parseable, colorized, json # and msvs (visual studio). You can also give a reporter class, e.g. 
@@ -178,16 +124,45 @@ reports=no score=yes -[REFACTORING] +[MESSAGES CONTROL] -# Maximum number of nested blocks for function / method body -max-nested-blocks=5 +# Only show warnings with the listed confidence levels. Leave empty to show +# all. Valid levels: HIGH, CONTROL_FLOW, INFERENCE, INFERENCE_FAILURE, +# UNDEFINED. +confidence=HIGH, + CONTROL_FLOW, + INFERENCE, + INFERENCE_FAILURE, + UNDEFINED -# Complete name of functions that never returns. When checking for -# inconsistent-return-statements if a never returning function is called then -# it will be considered as an explicit return statement and no message will be -# printed. -never-returning-functions=sys.exit +# Disable the message, report, category or checker with the given id(s). You +# can either give multiple identifiers separated by comma (,) or put this +# option multiple times (only on the command line, not in the configuration +# file where it should appear only once). You can also use "--disable=all" to +# disable everything first and then re-enable specific checks. For example, if +# you want to run only the similarities checker, you can use "--disable=all +# --enable=similarities". If you want to run only the classes checker, but have +# no Warning level messages displayed, use "--disable=all --enable=classes +# --disable=W". +disable=bad-inline-option, + deprecated-pragma, + file-ignored, + locally-disabled, + missing-class-docstring, + missing-function-docstring, + missing-module-docstring, + protected-access, + raw-checker-failed, + superfluous-parens, + suppressed-message, + use-symbolic-message-instead, + useless-suppression + +# Enable the message, report, category or checker with the given id(s). You can +# either give multiple identifier separated by comma (,) or put this option +# multiple time (only on the command line, not in the configuration file where +# it should appear only once). See also the "--disable" option for examples. +enable=c-extension-no-member [STRING] @@ -210,6 +185,9 @@ additional-builtins= # Tells whether unused global variables should be treated as a violation. allow-global-unused-variables=yes +# List of names allowed to shadow builtins +allowed-redefined-builtins= + # List of strings which can identify a callback function by name. A callback # name must start or end with one of those strings. callbacks=cb_, @@ -242,56 +220,57 @@ logging-format-style=old logging-modules=logging -[SIMILARITIES] +[REFACTORING] -# Ignore comments when computing similarities. -ignore-comments=yes +# Maximum number of nested blocks for function / method body +max-nested-blocks=5 -# Ignore docstrings when computing similarities. -ignore-docstrings=yes +# Complete name of functions that never returns. When checking for +# inconsistent-return-statements if a never returning function is called then +# it will be considered as an explicit return statement and no message will be +# printed. +never-returning-functions=sys.exit,argparse.parse_error -# Ignore imports when computing similarities. -ignore-imports=no -# Minimum lines number of a similarity. -min-similarity-lines=4 +[DESIGN] +# List of regular expressions of class ancestor names to ignore when counting +# public methods (see R0903) +exclude-too-few-public-methods= -[FORMAT] +# List of qualified class names to ignore when counting class parents (see +# R0901) +ignored-parents= -# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. -expected-line-ending-format= +# Maximum number of arguments for function / method. 
+max-args=5 -# Regexp for a line that is allowed to be longer than the limit. -ignore-long-lines=^\s*(# )??$ +# Maximum number of attributes for a class (see R0902). +max-attributes=7 -# Number of spaces of indent required inside a hanging or continued line. -indent-after-paren=4 +# Maximum number of boolean expressions in an if statement (see R0916). +max-bool-expr=5 -# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 -# tab). -indent-string=' ' +# Maximum number of branch for function / method body. +max-branches=12 -# Maximum number of characters on a single line. -max-line-length=130 +# Maximum number of locals for function / method body. +max-locals=15 -# Maximum number of lines in a module. -max-module-lines=1000 +# Maximum number of parents for a class (see R0901). +max-parents=7 -# List of optional constructs for which whitespace checking is disabled. `dict- -# separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. -# `trailing-comma` allows a space between comma and closing bracket: (a, ). -# `empty-line` allows space-only lines. -no-space-check=trailing-comma, - dict-separator +# Maximum number of public methods for a class (see R0904). +max-public-methods=20 -# Allow the body of a class to be on the same line as the declaration if body -# contains single statement. -single-line-class-stmt=no +# Maximum number of return / yield for function / method body. +max-returns=6 -# Allow the body of an if to be on the same line as the test if there is no -# else. -single-line-if-stmt=no +# Maximum number of statements in function / method body. +max-statements=50 + +# Minimum number of public methods for a class (see R0903). +min-public-methods=2 [BASIC] @@ -300,13 +279,15 @@ single-line-if-stmt=no argument-naming-style=snake_case # Regular expression matching correct argument names. Overrides argument- -# naming-style. +# naming-style. If left empty, argument names will be checked with the set +# naming style. #argument-rgx= # Naming style matching correct attribute names. attr-naming-style=snake_case # Regular expression matching correct attribute names. Overrides attr-naming- +# style. If left empty, attribute names will be checked with the set naming # style. #attr-rgx= @@ -326,20 +307,30 @@ bad-names-rgxs= class-attribute-naming-style=any # Regular expression matching correct class attribute names. Overrides class- -# attribute-naming-style. +# attribute-naming-style. If left empty, class attribute names will be checked +# with the set naming style. #class-attribute-rgx= +# Naming style matching correct class constant names. +class-const-naming-style=UPPER_CASE + +# Regular expression matching correct class constant names. Overrides class- +# const-naming-style. If left empty, class constant names will be checked with +# the set naming style. +#class-const-rgx= + # Naming style matching correct class names. class-naming-style=PascalCase # Regular expression matching correct class names. Overrides class-naming- -# style. +# style. If left empty, class names will be checked with the set naming style. #class-rgx= # Naming style matching correct constant names. const-naming-style=UPPER_CASE # Regular expression matching correct constant names. Overrides const-naming- +# style. If left empty, constant names will be checked with the set naming # style. #const-rgx= @@ -351,7 +342,8 @@ docstring-min-length=-1 function-naming-style=snake_case # Regular expression matching correct function names. Overrides function- -# naming-style. +# naming-style. 
If left empty, function names will be checked with the set +# naming style. #function-rgx= # Good variable names which should always be accepted, separated by a comma. @@ -377,21 +369,22 @@ include-naming-hint=yes inlinevar-naming-style=any # Regular expression matching correct inline iteration names. Overrides -# inlinevar-naming-style. +# inlinevar-naming-style. If left empty, inline iteration names will be checked +# with the set naming style. #inlinevar-rgx= # Naming style matching correct method names. method-naming-style=snake_case # Regular expression matching correct method names. Overrides method-naming- -# style. +# style. If left empty, method names will be checked with the set naming style. #method-rgx= # Naming style matching correct module names. module-naming-style=snake_case # Regular expression matching correct module names. Overrides module-naming- -# style. +# style. If left empty, module names will be checked with the set naming style. #module-rgx= # Colon-delimited sets of names that determine each other's naming style when @@ -407,14 +400,136 @@ no-docstring-rgx=^_ # These decorators are taken in consideration only for invalid-name. property-classes=abc.abstractproperty +# Regular expression matching correct type variable names. If left empty, type +# variable names will be checked with the set naming style. +#typevar-rgx= + # Naming style matching correct variable names. variable-naming-style=snake_case # Regular expression matching correct variable names. Overrides variable- -# naming-style. +# naming-style. If left empty, variable names will be checked with the set +# naming style. #variable-rgx= +[CLASSES] + +# Warn about protected attribute access inside special methods +check-protected-access-in-special-methods=no + +# List of method names used to declare (i.e. assign) instance attributes. +defining-attr-methods=__init__, + __new__, + setUp, + __post_init__ + +# List of member names, which should be excluded from the protected access +# warning. +exclude-protected=_asdict, + _fields, + _replace, + _source, + _make + +# List of valid names for the first argument in a class method. +valid-classmethod-first-arg=cls + +# List of valid names for the first argument in a metaclass class method. +valid-metaclass-classmethod-first-arg=cls + + +[SIMILARITIES] + +# Comments are removed from the similarity computation +ignore-comments=yes + +# Docstrings are removed from the similarity computation +ignore-docstrings=yes + +# Imports are removed from the similarity computation +ignore-imports=yes + +# Signatures are removed from the similarity computation +ignore-signatures=yes + +# Minimum lines number of a similarity. +min-similarity-lines=4 + + +[FORMAT] + +# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. +expected-line-ending-format= + +# Regexp for a line that is allowed to be longer than the limit. +ignore-long-lines=^\s*(# )??$ + +# Number of spaces of indent required inside a hanging or continued line. +indent-after-paren=4 + +# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 +# tab). +indent-string=' ' + +# Maximum number of characters on a single line. +max-line-length=120 + +# Maximum number of lines in a module. +max-module-lines=1000 + +# Allow the body of a class to be on the same line as the declaration if body +# contains single statement. +single-line-class-stmt=no + +# Allow the body of an if to be on the same line as the test if there is no +# else. 
+single-line-if-stmt=no + + +[EXCEPTIONS] + +# Exceptions that will emit a warning when caught. +overgeneral-exceptions=BaseException, + Exception + + +[IMPORTS] + +# List of modules that can be imported at any level, not just the top level +# one. +allow-any-import-level= + +# Allow wildcard imports from modules that define __all__. +allow-wildcard-with-all=no + +# Deprecated modules which should not be used, separated by a comma. +deprecated-modules= + +# Output a graph (.gv or any supported image format) of external dependencies +# to the given file (report RP0402 must not be disabled). +ext-import-graph= + +# Output a graph (.gv or any supported image format) of all (i.e. internal and +# external) dependencies to the given file (report RP0402 must not be +# disabled). +import-graph= + +# Output a graph (.gv or any supported image format) of internal dependencies +# to the given file (report RP0402 must not be disabled). +int-import-graph= + +# Force import order to recognize a module as part of the standard +# compatibility libraries. +known-standard-library= + +# Force import order to recognize a module as part of a third party library. +known-third-party=enchant + +# Couples of modules and preferred modules, separated by a comma. +preferred-modules= + + [TYPECHECK] # List of decorators that produce context managers, such as @@ -427,10 +542,6 @@ contextmanager-decorators=contextlib.contextmanager # expressions are accepted. generated-members= -# Tells whether missing members accessed in mixin class should be ignored. A -# mixin class is detected if its name ends with "mixin" (case insensitive). -ignore-mixin-members=yes - # Tells whether to warn about missing members when the owner of the attribute # is inferred to be None. ignore-none=yes @@ -443,16 +554,16 @@ ignore-none=yes # the rest of the inferred objects. ignore-on-opaque-inference=yes +# List of symbolic message names to ignore for Mixin members. +ignored-checks-for-mixins=no-member, + not-async-context-manager, + not-context-manager, + attribute-defined-outside-init + # List of class names for which member attributes should not be checked (useful # for classes with dynamically set attributes). This supports the use of # qualified names. -ignored-classes=optparse.Values,thread._local,_thread._local - -# List of module names for which member attributes should not be checked -# (useful for modules/projects where namespaces are manipulated during runtime -# and thus existing member attributes cannot be deduced by static analysis). It -# supports qualified module names, as well as Unix pattern matching. -ignored-modules= +ignored-classes=optparse.Values,thread._local,_thread._local,argparse.Namespace # Show a hint with possible names when a member name was not found. The aspect # of finding the hint is based on edit distance. @@ -466,6 +577,9 @@ missing-member-hint-distance=1 # showing a hint for a missing member. missing-member-max-choices=1 +# Regex pattern to define which classes are considered mixins. +mixin-class-rgx=.*[Mm]ixin + # List of decorators that change the signature of a decorated function. signature-mutators= @@ -476,9 +590,13 @@ signature-mutators= max-spelling-suggestions=4 # Spelling dictionary name. Available dictionaries: none. To make it work, -# install the python-enchant package. +# install the 'python-enchant' package. spelling-dict= +# List of comma separated words that should be considered directives if they +# appear at the beginning of a comment and should not be checked. 
+spelling-ignore-comment-directives=fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy: + # List of comma separated words that should not be checked. spelling-ignore-words= @@ -498,109 +616,4 @@ notes=FIXME, TODO # Regular expression of note tags to take in consideration. -#notes-rgx= - - -[DESIGN] - -# Maximum number of arguments for function / method. -max-args=5 - -# Maximum number of attributes for a class (see R0902). -max-attributes=7 - -# Maximum number of boolean expressions in an if statement (see R0916). -max-bool-expr=5 - -# Maximum number of branch for function / method body. -max-branches=12 - -# Maximum number of locals for function / method body. -max-locals=15 - -# Maximum number of parents for a class (see R0901). -max-parents=7 - -# Maximum number of public methods for a class (see R0904). -max-public-methods=20 - -# Maximum number of return / yield for function / method body. -max-returns=6 - -# Maximum number of statements in function / method body. -max-statements=50 - -# Minimum number of public methods for a class (see R0903). -min-public-methods=2 - - -[IMPORTS] - -# List of modules that can be imported at any level, not just the top level -# one. -allow-any-import-level= - -# Allow wildcard imports from modules that define __all__. -allow-wildcard-with-all=no - -# Analyse import fallback blocks. This can be used to support both Python 2 and -# 3 compatible code, which means that the block might have code that exists -# only in one or another interpreter, leading to false positives when analysed. -analyse-fallback-blocks=no - -# Deprecated modules which should not be used, separated by a comma. -deprecated-modules=optparse,tkinter.tix - -# Create a graph of external dependencies in the given file (report RP0402 must -# not be disabled). -ext-import-graph= - -# Create a graph of every (i.e. internal and external) dependencies in the -# given file (report RP0402 must not be disabled). -import-graph= - -# Create a graph of internal dependencies in the given file (report RP0402 must -# not be disabled). -int-import-graph= - -# Force import order to recognize a module as part of the standard -# compatibility libraries. -known-standard-library= - -# Force import order to recognize a module as part of a third party library. -known-third-party=enchant - -# Couples of modules and preferred modules, separated by a comma. -preferred-modules= - - -[CLASSES] - -# List of method names used to declare (i.e. assign) instance attributes. -defining-attr-methods=__init__, - __new__, - setUp, - __post_init__, - __set_name__ - -# List of member names, which should be excluded from the protected access -# warning. -exclude-protected=_asdict, - _fields, - _replace, - _source, - _make - -# List of valid names for the first argument in a class method. -valid-classmethod-first-arg=cls - -# List of valid names for the first argument in a metaclass class method. -valid-metaclass-classmethod-first-arg=cls - - -[EXCEPTIONS] - -# Exceptions that will emit a warning when being caught. Defaults to -# "BaseException, Exception". 
-overgeneral-exceptions=BaseException, - Exception +notes-rgx= diff --git a/ci/scripts/jenkins/docs.sh b/ci/scripts/jenkins/docs.sh index fac7648ee4..c5501aad2a 100755 --- a/ci/scripts/jenkins/docs.sh +++ b/ci/scripts/jenkins/docs.sh @@ -21,6 +21,12 @@ source ${WORKSPACE}/ci/scripts/jenkins/common.sh restore_conda_env pip install ${MORPHEUS_ROOT}/build/wheel +gpuci_logger "Pulling LFS assets" +cd ${MORPHEUS_ROOT} + +git lfs install +${MORPHEUS_ROOT}/scripts/fetch_data.py fetch docs + cd ${MORPHEUS_ROOT}/docs gpuci_logger "Installing Documentation dependencies" pip install -r requirement.txt diff --git a/docker/Dockerfile b/docker/Dockerfile index 6d970f5a2f..730bb7b71f 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -86,7 +86,7 @@ RUN source activate morpheus &&\ # Set the entrypoint to use the entrypoint.sh script which sets the conda env COPY docker/entrypoint.sh ./docker/ -ENTRYPOINT [ "/opt/conda/bin/tini", "--", "docker/entrypoint.sh" ] +ENTRYPOINT [ "/opt/conda/bin/tini", "--", "/workspace/docker/entrypoint.sh" ] # Reset the shell back to normal SHELL ["/bin/bash", "-c"] diff --git a/docs/source/developer_guide/guides.rst b/docs/source/developer_guide/guides.rst index bb994af56d..0468fce39f 100644 --- a/docs/source/developer_guide/guides.rst +++ b/docs/source/developer_guide/guides.rst @@ -25,3 +25,4 @@ Morpheus includes several stages to choose from when building a custom pipeline, guides/2_real_world_phishing guides/3_simple_cpp_stage guides/4_source_cpp_stage + guides/5_digital_fingerprinting diff --git a/docs/source/developer_guide/guides/5_digital_fingerprinting.md b/docs/source/developer_guide/guides/5_digital_fingerprinting.md new file mode 100644 index 0000000000..cf19e4fb9d --- /dev/null +++ b/docs/source/developer_guide/guides/5_digital_fingerprinting.md @@ -0,0 +1,337 @@ + + +# 5. Digital Fingerprinting (DFP) + +## Overview +Every account, user, service and machine has a digital fingerprint, which represents the typical actions performed and not performed over a given period of time. Understanding every entity's day-to-day, moment-by-moment work helps us identify anomalous behavior and uncover potential threats in the environment. + +To construct this digital fingerprint we will be training unsupervised behavioral models at various granularities, including a generic model for all users in the organization along with fine-grained models for each user to monitor their behavior. These models are continuously updated and retrained over time, and alerts are triggered when deviations from normality occur for any user. + +## Training Sources +The data we will want to use for training and inference will come from any sensitive system that the user interacts with, such as VPN, authentication and cloud services. The [digital fingerprinting example](/examples/digital_fingerprinting/README.md) included in Morpheus ingests logs from [AWS CloudTrail](https://docs.aws.amazon.com/cloudtrail/index.html), [Azure Active Directory](https://docs.microsoft.com/en-us/azure/active-directory/reports-monitoring/concept-sign-ins) and [Duo Authentication](https://help.duo.com/s/article/1023?language=en_US). + +The location of these logs could be either local to the machine running Morpheus, on a shared filesystem like NFS, or on a remote store such as [Amazon S3](https://aws.amazon.com/s3/). + +Additional data sources and remote stores can easily be added using the Morpheus SDK; the key to applying DFP to a new data source is the process of feature selection.
Any data source can be fed into DFP after some preprocessing to get a feature vector per log/data point​. Since DFP builds a targeted model for each entity (user/service/machine... etc.), it would work best if the chosen data source has a field that uniquely identifies the entity we’re trying to model. + + +### DFP Features + +#### AWS CloudTrail +| Feature | Description | +| ------- | ----------- | +| userIdentityaccessKeyId | e.g., ACPOSBUM5JG5BOW7B2TR, ABTHWOIIC0L5POZJM2FF, AYI2CM8JC3NCFM4VMMB4 | +| userAgent | e.g., Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 10.0; Trident/5.1), Mozilla/5.0 (Linux; Android 4.3.1) AppleWebKit/536.1 (KHTML, like Gecko) Chrome/62.0.822.0 Safari/536.1, Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10 7_0; rv:1.9.4.20) Gecko/2012-06-10 12:09:43 Firefox/3.8 | +| userIdentitysessionContextsessionIssueruserName | e.g., role-g | +| sourceIPAddress | e.g., 208.49.113.40, 123.79.131.26, 128.170.173.123 | +| userIdentityaccountId | e.g., Account-123456789 | +| errorMessage | e.g., The input fails to satisfy the constraints specified by an AWS service., The specified subnet cannot be found in the VPN with which the Client VPN endpoint is associated., Your account is currently blocked. Contact aws-verification@amazon.com if you have questions. | +| userIdentitytype | e.g., FederatedUser | +| eventName | e.g., GetSendQuota, ListTagsForResource, DescribeManagedPrefixLists | +| userIdentityprincipalId | e.g., 39c71b3a-ad54-4c28-916b-3da010b92564, 0baf594e-28c1-46cf-b261-f60b4c4790d1, 7f8a985f-df3b-4c5c-92c0-e8bffd68abbf | +| errorCode | e.g., success, MissingAction, ValidationError | +| eventSource | e.g., lopez-byrd.info, robinson.com, lin.com | +| userIdentityarn | e.g., arn:aws:4a40df8e-c56a-4e6c-acff-f24eebbc4512, arn:aws:573fd2d9-4345-487a-9673-87de888e4e10, arn:aws:c8c23266-13bb-4d89-bce9-a6eef8989214 | +| apiVersion | e.g., 1984-11-26, 1990-05-27, 2001-06-09 | + +#### Azure Active Directory +| Feature | Description | +| ------- | ----------- | +| appDisplayName | e.g., Windows sign in, MS Teams, Office 365​ | +| clientAppUsed | e.g., IMAP4, Browser​ | +| deviceDetail.displayName | e.g., username-LT​ | +| deviceDetail.browser | e.g., EDGE 98.0.xyz, Chrome 98.0.xyz​ | +| deviceDetail.operatingSystem | e.g., Linux, IOS 15, Windows 10​ | +| statusfailureReason | e.g., external security challenge not satisfied, error validating credentials​ | +| riskEventTypesv2 | AzureADThreatIntel, unfamiliarFeatures​ | +| location.countryOrRegion | country or region name​ | +| location.city | city name | + +##### Derived Features +| Feature | Description | +| ------- | ----------- | +| logcount | tracks the number of logs generated by a user within that day (increments with every log)​ | +| locincrement | increments every time we observe a new city (location.city) in a user’s logs within that day​ | +| appincrement | increments every time we observe a new app (appDisplayName) in a user’s logs within that day​ | + +#### Duo Authentication +| Feature | Description | +| ------- | ----------- | +| auth_device.name | phone number​ | +| access_device.browser | e.g., Edge, Chrome, Chrome Mobile​ | +| access_device.os | e.g., Android, Windows​ | +| result | SUCCESS or FAILURE ​ | +| reason | reason for the results, e.g., User Cancelled, User Approved, User Mistake, No Response​ | +| access_device.location.city | city name | + +##### Derived Features +| Feature | Description | +| ------- | ----------- | +| logcount | tracks the number of logs generated by a user within that day (increments 
with every log) | +| locincrement | increments every time we observe a new city (location.city) in a user’s logs within that day | + + +## High Level Architecture +DFP in Morpheus is accomplished via two independent pipelines: training and inference. The pipelines communicate via a shared model store ([MLflow](https://mlflow.org/)), and both share many common components, as Morpheus is composed of reusable stages which can be easily mixed and matched. + +![High Level Architecture](img/dfp_high_level_arch.png) + +#### Training Pipeline +* Trains user models and uploads them to the model store +* Capable of training individual user models or a fallback generic model for all users + +#### Inference Pipeline +* Downloads user models from the model store +* Generates anomaly scores per log +* Sends detected anomalies to monitoring services + +#### Monitoring +* Detected anomalies are published to an S3 bucket, directory or a Kafka topic. +* Output can be integrated with a monitoring tool. + + +## Runtime Environment Setup +![Runtime Environment Setup](img/dfp_runtime_env.png) + +DFP in Morpheus is built as an application of containerized services and can be run in two ways: +1. Using docker-compose for testing and development +1. Using Helm charts for production Kubernetes deployment + +### System requirements for the DFP reference architecture: +##### Running via docker-compose: +* [Docker](https://docs.docker.com/get-docker/) and [docker-compose](https://docs.docker.com/compose/) installed on the host machine +* Supported GPU with the [nvidia-docker runtime](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#docker) + +##### Running via Kubernetes: +* [Kubernetes](https://kubernetes.io/) cluster configured with GPU resources +* [NVIDIA GPU Operator](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/gpu-operator) installed in the cluster + +Note: For GPU requirements see [README.md](/README.md#requirements) + +### Services +The reference architecture is composed of the following services: +| Service | Description | +| ------- | ----------- | +| [MLflow](https://mlflow.org/) | Provides a versioned model store | +| [Jupyter Server](https://jupyter-notebook.readthedocs.io/en/stable/public_server.html) | Necessary for testing and development of the pipelines | +| Morpheus Training Pipeline | Trains the autoencoder models and uploads them to MLflow | +| Morpheus Inference Pipeline | Downloads models from MLflow for inference and publishes anomalies | + + +## Morpheus Configuration +![Morpheus Configuration](img/dfp_deployment_configs.png) + +### Pipeline Structure Configuration +![Pipeline Structure Configuration](img/dfp_pipeline_structure.png) + +The stages in both the Training and Inference pipelines can be mixed and matched with little impact; for example, the `MultiFileSource` can be configured to pull from S3 or from local files and can be replaced altogether with any other Morpheus input stage. Similarly, the S3 writer can be replaced with any Morpheus output stage. Regardless of the inputs and outputs, the core of the pipeline should remain unchanged; the stages in the core of the pipeline (inside the blue areas in the above diagram) perform common actions and should be configured, not exchanged.
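To make this mix-and-match structure concrete, the following is a minimal sketch of how a pipeline of this shape could be assembled. The DFP stage imports follow the file paths listed later in this guide, the `config` object is the one constructed in the next section, and `source_schema` stands in for a `DataFrameInputSchema`; the argument values shown are placeholders rather than the full set used by the shipped training and inference scripts.

```python
import functools

from morpheus.config import Config
from morpheus.pipeline import LinearPipeline
from morpheus.stages.output.write_to_file_stage import WriteToFileStage

# Stages and helpers from the production DFP example
# (examples/digital_fingerprinting/production/morpheus).
from dfp.stages.dfp_file_batcher_stage import DFPFileBatcherStage
from dfp.stages.dfp_file_to_df import DFPFileToDataFrameStage
from dfp.stages.multi_file_source import MultiFileSource
from dfp.utils.file_utils import date_extractor
from dfp.utils.file_utils import iso_date_regex

config: Config = ...      # constructed as shown in the "Morpheus Config" section below
source_schema = ...       # a DataFrameInputSchema, see DFPFileToDataFrameStage below

pipeline = LinearPipeline(config)

# Input: local paths or s3:// URLs; this stage can be swapped for any other Morpheus source stage.
pipeline.set_source(MultiFileSource(config, filenames=["s3://bucket/AUTH_LOG-*.json"]))

# Core DFP stages: these are configured rather than exchanged.
pipeline.add_stage(
    DFPFileBatcherStage(config,
                        period="D",
                        date_conversion_func=functools.partial(date_extractor, filename_regex=iso_date_regex)))
pipeline.add_stage(DFPFileToDataFrameStage(config, schema=source_schema))
# ... DFPSplitUsersStage, DFPRollingWindowStage, DFPPreprocessingStage, then either the
# training stages or the inference stages described below ...

# Output: any Morpheus output stage can stand in for the S3 writer.
pipeline.add_stage(WriteToFileStage(config, filename="dfp_detections.csv", overwrite=True))

pipeline.run()
```

The same skeleton serves both the training and inference pipelines; only the input, output, and model-related stages change.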
+ +### Morpheus Config + +For both the training and inference pipelines, the Morpheus config object should be constructed with the same values, for example: +```python +import os + +from morpheus.config import Config +from morpheus.config import ConfigAutoEncoder +from morpheus.config import CppConfig +from morpheus.cli.utils import get_package_relative_file +from morpheus.cli.utils import load_labels_file +``` +```python +CppConfig.set_should_use_cpp(False) + +config = Config() +config.num_threads = os.cpu_count() +config.ae = ConfigAutoEncoder() +config.ae.feature_columns = load_labels_file(get_package_relative_file("data/columns_ae_azure.txt")) +``` + +Other attributes which might be needed: +| Attribute | Type | Default | Description | +| --------- | ---- | ------- | ----------- | +| `Config.ae.userid_column_name` | `str` | `userIdentityaccountId` | Column in the `DataFrame` containing the username or user ID | +| `Config.ae.timestamp_column_name` | `str` | `timestamp` | Column in the `DataFrame` containing the timestamp of the event | +| `Config.ae.fallback_username` | `str` | `generic_user` | Name to use for the generic user model, should not match the name of any real users | + +### Input Stages +![Input Stages](img/dfp_input_config.png) + +#### MultiFileSource +The `MultiFileSource` ([`examples/digital_fingerprinting/production/morpheus/dfp/stages/multi_file_source.py`](/examples/digital_fingerprinting/production/morpheus/dfp/stages/multi_file_source.py)) receives a path or list of paths (`filenames`), which are collectively emitted into the pipeline as an [fsspec.core.OpenFiles](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.core.OpenFiles) object. The paths may include wildcards `*` as well as URLs (e.g., `s3://path`) to remote storage providers such as S3, FTP, GCP, Azure, Databricks and others as defined by [fsspec](https://filesystem-spec.readthedocs.io/en/latest/api.html?highlight=open_files#fsspec.open_files). In addition, paths can be cached locally by prefixing them with `filecache::` (e.g., `filecache::s3://bucket-name/key-name`). + +Note: this stage does not actually download the data files, allowing the file list to be filtered and batched prior to being downloaded. + +| Argument | Type | Description | +| -------- | ---- | ----------- | +| `c` | `morpheus.config.Config` | Morpheus config object | +| `filenames` | `List[str]` or `str` | Paths to source files to be read from | + + +#### DFPFileBatcherStage +The `DFPFileBatcherStage` ([`examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_file_batcher_stage.py`](/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_file_batcher_stage.py)) groups data in the incoming `DataFrame` into batches of a time period (per day by default). This stage assumes that the date of the logs in S3 can be easily inferred, such as by encoding the creation time in the file name (e.g., `AUTH_LOG-2022-08-21T22.05.23Z.json`). The actual method for extracting the date is encoded in a user-supplied `date_conversion_func` function (more on this below).
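As a concrete illustration of such a function, the following is a minimal sketch; the `auth_log_date` name and the `AUTH_LOG-YYYY-MM-DDTHH.MM.SSZ.json` filename pattern are assumptions used only for this example, and the production example ships a more general `date_extractor` helper described below.

```python
import os
from datetime import datetime
from datetime import timezone

import fsspec


def auth_log_date(file_object: fsspec.core.OpenFile) -> datetime:
    """Hypothetical date_conversion_func: parse the timestamp embedded in
    file names such as AUTH_LOG-2022-08-21T22.05.23Z.json."""
    file_name = os.path.basename(file_object.path)

    # Strip the "AUTH_LOG-" prefix and the ".json" suffix, leaving "2022-08-21T22.05.23Z".
    date_str = file_name[len("AUTH_LOG-"):-len(".json")]

    return datetime.strptime(date_str, "%Y-%m-%dT%H.%M.%SZ").replace(tzinfo=timezone.utc)
```

A function like this is passed to `DFPFileBatcherStage` via the `date_conversion_func` argument described in the table below.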
+ +| Argument | Type | Description | +| -------- | ---- | ----------- | +| `c` | `morpheus.config.Config` | Morpheus config object | +| `date_conversion_func` | `function` | Function receives a single [fsspec.core.OpenFile](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.core.OpenFile) argument and returns a `datetime.datetime` object | +| `period` | `str` | Time period to group data by, value must be [one of pandas' offset strings](https://pandas.pydata.org/docs/user_guide/timeseries.html#timeseries-offset-aliases) | +| `sampling_rate_s` | `int` | Optional, default=`0`. When non-zero, a subset of the incoming data files will be sampled, only including a file if the `datetime` returned by `date_conversion_func` is at least `sampling_rate_s` seconds greater than the `datetime` of the previously included file | + +For situations where the creation date of the log file is encoded in the filename, the `date_extractor` function in the [`examples/digital_fingerprinting/production/morpheus/dfp/utils/file_utils.py`](/examples/digital_fingerprinting/production/morpheus/dfp/utils/file_utils.py) module can be used. The `date_extractor` will need to have a regex pattern bound to it before being passed in as a parameter to `DFPFileBatcherStage`. The regex pattern will need to contain the following named groups: `year`, `month`, `day`, `hour`, `minute`, `second`, and optionally `microsecond`. + +For input files containing an ISO 8601 formatted date string, the `iso_date_regex` regex can be used, for example: +```python +from functools import partial + +from dfp.utils.file_utils import date_extractor +from dfp.utils.file_utils import iso_date_regex +``` +```python +# Batch files into buckets by time. Use the default ISO date extractor from the filename +pipeline.add_stage( + DFPFileBatcherStage(config, + period="D", + date_conversion_func=partial(date_extractor, filename_regex=iso_date_regex))) +``` + +Note: in cases where the regular expression does not match, the `date_extractor` function will fall back to using the modified time of the file. + +#### DFPFileToDataFrameStage +The `DFPFileToDataFrameStage` ([examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_file_to_df.py](/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_file_to_df.py)) stage receives a `list` of [fsspec.core.OpenFiles](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.core.OpenFiles) and loads them into a single `DataFrame` which is then emitted into the pipeline. When the parent stage is `DFPFileBatcherStage`, each batch (typically one day) is concatenated into a single `DataFrame`; if the parent is instead `MultiFileSource`, the entire dataset is loaded into a single `DataFrame`. Because of this, it is important to choose a `period` argument for `DFPFileBatcherStage` small enough that each batch can fit into memory. + +| Argument | Type | Description | +| -------- | ---- | ----------- | +| `c` | `morpheus.config.Config` | Morpheus config object | +| `schema` | `dfp.utils.column_info.DataFrameInputSchema` | Schema specifying columns to load, along with any necessary renames and data type conversions | +| `filter_null` | `bool` | Optional: Whether to filter null rows after loading, by default True. | +| `file_type` | `morpheus._lib.file_types.FileTypes` (enum) | Optional: Indicates file type to be loaded. Currently supported values at time of writing are: `FileTypes.Auto`, `FileTypes.CSV`, and `FileTypes.JSON`.
Default value is `FileTypes.Auto` which will infer the type based on the file extension; set this value if using a custom extension | +| `parser_kwargs` | `dict` or `None` | Optional: additional keyword arguments to be passed into the `DataFrame` parser, currently either [`pandas.read_csv`](https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html) or [`pandas.read_json`](https://pandas.pydata.org/docs/reference/api/pandas.read_json.html) | +| `cache_dir` | `str` | Optional: path to cache location, defaults to `./.cache/dfp` | + +This stage is able to download and load data files concurrently using one of several methods: `single_thread`, `multiprocess`, `dask`, and `dask_thread`. The method used is chosen by setting the `FILE_DOWNLOAD_TYPE` environment variable; `dask_thread` is used by default, and `single_thread` effectively disables concurrent loading. + +This stage will cache the resulting `DataFrame` in `cache_dir`. Since we are caching the `DataFrame`s and not the source files, a cache hit avoids the cost of parsing the incoming data. In the case of remote storage systems such as S3, this avoids both the parsing and the download on a cache hit. One consequence of this is that any change to the `schema` will require purging cached files in the `cache_dir` before those changes are visible. + +Note: this caching is in addition to any caching which may have occurred when using the optional `filecache::` prefix. + +### DataFrameInputSchema +TODO: Document input schemas and `ColumnInfo` subclasses + +### Output Stages +![Output Stages](img/dfp_output_config.png) + +For the inference pipeline, any Morpheus output stage, such as `morpheus.stages.output.write_to_file_stage.WriteToFileStage` and `morpheus.stages.output.write_to_kafka_stage.WriteToKafkaStage`, could be used in addition to the `WriteToS3Stage` documented below. + +#### WriteToS3Stage +The `WriteToS3Stage` ([examples/digital_fingerprinting/production/morpheus/dfp/stages/write_to_s3_stage.py](/examples/digital_fingerprinting/production/morpheus/dfp/stages/write_to_s3_stage.py)) stage writes the resulting anomaly detections to S3. The `WriteToS3Stage` decouples the S3-specific operations from the Morpheus stage, and as such receives an `s3_writer` argument. + +| Argument | Type | Description | +| -------- | ---- | ----------- | +| `c` | `morpheus.config.Config` | Morpheus config object | +| `s3_writer` | `function` | User-defined function which receives an instance of `morpheus.messages.message_meta.MessageMeta` and returns that same message instance. Any S3-specific configuration, such as the bucket name, should be bound to the function. | + +### Core Pipeline +These stages are common to both the training and inference pipelines. Unlike the input and output stages, these are specific to the DFP pipeline and intended to be configured, not replaced. + +#### DFPSplitUsersStage +The `DFPSplitUsersStage` ([examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_split_users_stage.py](/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_split_users_stage.py)) stage receives an incoming `DataFrame` and emits a `list` of `DFPMessageMeta` where each `DFPMessageMeta` represents the records associated with a given user. This allows downstream stages to perform all necessary operations on a per-user basis.
+ +| Argument | Type | Description | +| -------- | ---- | ----------- | +| `c` | `morpheus.config.Config` | Morpheus config object | +| `include_generic` | `bool` | When `True`, a `DFPMessageMeta` will be constructed for the generic user containing all records not excluded by the `skip_users` and `only_users` filters | +| `include_individual` | `bool` | When `True`, a `DFPMessageMeta` instance will be constructed for each user not excluded by the `skip_users` and `only_users` filters | +| `skip_users` | `List[str]` or `None` | List of users to exclude; when `include_generic` is `True`, excluded records will also be excluded from the generic user | +| `only_users` | `List[str]` or `None` | Limit records to a specific list of users; when `include_generic` is `True`, the generic user's records will also be limited to the users in this list | + +#### DFPRollingWindowStage +The `DFPRollingWindowStage` ([examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_rolling_window_stage.py](/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_rolling_window_stage.py)) stage will trim incoming events to a specific size or time window, as specified by the `min_history`, `min_increment` and `max_history` constructor arguments. + +| Argument | Type | Description | +| -------- | ---- | ----------- | +| `c` | `morpheus.config.Config` | Morpheus config object | +| `min_history` | `int` | Exclude users with fewer than `min_history` records; setting this to `1` effectively disables this feature | +| `min_increment` | `int` | Exclude incoming batches for users where fewer than `min_increment` new records have been added since the last batch; setting this to `0` effectively disables this feature | +| `max_history` | `int`, `str` or `None` | When not `None`, include up to `max_history` records. When `max_history` is an `int`, the last `max_history` records will be included. When `max_history` is a `str`, it is assumed to represent a duration parsable by [`pandas.Timedelta`](https://pandas.pydata.org/docs/reference/api/pandas.Timedelta.html) and only those records within the window of [latest timestamp - `max_history`, latest timestamp] will be included. | +| `cache_dir` | `str` | Optional path to the cache directory; cached items will be stored in a subdirectory under `cache_dir` named `rolling-user-data`. This directory, along with `cache_dir`, will be created if it does not already exist. | + +Note: this stage computes a row hash for the first and last rows of the incoming `DataFrame`; as such, all data contained must be hashable. Any non-hashable values such as `lists` should be dropped or converted into hashable types in the `DFPFileToDataFrameStage`. + +#### DFPPreprocessingStage +The `DFPPreprocessingStage` ([examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_preprocessing_stage.py](/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_preprocessing_stage.py)) stage performs the preprocessing; the actual logic is defined in the `input_schema` argument. Since this stage occurs in the pipeline after the `DFPFileBatcherStage` and `DFPSplitUsersStage` stages, all records in the incoming `DataFrame` correspond to a single user within a specific time period, allowing columns to be computed on a per-user, per-time-period basis, such as the `logcount` and `locincrement` features mentioned above. This makes the type of processing performed in this stage different from that performed in the `DFPFileToDataFrameStage`.
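To make these per-user derived features concrete, the following is a small, self-contained pandas sketch, deliberately independent of the DFP `input_schema`/`ColumnInfo` machinery, showing how `logcount`, `locincrement` and `appincrement` could be computed for a single user's daily batch; the column names follow the Azure feature tables earlier in this guide, and the data values are made up.

```python
import pandas as pd

# A single user's logs for one day, roughly as they would look after DFPSplitUsersStage.
df = pd.DataFrame({
    "timestamp": pd.to_datetime(["2022-08-21 01:02:03", "2022-08-21 09:10:11", "2022-08-21 17:18:19"]),
    "appDisplayName": ["Windows sign in", "MS Teams", "MS Teams"],
    "location.city": ["Austin", "Austin", "Paris"],
}).sort_values("timestamp")

# logcount: running count of the user's logs within the day.
df["logcount"] = range(1, len(df) + 1)

# locincrement: number of distinct cities observed so far within the day.
df["locincrement"] = (~df["location.city"].duplicated()).cumsum()

# appincrement: number of distinct apps observed so far within the day.
df["appincrement"] = (~df["appDisplayName"].duplicated()).cumsum()

print(df[["timestamp", "logcount", "locincrement", "appincrement"]])
```

In the actual pipeline, equivalent computations are expressed declaratively through the schema passed in as the `input_schema` argument described below.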
+ +| Argument | Type | Description | +| -------- | ---- | ----------- | +| `c` | `morpheus.config.Config` | Morpheus config object | +| `input_schema` | `dfp.utils.column_info.DataFrameInputSchema` | Schema specifying columns to be included in the output `DataFrame`, including computed columns | + +## Training Pipeline +![Training Pipeline Overview](img/dfp_training_overview.png) + +Training must begin with the generic user model, which is trained with the logs from all users. This model serves as a fallback model for users and accounts without sufficient training data. The name of the generic user is defined in the `ae.fallback_username` attribute of the Morpheus config object and defaults to `generic_user`. + +After training the generic model, individual user models can be trained. Individual user models provide better accuracy but require sufficient data; many users do not have enough data to accurately train a model. + +### Training Stages + +#### DFPTraining +The `DFPTraining` ([examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_training.py](/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_training.py)) stage trains a model for each incoming `DataFrame` and emits an instance of `morpheus.messages.multi_ae_message.MultiAEMessage` containing the trained model. + +#### DFPMLFlowModelWriterStage +The `DFPMLFlowModelWriterStage` ([examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_mlflow_model_writer.py](/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_mlflow_model_writer.py)) stage publishes trained models into MLflow, skipping any model which lacked sufficient training data (the current required minimum is 300 log records). + +| Argument | Type | Description | +| -------- | ---- | ----------- | +| `c` | `morpheus.config.Config` | Morpheus config object | +| `model_name_formatter` | `str` | Optional format string to control the name of models stored in MLflow, default is `dfp-{user_id}`. Currently available field names are: `user_id` and `user_md5`, which is an MD5 hexadecimal digest as returned by [`hash.hexdigest`](https://docs.python.org/3.8/library/hashlib.html?highlight=hexdigest#hashlib.hash.hexdigest). | +| `experiment_name_formatter` | `str` | Optional format string to control the experiment name for models stored in MLflow, default is `/dfp-models/{reg_model_name}`. Currently available field names are: `user_id`, `user_md5` and `reg_model_name`, which is the model name as defined by `model_name_formatter` once the field names have been applied. | +| `databricks_permissions` | `dict` or `None` | Optional; when not `None`, sets permissions needed when using a Databricks-hosted MLflow server | + +Note: If using a remote MLflow server, users will need to call [`mlflow.set_tracking_uri`](https://www.mlflow.org/docs/latest/python_api/mlflow.html#mlflow.set_tracking_uri) before starting the pipeline. + +## Inference Pipeline +![Inference Pipeline Overview](img/dfp_inference_overview.png) + +### Inference Stages + +#### DFPInferenceStage +The `DFPInferenceStage` ([examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_inference_stage.py](/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_inference_stage.py)) stage loads models from MLflow and performs inference against those models. This stage emits a message containing the original `DataFrame` along with new columns containing the z score (`mean_abs_z`) and the name and version of the model that generated that score (`model_version`).
For each feature in the model, three additional columns will also be added: +* `_loss` : The loss +* `_z_loss` : The loss z-score +* `_pred` : The predicted value + +For a hypothetical feature named `result`, the three added columns will be: `result_loss`, `result_z_loss`, `result_pred`. + +For performance, models fetched from MLflow are cached locally for up to 10 minutes, allowing updated models to be picked up routinely. In addition to caching individual models, the stage also maintains a cache of which models are available, so a newly trained user model published to MLflow won't be visible to an already running inference pipeline for up to 10 minutes. + +For any user without an associated model in MLflow, the model for the generic user is used. The name of the generic user is defined in the `ae.fallback_username` attribute of the Morpheus config object and defaults to `generic_user`. + +| Argument | Type | Description | +| -------- | ---- | ----------- | +| `c` | `morpheus.config.Config` | Morpheus config object | +| `model_name_formatter` | `str` | Format string to control the name of models fetched from MLflow. Currently available field names are: `user_id`. | + + +#### DFPPostprocessingStage +The `DFPPostprocessingStage` ([examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_postprocessing_stage.py](/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_postprocessing_stage.py)) stage filters records contained in the `DataFrame` and emits a new `dfp.messages.DFPMessageMeta` message containing all records with a z score greater than `z_score_threshold`, or `None` if no records exceed `z_score_threshold`. + +| Argument | Type | Description | +| -------- | ---- | ----------- | +| `c` | `morpheus.config.Config` | Morpheus config object | +| `z_score_threshold` | `float` | Optional; sets the threshold that `mean_abs_z` must exceed for a record to be considered an anomaly, default is 2.0 | diff --git a/docs/source/developer_guide/guides/img/dfp_deployment_configs.png b/docs/source/developer_guide/guides/img/dfp_deployment_configs.png new file mode 100644 index 0000000000..3d295bbc32 --- /dev/null +++ b/docs/source/developer_guide/guides/img/dfp_deployment_configs.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb0e3f2bb29ba664ec0cffb549aaeb200462b488db491559179d35378fcd6c36 +size 1556055 diff --git a/docs/source/developer_guide/guides/img/dfp_high_level_arch.png b/docs/source/developer_guide/guides/img/dfp_high_level_arch.png new file mode 100644 index 0000000000..ed28d116e1 --- /dev/null +++ b/docs/source/developer_guide/guides/img/dfp_high_level_arch.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d3ed1e7eb4ab702526cd527242ecd02855deee0031e6d2c818834cf45393c55 +size 486948 diff --git a/docs/source/developer_guide/guides/img/dfp_inference_overview.png b/docs/source/developer_guide/guides/img/dfp_inference_overview.png new file mode 100644 index 0000000000..bdde882555 --- /dev/null +++ b/docs/source/developer_guide/guides/img/dfp_inference_overview.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3507a51a138f1f5d79c678df171d345de3d8991c46e024a7a8cdc8fd191e08e4 +size 752315 diff --git a/docs/source/developer_guide/guides/img/dfp_input_config.png b/docs/source/developer_guide/guides/img/dfp_input_config.png new file mode 100644 index 0000000000..1b37362b37 --- /dev/null +++ b/docs/source/developer_guide/guides/img/dfp_input_config.png @@ -0,0 +1,3 @@
diff --git a/docs/source/developer_guide/guides/img/dfp_deployment_configs.png b/docs/source/developer_guide/guides/img/dfp_deployment_configs.png new file mode 100644 index 0000000000..3d295bbc32 --- /dev/null +++ b/docs/source/developer_guide/guides/img/dfp_deployment_configs.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb0e3f2bb29ba664ec0cffb549aaeb200462b488db491559179d35378fcd6c36 +size 1556055 diff --git a/docs/source/developer_guide/guides/img/dfp_high_level_arch.png b/docs/source/developer_guide/guides/img/dfp_high_level_arch.png new file mode 100644 index 0000000000..ed28d116e1 --- /dev/null +++ b/docs/source/developer_guide/guides/img/dfp_high_level_arch.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d3ed1e7eb4ab702526cd527242ecd02855deee0031e6d2c818834cf45393c55 +size 486948 diff --git a/docs/source/developer_guide/guides/img/dfp_inference_overview.png b/docs/source/developer_guide/guides/img/dfp_inference_overview.png new file mode 100644 index 0000000000..bdde882555 --- /dev/null +++ b/docs/source/developer_guide/guides/img/dfp_inference_overview.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3507a51a138f1f5d79c678df171d345de3d8991c46e024a7a8cdc8fd191e08e4 +size 752315 diff --git a/docs/source/developer_guide/guides/img/dfp_input_config.png b/docs/source/developer_guide/guides/img/dfp_input_config.png new file mode 100644 index 0000000000..1b37362b37 --- /dev/null +++ b/docs/source/developer_guide/guides/img/dfp_input_config.png @@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1 +oid sha256:15ce037aba5608567f1fc983a67c5047ea8c1b17ddae6b4f917f8bb88f6f45e4 +size 564808 diff --git a/docs/source/developer_guide/guides/img/dfp_output_config.png b/docs/source/developer_guide/guides/img/dfp_output_config.png new file mode 100644 index 0000000000..dec53ef3a1 --- /dev/null +++ b/docs/source/developer_guide/guides/img/dfp_output_config.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3f2dda7c0a3d9deb2405e4c4b2e3f3420dce58cdaba71a142d2dc3b7ff616a4 +size 326262 diff --git a/docs/source/developer_guide/guides/img/dfp_pipeline_structure.png b/docs/source/developer_guide/guides/img/dfp_pipeline_structure.png new file mode 100644 index 0000000000..c81966949f --- /dev/null +++ b/docs/source/developer_guide/guides/img/dfp_pipeline_structure.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:947a1240af49e93f85777597a65895ee5529b6df3f76d7bcf9792dc34739b95c +size 891774 diff --git a/docs/source/developer_guide/guides/img/dfp_runtime_env.png b/docs/source/developer_guide/guides/img/dfp_runtime_env.png new file mode 100644 index 0000000000..91e30e9323 --- /dev/null +++ b/docs/source/developer_guide/guides/img/dfp_runtime_env.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1d9379784ddb7ebeab470d69bae7368dceaca07c189ddf2fe3b288494403aa9 +size 729655 diff --git a/docs/source/developer_guide/guides/img/dfp_training_overview.png b/docs/source/developer_guide/guides/img/dfp_training_overview.png new file mode 100644 index 0000000000..495370bdc1 --- /dev/null +++ b/docs/source/developer_guide/guides/img/dfp_training_overview.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:737142be02f0c5750e2c9184fe8a23d35a015ed6c4935b906d0c3fc6bb09053a +size 770463 diff --git a/docs/source/developer_guide/guides/img/sink_deps.png b/docs/source/developer_guide/guides/img/sink_deps.png index 8fb07072d6..dfac47a4a0 100644 Binary files a/docs/source/developer_guide/guides/img/sink_deps.png and b/docs/source/developer_guide/guides/img/sink_deps.png differ diff --git a/examples/digital_fingerprinting/README.md b/examples/digital_fingerprinting/README.md index c0f1a9d5ce..6e6ae230b7 100644 --- a/examples/digital_fingerprinting/README.md +++ b/examples/digital_fingerprinting/README.md @@ -14,280 +14,34 @@ # limitations under the License. --> -# Digital Fingerprinting Pipeline +# Digital Fingerprinting (DFP) in Morpheus -We show here how to set up and run the DFP pipeline for three log types: CloudTrail, Duo and Azure. Each of these log types uses a built-in source stage that handles that specific data format. New source stages can be added to allow the DFP pipeline to process different log types. All stages after the source stages are identical across all log types but can be configured differently via pipeline or stage configuration options. +## Organization -## Environment Setup +The DFP example workflows in Morpheus are designed to scale up to company-wide workloads and to handle several different log types, which results in a large number of moving parts to manage the various services and configuration options. To simplify things, the DFP workflow is provided as two separate examples: a simple "starter" pipeline for new users and a complex "production" pipeline for full-scale deployments. While these two examples both perform the same general tasks, they do so in very different ways. The following is a breakdown of the differences between the two examples.
-Follow the instructions [here](https://github.com/nv-morpheus/Morpheus/blob/branch-22.06/CONTRIBUTING.md) to set up your development environment in either a Docker container or conda environment. +### The "Starter" Example -## Morpheus CLI +This example is designed to simplify the number of stages and components and provided a fully contained workflow in a single pipeline. -DFP pipelines can be constructed and run using the Morpheus CLI command `morpheus run pipeline-ae ...` +Key Differences: + * A single pipeline which performs both training and inference + * Requires no external services + * Can be run from the Morpheus CLI -Use `--help` to display information about the autoencoder pipeline command line options: -``` -morpheus run pipeline-ae --help +### The "Production" Example -Usage: morpheus run pipeline-ae [OPTIONS] COMMAND1 [ARGS]... [COMMAND2 - [ARGS]...]... +This example is designed to show what a full scale, production ready, DFP deployment in Morpheus would look like. It contains all of the necessary components (such as a model store), to allow multiple Morpheus pipelines to communicate at a scale that can handle the workload of an entire company. - Configure and run the pipeline. To configure the pipeline, list the stages - in the order that data should flow. The output of each stage will become the - input for the next stage. For example, to read, classify and write to a - file, the following stages could be used +Key Differences: + * Multiple pipelines are specialized to perform either training or inference + * Requires setting up a model store to allow the training and inference pipelines to communicate + * Organized into a docker-compose deployment for easy startup + * Contains a Jupyter notebook service to ease development and debugging + * Can be deployed to Kubernetes using provided Helm charts + * Uses many customized stages to maximize performance. - pipeline from-file --filename=my_dataset.json deserialize preprocess inf-triton --model_name=my_model - --server_url=localhost:8001 filter --threshold=0.5 to-file --filename=classifications.json +## Getting Started - Pipelines must follow a few rules: - 1. Data must originate in a source stage. Current options are `from-file` or `from-kafka` - 2. A `deserialize` stage must be placed between the source stages and the rest of the pipeline - 3. Only one inference stage can be used. Zero is also fine - 4. The following stages must come after an inference stage: `add-class`, `filter`, `gen-viz` - -Options: - --columns_file FILE [default: /my_data/gitrepos/efajardo-nv/Morp - heus/morpheus/data/columns_ae_cloudtrail.txt - ] - --labels_file FILE Specifies a file to read labels from in - order to convert class IDs into labels. A - label file is a simple text file where each - line corresponds to a label. If unspecified, - only a single output label is created for - FIL - --userid_column_name TEXT Which column to use as the User ID. - [default: userIdentityaccountId; required] - --userid_filter TEXT Specifying this value will filter all - incoming data to only use rows with matching - User IDs. Which column is used for the User - ID is specified by `userid_column_name` - --feature_scaler TEXT Autoencoder feature scaler [default: - standard] - --use_generic_model BOOLEAN Whether to use a generic model when user does - not have minimum number of training rows - [default: False] - --viz_file FILE Save a visualization of the pipeline at the - specified location - --help Show this message and exit. 
- -Commands: - add-class Add detected classifications to each message - add-scores Add probability scores to each message - buffer (Deprecated) Buffer results - delay (Deprecated) Delay results for a certain duration - filter Filter message by a classification threshold - from-azure Load messages from a Duo directory - from-cloudtrail Load messages from a Cloudtrail directory - from-duo Load messages from a Duo directory - gen-viz (Deprecated) Write out vizualization data frames - inf-pytorch Perform inference with PyTorch - inf-triton Perform inference with Triton - monitor Display throughput numbers at a specific point in the - pipeline - preprocess Convert messages to tokens - serialize Include & exclude columns from messages - timeseries Perform time series anomaly detection and add prediction. - to-file Write all messages to a file - to-kafka Write all messages to a Kafka cluster - train-ae Deserialize source data from JSON - validate Validates pipeline output against an expected output -``` -The commands above correspond to the Morpheus stages that can be used to construct your DFP pipeline. Options are available to configure pipeline and stages. -The following table shows mapping between the main Morpheus CLI commands and underlying Morpheus Python stage classes: - -| CLI Command | Stage Class | Python File | -| ---------------| -------------------------| --------------------------------------------------------- -| from-azure | AzureSourceStage | morpheus/stages/input/azure_source_stage.py -| from-cloudtrail| CloudTrailSourceStage | morpheus/stages/input/clout_trail_source_stage.py -| from-duo | DuoSourceStage | morpheus/stages/input/duo_source_stage.py -| train-ae | TrainAEStage | morpheus/stages/preprocess/train_ae_stage.py -| preprocess | PreprocessAEStage | morpheus/stages/preprocess/preprocess_ae_stage.py -| inf-pytorch | AutoEncoderInferenceStage| morpheus/stages/inference/auto_encoder_inference_stage.py -| add-scores | AddScoresStage | morpheus/stages/postprocess/add_scores_stage.py -| serialize | SerializeStage | morpheus/stages/postprocess/serialize_stage.py -| to-file | WriteToFileStage | morpheus/stages/output/write_to_file_stage.py - - -## Morpheus DFP Stages - -**Source stages** - These include `AzureSourceStage`, `CloudTrailSourceStage` and `DuoSourceStage`. They are responsible for reading log file(s) that match provided `--input_glob` (e.g. `/duo_logs/*.json`). Data is grouped by user so that each batch processed by the pipeline will only contain rows corresponding to a single user. Feature engineering also happens in this stage. All DFP source stages must extend `AutoencoderSourceStage` and implement the `files_to_dfs_per_user` abstract method. Feature columns can be managed by overriding the `derive_features` method. Otherwise, all columns from input data pass through to next stage. - -**Preprocessing stages** - -`TrainAEStage` can either train user models using data matching a provided `--train_data_glob` or load pre-trained models from file using `--pretrained_filename`. When using `--train_data_glob`, user models can be saved using the `--models_output_filename` option. The `--source_stage_class` must also be used with `--train_data_glob` so that the training stage knows how to read the training data. The autoencoder implementation from this [fork](https://github.com/efajardo-nv/dfencoder/tree/morpheus-22.08) is used for user model training. 
The following are the available CLI options for the `TrainAEStage` (train-ae): - -| Option | Description -| ----------------------| --------------------------------------------------------- -| pretrained_filename | File path to pickled user models saved from previous training run using `--models_output_filename`. -| train_data_glob | Glob path to training data. -| source_stage_class | Source stage so that training stage knows how to read/parse training data. -| train_epochs | Number of training epochs. Default is 25. -| min_train_rows | Minimum number of training rows required to train user model. Default is 300. -| train_max_history | Maximum number of training rows per user. Default is 1000. -| seed | When not None, ensure random number generators are seeded with `seed` to control reproducibility of user model. -| sort_glob | If true the list of files matching `input_glob` will be processed in sorted order. Default is False. -| models_output_filename| Can be used with `--train_data_glob` to save trained user models to file using provided file path. Models can be loaded later using `--pretrained_filename`. - -The `PreprocessAEStage` is responsible for creating a Morpheus message that contains everything needed by the inference stage. For DFP inference, this stage must pass a `MultiInferenceAEMessage` to the inference stage. Each message will correspond to a single user and include the input feature columns, the user's model and training data anomaly scores. - -**Inference stage** - `AutoEncoderInferenceStage` calculates anomaly scores (i.e. reconstruction loss) and z-scores for each user input dataset. - -**Postprocessing stage** - The DFP pipeline uses the `AddScoresStage` for postprocessing to add anomaly scores and zscores from previous inference stage with matching labels. - -**Serialize stage** - `SerializeStage` is used to convert `MultiResponseProbsMessage` from previous stage to a `MessageMeta` to make it suitable for output (i.e. write to file or Kafka). - -**Write stage** - `WriteToFileStage` writes input data with inference results to an output file path. - - -## CloudTrail DFP Pipeline - -Run the following in your Morpheus container to start the CloudTrail DFP pipeline: - -``` -morpheus --log_level=DEBUG \ -run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=False \ -pipeline-ae \ ---columns_file=morpheus/data/columns_ae_cloudtrail.txt \ ---userid_column_name=userIdentitysessionContextsessionIssueruserName \ ---userid_filter=user123 \ ---feature_scaler=standard \ -from-cloudtrail \ ---input_glob=models/datasets/validation-data/dfp-cloudtrail-*-input.csv \ ---max_files=200 \ -train-ae \ ---train_data_glob=models/datasets/training-data/dfp-cloudtrail-*.csv \ ---source_stage_class=morpheus.stages.input.cloud_trail_source_stage.CloudTrailSourceStage \ ---seed=42 \ -preprocess \ -inf-pytorch \ -add-scores \ -serialize \ -to-file --filename=./cloudtrail-dfp-detections.csv --overwrite -``` - -## Duo DFP Pipeline - -First, trains user models from files in `models/datasets/training-data/duo` and saves user models to file. Pipeline then uses these models to run inference -on validation data in `models/datasets/validation-data/duo`. Inference results are written to `duo-detections.csv`. 
-``` -morpheus --log_level=DEBUG \ -run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=False \ -pipeline-ae \ ---columns_file=morpheus/data/columns_ae_duo.txt \ ---userid_column_name=username \ ---feature_scaler=standard \ -from-duo \ ---input_glob=models/datasets/validation-data/duo/*.json \ ---max_files=200 \ -monitor --description='Input rate' \ -train-ae \ ---train_data_glob=models/datasets/training-data/duo/*.json \ ---source_stage_class=morpheus.stages.input.duo_source_stage.DuoSourceStage \ ---seed=42 \ ---train_epochs=1 \ ---models_output_filename=models/dfp-models/duo_ae_user_models.pkl \ -preprocess \ -inf-pytorch \ -monitor --description='Inference rate' --unit inf \ -add-scores \ -serialize \ -to-file --filename=./duo-detections.csv --overwrite -``` - -The following example shows how we can load pre-trained user models from the file (`models/dfp-models/duo_ae_user_models.pkl`) we created in the previous example. Pipeline then uses these models to run inference on validation data in `models/datasets/validation-data/duo`. Inference results are written to `duo-detections.csv`. -``` -morpheus --log_level=DEBUG \ -run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=False \ -pipeline-ae \ ---columns_file=morpheus/data/columns_ae_duo.txt \ ---userid_column_name=username \ ---feature_scaler=standard \ -from-duo \ ---input_glob=models/datasets/validation-data/duo/*.json \ ---max_files=200 \ -monitor --description='Input rate' \ -train-ae \ ---pretrained_filename=models/dfp-models/duo_ae_user_models.pkl \ -preprocess \ -inf-pytorch \ -monitor --description='Inference rate' --unit inf \ -add-scores \ -serialize \ -to-file --filename=./duo-detections.csv --overwrite -``` - -## Azure DFP Pipeline - -First, trains user models from files in `models/datasets/training-data/azure` and saves user models to file. Pipeline then uses these models to run inference -on validation data in `models/datasets/validation-data/azure`. Inference results are written to `azure-detections.csv`. -``` -morpheus --log_level=DEBUG \ -run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=False \ -pipeline-ae \ ---columns_file=morpheus/data/columns_ae_azure.txt \ ---userid_column_name=userPrincipalName \ ---feature_scaler=standard \ -from-azure \ ---input_glob=models/datasets/validation-data/azure/*.json \ ---max_files=200 \ -train-ae \ ---train_data_glob=models/datasets/training-data/azure/*.json \ ---source_stage_class=morpheus.stages.input.azure_source_stage.AzureSourceStage \ ---seed=42 \ ---models_output_filename=models/dfp-models/azure_ae_user_models.pkl \ -preprocess \ -inf-pytorch \ -monitor --description='Inference rate' --unit inf \ -add-scores \ -serialize \ -to-file --filename=./azure-detections.csv --overwrite -``` - -The following example shows how we can load pre-trained user models from the file (`models/dfp-models/azure_ae_user_models.pkl`) we created in the previous example. Pipeline then uses these models to run inference on validation data in `models/datasets/validation-data/azure`. Inference results are written to `azure-detections.csv`. 
-``` -morpheus --log_level=DEBUG \ -run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=False \ -pipeline-ae \ ---columns_file=morpheus/data/columns_ae_azure.txt \ ---userid_column_name=userPrincipalName \ ---feature_scaler=standard \ -from-azure \ ---input_glob=models/datasets/validation-data/azure/*.json \ ---max_files=200 \ -train-ae \ ---pretrained_filename=models/dfp-models/azure_ae_user_models.pkl \ -preprocess \ -inf-pytorch \ -monitor --description='Inference rate' --unit inf \ -add-scores \ -serialize \ -to-file --filename=./azure-detections.csv --overwrite -``` - - -## Using Morpheus Python API - -The DFP pipelines can also be constructed and run via the Morpheus Python API. An [example](./run_cloudtrail_dfp.py) is included for the Cloudtrail DFP pipeline. The following are some commands to -run the example. - -Train user models from files in `models/datasets/training-data/dfp-cloudtrail-*.csv` and saves user models to file. Pipeline then uses these models to run inference on Cloudtrail validation data in `models/datasets/validation-data/dfp-cloudtrail-*-input.csv`. Inference results are written to `cloudtrail-dfp-results.csv`. -``` -python ./examples/digital_fingerprinting/run_cloudtrail_dfp.py \ - --columns_file=morpheus/data/columns_ae_cloudtrail.txt \ - --input_glob=models/datasets/validation-data/dfp-cloudtrail-*-input.csv \ - --train_data_glob=models/datasets/training-data/dfp-*.csv \ - --models_output_filename=models/dfp-models/cloudtrail_ae_user_models.pkl \ - --output_file ./cloudtrail-dfp-results.csv -``` - -Here we load pre-trained user models from the file (`models/dfp-models/cloudtrail_ae_user_models.pkl`) we created in the previous example. Pipeline then uses these models to run inference on validation data in `models/datasets/validation-data/dfp-cloudtrail-*-input.csv`. Inference results are written to `cloudtrail-dfp-results.csv`. 
-``` -python ./examples/digital_fingerprinting/run_cloudtrail_dfp.py \ - --columns_file=morpheus/data/columns_ae_cloudtrail.txt \ - --input_glob=models/datasets/validation-data/dfp-cloudtrail-*-input.csv \ - --pretrained_filename=models/dfp-models/cloudtrail_ae_user_models.pkl \ - --output_file=./cloudtrail-dfp-results.csv -``` \ No newline at end of file +Guides for each of the two examples can be found in their respective directories: [The Starter Example](./starter/README.md) and [The Production Example](./production/README.md) diff --git a/examples/digital_fingerprinting/production/.dockerignore b/examples/digital_fingerprinting/production/.dockerignore new file mode 100644 index 0000000000..a452c24f39 --- /dev/null +++ b/examples/digital_fingerprinting/production/.dockerignore @@ -0,0 +1,13 @@ +# Ignore any Dockerfile +Dockerfile + +# Ignore docker-compose files +docker-compose.yml +.env* + +# ML Flow files +artifacts/ +mlflow.db + +# Ignore any S3 cache folders +*.s3_cache diff --git a/examples/digital_fingerprinting/production/.env.sample b/examples/digital_fingerprinting/production/.env.sample new file mode 100644 index 0000000000..3fee3685cc --- /dev/null +++ b/examples/digital_fingerprinting/production/.env.sample @@ -0,0 +1,11 @@ +# NOTE: This file should be copied to `.env` in the same folder and updated for each user +MYSQL_DATABASE="db" +MYSQL_USER="mlflow" +MYSQL_PASSWORD="good_password" +MYSQL_ROOT_PASSWORD="even_better_password" +MYSQL_ROOT_HOST="172.*.*.*" +MYSQL_LOG_CONSOLE=1 + +# Update these with your own credentials UID=$(id -u) GID=$(id -g) +UID=1000 +GID=1000 diff --git a/examples/digital_fingerprinting/production/.gitignore b/examples/digital_fingerprinting/production/.gitignore new file mode 100644 index 0000000000..175181a4c4 --- /dev/null +++ b/examples/digital_fingerprinting/production/.gitignore @@ -0,0 +1,2 @@ +*.s3_cache +artifacts/ diff --git a/examples/digital_fingerprinting/production/Dockerfile b/examples/digital_fingerprinting/production/Dockerfile new file mode 100644 index 0000000000..7166edc12e --- /dev/null +++ b/examples/digital_fingerprinting/production/Dockerfile @@ -0,0 +1,65 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +ARG MORPHEUS_CONTAINER=nvcr.io/nvidia/morpheus/morpheus +ARG MORPHEUS_CONTAINER_VERSION=v22.09.00-runtime + +FROM ${MORPHEUS_CONTAINER}:${MORPHEUS_CONTAINER_VERSION} as base + +# # Fix the entrypoint to work with different WORKDIR +ENTRYPOINT [ "/opt/conda/bin/tini", "--", "/workspace/docker/entrypoint.sh" ] + +SHELL ["/bin/bash", "-c"] + +# Set the workdir to the DFP base folder +WORKDIR /workspace/examples/digital_fingerprinting/ + +# Copy the conda_env file +COPY ./conda_env.yml ./ + +# Install DFP dependencies +RUN source activate morpheus \ + && mamba env update -n morpheus -f ./conda_env.yml + +# Set the tracking URI for mlflow +ENV MLFLOW_TRACKING_URI="http://mlflow:5000" + +# This will get used by pipelines for the --s3_cache option +# ENV DFP_S3_CACHE="/work/examples/dfp_workflow/morpheus/.s3_cache" + +# If any changes have been made from the base image, recopy the sources +COPY . /workspace/examples/digital_fingerprinting/ + +# Set the workdir to be the morpheus folder +WORKDIR /workspace/examples/digital_fingerprinting/production/morpheus + +# ===== Setup for running unattended ===== +FROM base as runtime + +# Launch morpheus +CMD ["./launch.sh"] + +# ===== Setup for running Jupyter ===== +FROM base as jupyter + +# Install the jupyter specific requirements +RUN source activate morpheus \ + && mamba install -y -c conda-forge \ + ipywidgets \ + jupyterlab \ + nb_conda_kernels + +# Launch jupyter +CMD ["jupyter-lab", "--ip=0.0.0.0", "--no-browser", "--allow-root"] diff --git a/examples/digital_fingerprinting/production/README.md b/examples/digital_fingerprinting/production/README.md new file mode 100644 index 0000000000..a7e01c636b --- /dev/null +++ b/examples/digital_fingerprinting/production/README.md @@ -0,0 +1,17 @@ +# "Production" Digital Fingerprinting Pipeline + +### Build the Morpheus container + +This is necessary to get the latest changes needed for DFP + +```bash +./docker/build_container_release.sh +``` + +### Running locally via `docker-compose` + +```bash +docker-compose build + +docker-compose up +``` diff --git a/examples/digital_fingerprinting/production/conda_env.yml b/examples/digital_fingerprinting/production/conda_env.yml new file mode 100644 index 0000000000..bfef17915a --- /dev/null +++ b/examples/digital_fingerprinting/production/conda_env.yml @@ -0,0 +1,31 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: morpheus +channels: + - rapidsai + - nvidia + - nvidia/label/dev # For pre-releases of SRF. Should still default to full releases if available + - nvidia/label/cuda-11.5.2 # For cuda-nvml-dev=11.5, which is not published under nvidia channel yet. + - conda-forge +dependencies: + ####### Morpheus Dependencies (keep sorted!) 
####### + - boto3 + - dill + - kfp + - librdkafka + - mlflow + - papermill + - s3fs diff --git a/examples/digital_fingerprinting/production/docker-compose.yml b/examples/digital_fingerprinting/production/docker-compose.yml new file mode 100644 index 0000000000..bb7a0f35df --- /dev/null +++ b/examples/digital_fingerprinting/production/docker-compose.yml @@ -0,0 +1,97 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +version: '3.3' + +services: + + mlflow: + restart: always + build: ./mlflow + image: mlflow_server + container_name: mlflow_server + ports: + - "5000:5000" + networks: + - frontend + - backend + command: mlflow server --gunicorn-opts "--log-level debug" --backend-store-uri sqlite:////opt/mlflow/dbdata/mlflow.db --serve-artifacts --artifacts-destination /opt/mlflow/artifacts --host 0.0.0.0 + volumes: + - db_data:/opt/mlflow/dbdata + - mlflow_data:/opt/mlflow/artifacts + + jupyter: + restart: always + build: + context: ./ + dockerfile: ./Dockerfile + target: jupyter + args: + - MORPHEUS_CONTAINER_VERSION=${MORPHEUS_CONTAINER_VERSION:-v22.09.00-runtime} + image: dfp_morpheus_jupyter + container_name: jupyter + ports: + - "8888:8888" + networks: + - frontend + - backend + command: jupyter-lab --no-browser --allow-root --ip='*' + volumes: + - ../../..:/workspace + depends_on: + - mlflow + profiles: + - dev + cap_add: + - sys_nice + + morpheus_training: + # restart: always + build: + context: ./ + dockerfile: ./Dockerfile + target: runtime + args: + - MORPHEUS_CONTAINER_VERSION=${MORPHEUS_CONTAINER_VERSION:-v22.09.00-runtime} + image: dfp_morpheus + container_name: morpheus_training + networks: + - frontend + - backend + environment: + # Colorize the terminal in the container if possible + TERM: "${TERM:-}" + DFP_CACHE_DIR: "/workspace/.cache/dfp" + DFP_TRACKING_URI: "http://mlflow:5000" + command: ./launch.sh --train_users=generic --duration=1d + volumes: + - ../../..:/workspace + depends_on: + - mlflow + profiles: + - training + cap_add: + - sys_nice + +networks: + frontend: + driver: bridge + backend: + driver: bridge + +volumes: + db_data: + mlflow_data: diff --git a/examples/digital_fingerprinting/production/mlflow/Dockerfile b/examples/digital_fingerprinting/production/mlflow/Dockerfile new file mode 100644 index 0000000000..8d8dc942bb --- /dev/null +++ b/examples/digital_fingerprinting/production/mlflow/Dockerfile @@ -0,0 +1,32 @@ +# SPDX-FileCopyrightText: Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +FROM python:3.8-slim-buster + +# Install curl for health check +RUN apt update && \ + apt install -y --no-install-recommends \ + curl libyaml-cpp-dev libyaml-dev && \ + apt autoremove -y && \ + apt clean all && \ + rm -rf /var/cache/apt/* /var/lib/apt/lists/* + +# Install python packages +RUN pip install mlflow boto3 pymysql pyyaml + +# We run on port 5000 +EXPOSE 5000 + +HEALTHCHECK CMD curl -f http://localhost:5000/health || exit 1 diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/__init__.py b/examples/digital_fingerprinting/production/morpheus/dfp/__init__.py new file mode 100644 index 0000000000..d11ef3c507 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/messages/__init__.py b/examples/digital_fingerprinting/production/morpheus/dfp/messages/__init__.py new file mode 100644 index 0000000000..d11ef3c507 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp/messages/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/messages/multi_dfp_message.py b/examples/digital_fingerprinting/production/morpheus/dfp/messages/multi_dfp_message.py new file mode 100644 index 0000000000..de11752ed6 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp/messages/multi_dfp_message.py @@ -0,0 +1,92 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import dataclasses +import logging +import typing + +from morpheus.messages.message_meta import MessageMeta +from morpheus.messages.multi_message import MultiMessage + +logger = logging.getLogger(__name__) + + +@dataclasses.dataclass +class DFPMessageMeta(MessageMeta, cpp_class=None): + """ + This class extends MessageMeta to also hold userid corresponding to batched metadata. + + Parameters + ---------- + df : pandas.DataFrame + Input rows in dataframe. + user_id : str + User id. + + """ + user_id: str + + def get_df(self): + return self.df + + def set_df(self, df): + self.df = df + + +@dataclasses.dataclass +class MultiDFPMessage(MultiMessage): + + def __post_init__(self): + + assert isinstance(self.meta, DFPMessageMeta), "`meta` must be an instance of DFPMessageMeta" + + @property + def user_id(self): + return typing.cast(DFPMessageMeta, self.meta).user_id + + def get_meta_dataframe(self): + return typing.cast(DFPMessageMeta, self.meta).get_df() + + def set_meta_dataframe(self, columns: typing.Union[None, str, typing.List[str]], value): + + df = typing.cast(DFPMessageMeta, self.meta).get_df() + + if (columns is None): + # Set all columns + df[list(value.columns)] = value + else: + # If its a single column or list of columns, this is the same + df[columns] = value + + typing.cast(DFPMessageMeta, self.meta).set_df(df) + + def get_slice(self, start, stop): + """ + Returns sliced batches based on offsets supplied. Automatically calculates the correct `mess_offset` + and `mess_count`. + + Parameters + ---------- + start : int + Start offset address. + stop : int + Stop offset address. + + Returns + ------- + morpheus.pipeline.preprocess.autoencoder.MultiAEMessage + A new `MultiAEMessage` with sliced offset and count. + + """ + return MultiDFPMessage(meta=self.meta, mess_offset=start, mess_count=stop - start) diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/stages/__init__.py b/examples/digital_fingerprinting/production/morpheus/dfp/stages/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_file_batcher_stage.py b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_file_batcher_stage.py new file mode 100644 index 0000000000..bd5034ccf3 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_file_batcher_stage.py @@ -0,0 +1,104 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +import typing + +import fsspec +import pandas as pd +import srf +from srf.core import operators as ops + +from morpheus.config import Config +from morpheus.pipeline.single_port_stage import SinglePortStage +from morpheus.pipeline.stream_pair import StreamPair + +logger = logging.getLogger("morpheus.{}".format(__name__)) + + +class DFPFileBatcherStage(SinglePortStage): + + def __init__(self, c: Config, date_conversion_func, period="D", sampling_rate_s=0): + super().__init__(c) + + self._date_conversion_func = date_conversion_func + self._sampling_rate_s = sampling_rate_s + self._period = period + + @property + def name(self) -> str: + return "dfp-file-batcher" + + def supports_cpp_node(self): + return False + + def accepted_types(self) -> typing.Tuple: + return (fsspec.core.OpenFiles, ) + + def on_data(self, file_objects: fsspec.core.OpenFiles): + + file_object_list = file_objects + + # Create a dataframe with the incoming metadata + if ((len(file_object_list) > 1) and (self._sampling_rate_s > 0)): + file_sampled_list = [] + + file_object_list.sort(key=lambda file_object: self._date_conversion_func(file_object)) + + ts_last = self._date_conversion_func(file_object_list[0]) + + file_sampled_list.append(file_object_list[0]) + + for idx in range(1, len(file_object_list)): + ts = self._date_conversion_func(file_object_list[idx]) + + if ((ts - ts_last).seconds >= self._sampling_rate_s): + + file_sampled_list.append(file_object_list[idx]) + ts_last = ts + else: + file_object_list = file_sampled_list + + df = pd.DataFrame() + + df["dfp_timestamp"] = [self._date_conversion_func(file_object) for file_object in file_object_list] + df["key"] = [file_object.full_name for file_object in file_object_list] + df["objects"] = file_object_list + + # Now split by the batching settings + df_period = df["dfp_timestamp"].dt.to_period(self._period) + + period_gb = df.groupby(df_period) + + output_batches = [] + + n_groups = len(period_gb) + for group in period_gb.groups: + period_df = period_gb.get_group(group) + + obj_list = fsspec.core.OpenFiles(period_df["objects"].to_list(), mode=file_objects.mode, fs=file_objects.fs) + + output_batches.append((obj_list, n_groups)) + + return output_batches + + def _build_single(self, builder: srf.Builder, input_stream: StreamPair) -> StreamPair: + + def node_fn(obs: srf.Observable, sub: srf.Subscriber): + obs.pipe(ops.map(self.on_data), ops.flatten()).subscribe(sub) + + stream = builder.make_node_full(self.unique_name, node_fn) + builder.make_edge(input_stream[0], stream) + + return stream, typing.List[fsspec.core.OpenFiles] diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_file_to_df.py b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_file_to_df.py new file mode 100644 index 0000000000..ebcbffc818 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_file_to_df.py @@ -0,0 +1,256 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import hashlib +import json +import logging +import multiprocessing as mp +import os +import time +import typing +from functools import partial + +import fsspec +import pandas as pd +import srf +from srf.core import operators as ops + +import dask +from dask.distributed import Client +from dask.distributed import LocalCluster + +import cudf + +from morpheus._lib.file_types import FileTypes +from morpheus.config import Config +from morpheus.io.deserializers import read_file_to_df +from morpheus.pipeline.single_port_stage import SinglePortStage +from morpheus.pipeline.stream_pair import StreamPair + +from ..utils.column_info import DataFrameInputSchema +from ..utils.column_info import process_dataframe + +logger = logging.getLogger("morpheus.{}".format(__name__)) + + +def _single_object_to_dataframe(file_object: fsspec.core.OpenFile, + schema: DataFrameInputSchema, + file_type: FileTypes, + filter_null: bool, + parser_kwargs: dict): + + retries = 0 + s3_df = None + while (retries < 2): + try: + with file_object as f: + s3_df = read_file_to_df(f, + file_type, + filter_nulls=filter_null, + df_type="pandas", + parser_kwargs=parser_kwargs) + + break + except Exception as e: + if (retries < 2): + logger.warning("Refreshing S3 credentials") + # cred_refresh() + retries += 1 + else: + raise e + + # Run the pre-processing before returning + if (s3_df is None): + return s3_df + + s3_df = process_dataframe(df_in=s3_df, input_schema=schema) + + return s3_df + + +class DFPFileToDataFrameStage(SinglePortStage): + + def __init__(self, + c: Config, + schema: DataFrameInputSchema, + filter_null: bool = True, + file_type: FileTypes = FileTypes.Auto, + parser_kwargs: dict = None, + cache_dir: str = "./.cache/dfp"): + super().__init__(c) + + self._schema = schema + + self._file_type = file_type + self._filter_null = filter_null + self._parser_kwargs = {} if parser_kwargs is None else parser_kwargs + self._cache_dir = os.path.join(cache_dir, "file_cache") + + self._dask_cluster: Client = None + + self._download_method: typing.Literal["single_thread", "multiprocess", "dask", + "dask_thread"] = os.environ.get("MORPHEUS_FILE_DOWNLOAD_TYPE", + "dask_thread") + + @property + def name(self) -> str: + return "dfp-s3-to-df" + + def supports_cpp_node(self): + return False + + def accepted_types(self) -> typing.Tuple: + return (typing.Any, ) + + def _get_dask_cluster(self): + + if (self._dask_cluster is None): + logger.debug("Creating dask cluster...") + + # Up the heartbeat interval which can get violated with long download times + dask.config.set({"distributed.client.heartbeat": "30s"}) + + self._dask_cluster = LocalCluster(start=True, processes=not self._download_method == "dask_thread") + + logger.debug("Creating dask cluster... Done. Dashboard: %s", self._dask_cluster.dashboard_link) + + return self._dask_cluster + + def _close_dask_cluster(self): + if (self._dask_cluster is not None): + logger.debug("Stopping dask cluster...") + + self._dask_cluster.close() + + self._dask_cluster = None + + logger.debug("Stopping dask cluster... Done.") + + def _get_or_create_dataframe_from_s3_batch( + self, file_object_batch: typing.Tuple[fsspec.core.OpenFiles, int]) -> typing.Tuple[cudf.DataFrame, bool]: + + if (not file_object_batch): + return None, False + + file_list = file_object_batch[0] + batch_count = file_object_batch[1] + + fs: fsspec.AbstractFileSystem = file_list.fs + + # Create a list of dictionaries that only contains the information we are interested in hashing. 
`ukey` just + # hashes all of the output of `info()` which is perfect + hash_data = [{"ukey": fs.ukey(file_object.path)} for file_object in file_list] + + # Convert to base 64 encoding to remove - values + objects_hash_hex = hashlib.md5(json.dumps(hash_data, sort_keys=True).encode()).hexdigest() + + batch_cache_location = os.path.join(self._cache_dir, "batches", f"{objects_hash_hex}.pkl") + + # Return the cache if it exists + if (os.path.exists(batch_cache_location)): + output_df = pd.read_pickle(batch_cache_location) + output_df["origin_hash"] = objects_hash_hex + output_df["batch_count"] = batch_count + + return (output_df, True) + + # Cache miss + download_method = partial(_single_object_to_dataframe, + schema=self._schema, + file_type=self._file_type, + filter_null=self._filter_null, + parser_kwargs=self._parser_kwargs) + + download_buckets = file_list + + # Loop over dataframes and concat into one + try: + dfs = [] + if (self._download_method.startswith("dask")): + + # Create the client each time to ensure all connections to the cluster are closed (they can time out) + with Client(self._get_dask_cluster()) as client: + dfs = client.map(download_method, download_buckets) + + dfs = client.gather(dfs) + + elif (self._download_method == "multiprocessing"): + # Use multiprocessing here since parallel downloads are a pain + with mp.get_context("spawn").Pool(mp.cpu_count()) as p: + dfs = p.map(download_method, download_buckets) + else: + # Simply loop + for s3_object in download_buckets: + dfs.append(download_method(s3_object)) + + except Exception: + logger.exception("Failed to download logs. Error: ", exc_info=True) + return None, False + + if (not dfs): + logger.error("No logs were downloaded") + return None, False + + output_df: pd.DataFrame = pd.concat(dfs) + + # Finally sort by timestamp and then reset the index + output_df.sort_values(by=["timestamp"], inplace=True) + + output_df.reset_index(drop=True, inplace=True) + + # Save dataframe to cache future runs + os.makedirs(os.path.dirname(batch_cache_location), exist_ok=True) + + try: + output_df.to_pickle(batch_cache_location) + except Exception: + logger.warning("Failed to save batch cache. Skipping cache for this batch.", exc_info=True) + + output_df["batch_count"] = batch_count + output_df["origin_hash"] = objects_hash_hex + + return (output_df, False) + + def convert_to_dataframe(self, s3_object_batch: typing.Tuple[fsspec.core.OpenFiles, int]): + if (not s3_object_batch): + return None + + start_time = time.time() + + try: + + output_df, cache_hit = self._get_or_create_dataframe_from_s3_batch(s3_object_batch) + + duration = (time.time() - start_time) * 1000.0 + + logger.debug("S3 objects to DF complete. 
Rows: %s, Cache: %s, Duration: %s ms", + len(output_df), + "hit" if cache_hit else "miss", + duration) + + return output_df + except Exception: + logger.exception("Error while converting S3 buckets to DF.") + self._get_or_create_dataframe_from_s3_batch(s3_object_batch) + raise + + def _build_single(self, builder: srf.Builder, input_stream: StreamPair) -> StreamPair: + + def node_fn(obs: srf.Observable, sub: srf.Subscriber): + obs.pipe(ops.map(self.convert_to_dataframe), ops.on_completed(self._close_dask_cluster)).subscribe(sub) + + stream = builder.make_node_full(self.unique_name, node_fn) + builder.make_edge(input_stream[0], stream) + + return stream, cudf.DataFrame diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_inference_stage.py b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_inference_stage.py new file mode 100644 index 0000000000..fe65105c25 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_inference_stage.py @@ -0,0 +1,117 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import time +import typing + +import srf +from mlflow.tracking.client import MlflowClient + +from morpheus.config import Config +from morpheus.messages.multi_ae_message import MultiAEMessage +from morpheus.pipeline.single_port_stage import SinglePortStage +from morpheus.pipeline.stream_pair import StreamPair + +from ..messages.multi_dfp_message import MultiDFPMessage +from ..utils.model_cache import ModelCache +from ..utils.model_cache import ModelManager + +logger = logging.getLogger("morpheus.{}".format(__name__)) + + +class DFPInferenceStage(SinglePortStage): + + def __init__(self, c: Config, model_name_formatter: str = "dfp-{user_id}"): + super().__init__(c) + + self._client = MlflowClient() + self._fallback_user = self._config.ae.fallback_username + + self._model_cache: typing.Dict[str, ModelCache] = {} + self._model_cache_size_max = 10 + + self._cache_timeout_sec = 600 + + self._model_manager = ModelManager(model_name_formatter=model_name_formatter) + + @property + def name(self) -> str: + return "dfp-inference" + + def supports_cpp_node(self): + return False + + def accepted_types(self) -> typing.Tuple: + return (MultiDFPMessage, ) + + def get_model(self, user: str) -> ModelCache: + + return self._model_manager.load_user_model(self._client, user_id=user, fallback_user_ids=[self._fallback_user]) + + def on_data(self, message: MultiDFPMessage): + if (not message or message.mess_count == 0): + return None + + start_time = time.time() + + df_user = message.get_meta() + user_id = message.user_id + + try: + model_cache = self.get_model(user_id) + + if (model_cache is None): + raise RuntimeError("Could not find model for user {}".format(user_id)) + + loaded_model = model_cache.load_model(self._client) + + except Exception: # TODO + logger.exception("Error trying to get model") + return None + + post_model_time = time.time() + + results_df = 
loaded_model.get_results(df_user, return_abs=True) + + # Create an output message to allow setting meta + output_message = MultiAEMessage(message.meta, + mess_offset=message.mess_offset, + mess_count=message.mess_count, + model=loaded_model) + + output_message.set_meta(list(results_df.columns), results_df) + + output_message.set_meta('model_version', f"{model_cache.reg_model_name}:{model_cache.reg_model_version}") + + if logger.isEnabledFor(logging.DEBUG): + load_model_duration = (post_model_time - start_time) * 1000.0 + get_anomaly_duration = (time.time() - post_model_time) * 1000.0 + + logger.debug("Completed inference for user %s. Model load: %s ms, Model infer: %s ms. Start: %s, End: %s", + user_id, + load_model_duration, + get_anomaly_duration, + df_user[self._config.ae.timestamp_column_name].min(), + df_user[self._config.ae.timestamp_column_name].max()) + + return output_message + + def _build_single(self, builder: srf.Builder, input_stream: StreamPair) -> StreamPair: + node = builder.make_node(self.unique_name, self.on_data) + builder.make_edge(input_stream[0], node) + + # node.launch_options.pe_count = self._config.num_threads + + return node, MultiAEMessage diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_mlflow_model_writer.py b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_mlflow_model_writer.py new file mode 100644 index 0000000000..d540dd7b40 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_mlflow_model_writer.py @@ -0,0 +1,259 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import hashlib +import logging +import os +import typing +import urllib.parse + +import mlflow +import requests +import srf +from dfencoder import AutoEncoder +from mlflow.exceptions import MlflowException +from mlflow.models.signature import ModelSignature +from mlflow.protos.databricks_pb2 import RESOURCE_ALREADY_EXISTS +from mlflow.protos.databricks_pb2 import ErrorCode +from mlflow.store.artifact.runs_artifact_repo import RunsArtifactRepository +from mlflow.tracking import MlflowClient +from mlflow.types import ColSpec +from mlflow.types import Schema +from mlflow.types.utils import _infer_pandas_column +from mlflow.types.utils import _infer_schema +from srf.core import operators as ops + +from morpheus.config import Config +from morpheus.messages.multi_ae_message import MultiAEMessage +from morpheus.pipeline.single_port_stage import SinglePortStage +from morpheus.pipeline.stream_pair import StreamPair + +from ..utils.model_cache import user_to_model_name + +# Setup conda environment +conda_env = { + 'channels': ['defaults', 'conda-forge'], + 'dependencies': ['python={}'.format('3.8'), 'pip'], + 'pip': ['mlflow', 'dfencoder'], + 'name': 'mlflow-env' +} + +logger = logging.getLogger(f"morpheus.{__name__}") + + +class DFPMLFlowModelWriterStage(SinglePortStage): + + def __init__(self, + c: Config, + model_name_formatter: str = "dfp-{user_id}", + experiment_name_formatter: str = "/dfp-models/{reg_model_name}", + databricks_permissions: dict = None): + super().__init__(c) + + self._model_name_formatter = model_name_formatter + self._experiment_name_formatter = experiment_name_formatter + self._databricks_permissions = databricks_permissions + + @property + def name(self) -> str: + return "dfp-mlflow-model-writer" + + def supports_cpp_node(self): + return False + + def accepted_types(self) -> typing.Tuple: + return (MultiAEMessage, ) + + def user_id_to_model(self, user_id: str): + + return user_to_model_name(user_id=user_id, model_name_formatter=self._model_name_formatter) + + def user_id_to_experiment(self, user_id: str): + kwargs = { + "user_id": user_id, + "user_md5": hashlib.md5(user_id.encode('utf-8')).hexdigest(), + "reg_model_name": self.user_id_to_model(user_id=user_id) + } + + return self._experiment_name_formatter.format(**kwargs) + + def _apply_model_permissions(self, reg_model_name: str): + + # Check the required variables + databricks_host = os.environ.get("DATABRICKS_HOST", None) + databricks_token = os.environ.get("DATABRICKS_TOKEN", None) + + if (databricks_host is None or databricks_token is None): + raise RuntimeError("Cannot set Databricks model permissions. " + "Environment variables `DATABRICKS_HOST` and `DATABRICKS_TOKEN` must be set") + + headers = {"Authorization": f"Bearer {databricks_token}"} + + url_base = f"{databricks_host}" + + try: + # First get the registered model ID + get_registered_model_url = urllib.parse.urljoin(url_base, + "/api/2.0/mlflow/databricks/registered-models/get") + + get_registered_model_response = requests.get(url=get_registered_model_url, + headers=headers, + params={"name": reg_model_name}) + + registered_model_response = get_registered_model_response.json() + + reg_model_id = registered_model_response["registered_model_databricks"]["id"] + + # Now apply the permissions. 
If it exists already, it will be overwritten or it is a no-op + patch_registered_model_permissions_url = urllib.parse.urljoin( + url_base, f"/api/2.0/preview/permissions/registered-models/{reg_model_id}") + + patch_registered_model_permissions_body = { + "access_control_list": [{ + "group_name": group, "permission_level": permission + } for group, + permission in self._databricks_permissions.items()] + } + + requests.patch(url=patch_registered_model_permissions_url, + headers=headers, + json=patch_registered_model_permissions_body) + + except Exception: + logger.exception("Error occurred trying to apply model permissions to model: %s", + reg_model_name, + exc_info=True) + + def on_data(self, message: MultiAEMessage): + + user = message.meta.user_id + + model: AutoEncoder = message.model + + model_path = "dfencoder" + reg_model_name = self.user_id_to_model(user_id=user) + + # Write to ML Flow + try: + mlflow.end_run() + + experiment_name = self.user_id_to_experiment(user_id=user) + + # Creates a new experiment if it doesnt exist + experiment = mlflow.set_experiment(experiment_name) + + with mlflow.start_run(run_name="Duo autoencoder model training run", + experiment_id=experiment.experiment_id) as run: + + model_path = f"{model_path}-{run.info.run_uuid}" + + # Log all params in one dict to avoid round trips + mlflow.log_params({ + "Algorithm": "Denosing Autoencoder", + "Epochs": model.lr_decay.state_dict().get("last_epoch", "unknown"), + "Learning rate": model.lr, + "Batch size": model.batch_size, + "Start Epoch": message.get_meta("timestamp").min(), + "End Epoch": message.get_meta("timestamp").max(), + "Log Count": message.mess_count, + }) + + metrics_dict: typing.Dict[str, float] = {} + + # Add info on the embeddings + for k, v in model.categorical_fts.items(): + embedding = v.get("embedding", None) + + if (embedding is None): + continue + + metrics_dict[f"embedding-{k}-num_embeddings"] = embedding.num_embeddings + metrics_dict[f"embedding-{k}-embedding_dim"] = embedding.embedding_dim + + # Add metrics for all of the loss stats + if (hasattr(model, "feature_loss_stats")): + for k, v in model.feature_loss_stats.items(): + metrics_dict[f"loss-{k}-mean"] = v.get("mean", "unknown") + metrics_dict[f"loss-{k}-std"] = v.get("std", "unknown") + + mlflow.log_metrics(metrics_dict) + + # Use the prepare_df function to setup the direct inputs to the model. 
Only include features returned by + # prepare_df to show the actual inputs to the model (any extra are discarded) + input_df = message.get_meta().iloc[0:1] + prepared_df = model.prepare_df(input_df) + output_values = model.get_anomaly_score(input_df) + + input_schema = Schema([ + ColSpec(type=_infer_pandas_column(input_df[col_name]), name=col_name) + for col_name in list(prepared_df.columns) + ]) + output_schema = _infer_schema(output_values) + + model_sig = ModelSignature(inputs=input_schema, outputs=output_schema) + + model_info = mlflow.pytorch.log_model( + pytorch_model=model, + artifact_path=model_path, + conda_env=conda_env, + signature=model_sig, + ) + + client = MlflowClient() + + # First ensure a registered model has been created + try: + create_model_response = client.create_registered_model(reg_model_name) + logger.debug("Successfully registered model '%s'.", create_model_response.name) + except MlflowException as e: + if e.error_code == ErrorCode.Name(RESOURCE_ALREADY_EXISTS): + pass + else: + raise e + + # If we are using databricks, make sure we set the correct permissions + if (self._databricks_permissions is not None and mlflow.get_tracking_uri() == "databricks"): + # Need to apply permissions + self._apply_model_permissions(reg_model_name=reg_model_name) + + model_src = RunsArtifactRepository.get_underlying_uri(model_info.model_uri) + + tags = { + "start": message.get_meta(self._config.ae.timestamp_column_name).min(), + "end": message.get_meta(self._config.ae.timestamp_column_name).max(), + "count": message.get_meta(self._config.ae.timestamp_column_name).count() + } + + # Now create the model version + mv = client.create_model_version(name=reg_model_name, + source=model_src, + run_id=run.info.run_id, + tags=tags) + + logger.debug("ML Flow model upload complete: %s:%s:%s", user, reg_model_name, mv.version) + + except Exception: + logger.exception("Error uploading model to ML Flow", exc_info=True) + + return message + + def _build_single(self, builder: srf.Builder, input_stream: StreamPair) -> StreamPair: + + def node_fn(obs: srf.Observable, sub: srf.Subscriber): + obs.pipe(ops.map(self.on_data)).subscribe(sub) + + stream = builder.make_node_full(self.unique_name, node_fn) + builder.make_edge(input_stream[0], stream) + + return stream, MultiAEMessage diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_postprocessing_stage.py b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_postprocessing_stage.py new file mode 100644 index 0000000000..933188b2ac --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_postprocessing_stage.py @@ -0,0 +1,96 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +import time +import typing +from datetime import datetime + +import numpy as np +import srf +from srf.core import operators as ops + +from morpheus.config import Config +from morpheus.messages.multi_ae_message import MultiAEMessage +from morpheus.pipeline.single_port_stage import SinglePortStage +from morpheus.pipeline.stream_pair import StreamPair + +from ..messages.multi_dfp_message import DFPMessageMeta + +logger = logging.getLogger("morpheus.{}".format(__name__)) + + +class DFPPostprocessingStage(SinglePortStage): + + def __init__(self, c: Config, z_score_threshold=2.0): + super().__init__(c) + + self._z_score_threshold = z_score_threshold + + @property + def name(self) -> str: + return "dfp-postproc" + + def supports_cpp_node(self): + return False + + def accepted_types(self) -> typing.Tuple: + return (MultiAEMessage, ) + + def _extract_events(self, message: MultiAEMessage): + + z_scores = message.get_meta("mean_abs_z") + + above_threshold_df = message.get_meta()[z_scores > self._z_score_threshold] + + if (not above_threshold_df.empty): + above_threshold_df['event_time'] = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ') + above_threshold_df = above_threshold_df.replace(np.nan, 'NaN', regex=True) + + return above_threshold_df + + return None + + def on_data(self, message: MultiAEMessage): + if (not message): + return None + + start_time = time.time() + + extracted_events = self._extract_events(message) + + duration = (time.time() - start_time) * 1000.0 + + if logger.isEnabledFor(logging.DEBUG): + logger.debug("Completed postprocessing for user %s in %s ms. Event count: %s. Start: %s, End: %s", + message.meta.user_id, + duration, + 0 if extracted_events is None else len(extracted_events), + message.get_meta(self._config.ae.timestamp_column_name).min(), + message.get_meta(self._config.ae.timestamp_column_name).max()) + + if (extracted_events is None): + return None + + return DFPMessageMeta(extracted_events, user_id=message.meta.user_id) + + def _build_single(self, builder: srf.Builder, input_stream: StreamPair) -> StreamPair: + + def node_fn(obs: srf.Observable, sub: srf.Subscriber): + obs.pipe(ops.map(self.on_data), ops.filter(lambda x: x is not None)).subscribe(sub) + + stream = builder.make_node_full(self.unique_name, node_fn) + builder.make_edge(input_stream[0], stream) + + return stream, DFPMessageMeta diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_preprocessing_stage.py b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_preprocessing_stage.py new file mode 100644 index 0000000000..8b7a083863 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_preprocessing_stage.py @@ -0,0 +1,83 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
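Editor's note on `DFPPostprocessingStage` above: to make the `_extract_events` thresholding concrete, here is a tiny standalone sketch of the same filtering on a toy frame. The data and threshold are invented for illustration.

```python
# Toy illustration of the z-score filter used by _extract_events: rows whose
# mean_abs_z exceeds the threshold become events, and NaNs are stringified.
from datetime import datetime

import numpy as np
import pandas as pd

df = pd.DataFrame({"username": ["alice", "bob"], "mean_abs_z": [0.7, 2.9]})
z_score_threshold = 2.0

events = df[df["mean_abs_z"] > z_score_threshold].copy()
events["event_time"] = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ')
events = events.replace(np.nan, 'NaN', regex=True)
print(events)  # only the "bob" row remains
```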
+ +import logging +import time +import typing + +import srf +from srf.core import operators as ops + +from morpheus.config import Config +from morpheus.pipeline.single_port_stage import SinglePortStage +from morpheus.pipeline.stream_pair import StreamPair + +from ..messages.multi_dfp_message import MultiDFPMessage +from ..utils.column_info import DataFrameInputSchema +from ..utils.column_info import process_dataframe + +logger = logging.getLogger("morpheus.{}".format(__name__)) + + +class DFPPreprocessingStage(SinglePortStage): + + def __init__(self, c: Config, input_schema: DataFrameInputSchema): + super().__init__(c) + + self._input_schema = input_schema + + @property + def name(self) -> str: + return "dfp-preproc" + + def supports_cpp_node(self): + return False + + def accepted_types(self) -> typing.Tuple: + return (MultiDFPMessage, ) + + def process_features(self, message: MultiDFPMessage): + if (message is None): + return None + + start_time = time.time() + + # Process the columns + df_processed = process_dataframe(message.get_meta_dataframe(), self._input_schema) + + # Apply the new dataframe, only the rows in the offset + message.set_meta_dataframe(list(df_processed.columns), df_processed) + + if logger.isEnabledFor(logging.DEBUG): + duration = (time.time() - start_time) * 1000.0 + + logger.debug("Preprocessed %s data for logs in %s to %s in %s ms", + message.mess_count, + message.get_meta(self._config.ae.timestamp_column_name).min(), + message.get_meta(self._config.ae.timestamp_column_name).max(), + duration) + + return message + + def _build_single(self, builder: srf.Builder, input_stream: StreamPair) -> StreamPair: + + def node_fn(obs: srf.Observable, sub: srf.Subscriber): + obs.pipe(ops.map(self.process_features)).subscribe(sub) + + node = builder.make_node_full(self.unique_name, node_fn) + builder.make_edge(input_stream[0], node) + + # node.launch_options.pe_count = self._config.num_threads + + return node, MultiDFPMessage diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_rolling_window_stage.py b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_rolling_window_stage.py new file mode 100644 index 0000000000..66c0f49b49 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_rolling_window_stage.py @@ -0,0 +1,326 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
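Editor's note on `DFPPreprocessingStage` above: the stage is driven entirely by the `DataFrameInputSchema` passed to it, which is defined later in this change (`dfp/utils/column_info.py`). A hypothetical minimal schema and a direct call to `process_dataframe` might look like the following; the column names and the import path are illustrative assumptions.

```python
# Hypothetical schema illustrating what DFPPreprocessingStage applies per batch.
# Assumes the dfp package from this change is importable as `dfp`.
from datetime import datetime

import pandas as pd

from dfp.utils.column_info import DataFrameInputSchema
from dfp.utils.column_info import DateTimeColumn
from dfp.utils.column_info import RenameColumn
from dfp.utils.column_info import process_dataframe

schema = DataFrameInputSchema(column_info=[
    DateTimeColumn(name="timestamp", dtype=datetime, input_name="time"),
    RenameColumn(name="username", dtype=str, input_name="properties.userPrincipalName"),
])

raw_df = pd.DataFrame({
    "time": ["2022-08-01T00:00:00Z"],
    "properties.userPrincipalName": ["alice"],
})

# Returns a new dataframe containing only the schema-defined columns
print(process_dataframe(raw_df, schema))
```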
+ +import dataclasses +import logging +import os +import pickle +import typing +from contextlib import contextmanager +from datetime import datetime +from datetime import timedelta +from datetime import timezone + +import pandas as pd +import srf +from srf.core import operators as ops + +from morpheus.config import Config +from morpheus.pipeline.single_port_stage import SinglePortStage +from morpheus.pipeline.stream_pair import StreamPair + +from ..messages.multi_dfp_message import DFPMessageMeta +from ..messages.multi_dfp_message import MultiDFPMessage +from ..utils.logging_timer import log_time + +# Setup conda environment +conda_env = { + 'channels': ['defaults', 'conda-forge'], + 'dependencies': ['python={}'.format('3.8'), 'pip'], + 'pip': ['mlflow', 'dfencoder'], + 'name': 'mlflow-env' +} + +logger = logging.getLogger("morpheus.{}".format(__name__)) + + +@dataclasses.dataclass +class CachedUserWindow: + user_id: str + cache_location: str + timestamp_column: str = "timestamp" + total_count: int = 0 + count: int = 0 + min_epoch: datetime = datetime(1970, 1, 1, tzinfo=timezone(timedelta(hours=0))) + max_epoch: datetime = datetime(1970, 1, 1, tzinfo=timezone(timedelta(hours=0))) + batch_count: int = 0 + pending_batch_count: int = 0 + last_train_count: int = 0 + last_train_epoch: datetime = None + last_train_batch: int = 0 + + _trained_rows: pd.Series = dataclasses.field(init=False, repr=False, default_factory=pd.DataFrame) + _df: pd.DataFrame = dataclasses.field(init=False, repr=False, default_factory=pd.DataFrame) + + def append_dataframe(self, incoming_df: pd.DataFrame) -> bool: + + # # Get the row hashes + # row_hashes = pd.util.hash_pandas_object(incoming_df) + + # Filter the incoming df by epochs later than the current max_epoch + filtered_df = incoming_df[incoming_df["timestamp"] > self.max_epoch] + + if (len(filtered_df) == 0): + # We have nothing new to add. Double check that we fit within the window + before_history = incoming_df[incoming_df["timestamp"] < self.min_epoch] + + return len(before_history) == 0 + + # Increment the batch count + self.batch_count += 1 + self.pending_batch_count += 1 + + # Set the filtered index + filtered_df.index = range(self.total_count, self.total_count + len(filtered_df)) + + # Save the row hash to make it easier to find later. 
Do this before the batch so it doesnt participate + filtered_df["_row_hash"] = pd.util.hash_pandas_object(filtered_df, index=False) + + # Use batch id to distinguish groups in the same dataframe + filtered_df["_batch_id"] = self.batch_count + + # Append just the new rows + self._df = pd.concat([self._df, filtered_df]) + + self.total_count += len(filtered_df) + self.count = len(self._df) + + if (len(self._df) > 0): + self.min_epoch = self._df[self.timestamp_column].min() + self.max_epoch = self._df[self.timestamp_column].max() + + return True + + def get_train_df(self, max_history) -> pd.DataFrame: + + new_df = self.trim_dataframe(self._df, + max_history=max_history, + last_batch=self.batch_count - self.pending_batch_count, + timestamp_column=self.timestamp_column) + + self.last_train_count = self.total_count + self.last_train_epoch = datetime.now() + self.last_train_batch = self.batch_count + self.pending_batch_count = 0 + + self._df = new_df + + if (len(self._df) > 0): + self.min_epoch = self._df[self.timestamp_column].min() + self.max_epoch = self._df[self.timestamp_column].max() + + return new_df + + def save(self): + + # Make sure the directories exist + os.makedirs(os.path.dirname(self.cache_location), exist_ok=True) + + with open(self.cache_location, "wb") as f: + pickle.dump(self, f) + + @staticmethod + def trim_dataframe(df: pd.DataFrame, + max_history: typing.Union[int, str], + last_batch: int, + timestamp_column: str = "timestamp") -> pd.DataFrame: + if (max_history is None): + return df + + # Want to ensure we always see data once. So any new data is preserved + new_batches = df[df["_batch_id"] > last_batch] + + # See if max history is an int + if (isinstance(max_history, int)): + return df.tail(max(max_history, len(new_batches))) + + # If its a string, then its a duration + if (isinstance(max_history, str)): + # Get the latest timestamp + latest = df[timestamp_column].max() + + time_delta = pd.Timedelta(max_history) + + # Calc the earliest + earliest = min(latest - time_delta, new_batches[timestamp_column].min()) + + return df[df[timestamp_column] >= earliest] + + raise RuntimeError("Unsupported max_history") + + @staticmethod + def load(cache_location: str) -> "CachedUserWindow": + + with open(cache_location, "rb") as f: + return pickle.load(f) + + +class DFPRollingWindowStage(SinglePortStage): + + def __init__(self, + c: Config, + min_history: int, + min_increment: int, + max_history: typing.Union[int, str], + cache_dir: str = "./.cache/dfp"): + super().__init__(c) + + self._min_history = min_history + self._min_increment = min_increment + self._max_history = max_history + self._cache_dir = os.path.join(cache_dir, "rolling-user-data") + + # Map of user ids to total number of messages. 
Keeps indexes monotonic and increasing per user + self._user_cache_map: typing.Dict[str, CachedUserWindow] = {} + + @property + def name(self) -> str: + return "dfp-rolling-window" + + def supports_cpp_node(self): + return False + + def accepted_types(self) -> typing.Tuple: + return (DFPMessageMeta, ) + + def _trim_dataframe(self, df: pd.DataFrame): + + if (self._max_history is None): + return df + + # See if max history is an int + if (isinstance(self._max_history, int)): + return df.tail(self._max_history) + + # If it's a string, then it's a duration + if (isinstance(self._max_history, str)): + # Get the latest timestamp + latest = df[self._config.ae.timestamp_column_name].max() + + time_delta = pd.Timedelta(self._max_history) + + # Calc the earliest + earliest = latest - time_delta + + return df[df['timestamp'] >= earliest] + + raise RuntimeError("Unsupported max_history") + + @contextmanager + def _get_user_cache(self, user_id: str): + + # Determine cache location + cache_location = os.path.join(self._cache_dir, f"{user_id}.pkl") + + user_cache = None + + user_cache = self._user_cache_map.get(user_id, None) + + if (user_cache is None): + user_cache = CachedUserWindow(user_id=user_id, + cache_location=cache_location, + timestamp_column=self._config.ae.timestamp_column_name) + + self._user_cache_map[user_id] = user_cache + + yield user_cache + + # # When it returns, make sure to save + # user_cache.save() + + def _build_window(self, message: DFPMessageMeta) -> MultiDFPMessage: + + user_id = message.user_id + + with self._get_user_cache(user_id) as user_cache: + + incoming_df = message.get_df() + # existing_df = user_cache.df + + if (not user_cache.append_dataframe(incoming_df=incoming_df)): + # Then our incoming dataframe wasn't even covered by the window. Generate warning + logger.warning(("Incoming data preceded existing history. " + "Consider deleting the rolling window cache and restarting.")) + return None + + # Exit early if we don't have enough data + if (user_cache.count < self._min_history): + return None + + # We have enough data, but has enough time passed since the last training? + if (user_cache.total_count - user_cache.last_train_count < self._min_increment): + return None + + # Save the last train statistics + train_df = user_cache.get_train_df(max_history=self._max_history) + + # Hash the incoming data rows to find a match + incoming_hash = pd.util.hash_pandas_object(incoming_df.iloc[[0, -1]], index=False) + + # Find the index of the first and last row + match = train_df[train_df["_row_hash"] == incoming_hash.iloc[0]] + + if (len(match) == 0): + raise RuntimeError("Invalid rolling window") + + first_row_idx = match.index[0].item() + last_row_idx = train_df[train_df["_row_hash"] == incoming_hash.iloc[-1]].index[-1].item() + + found_count = (last_row_idx - first_row_idx) + 1 + + if (found_count != len(incoming_df)): + raise RuntimeError(("Overlapping rolling history detected. " + "Rolling history can only be used with non-overlapping batches")) + + train_offset = train_df.index.get_loc(first_row_idx) + + # Otherwise return a new message + return MultiDFPMessage(meta=DFPMessageMeta(df=train_df, user_id=user_id), + mess_offset=train_offset, + mess_count=found_count) + + def on_data(self, message: DFPMessageMeta): + + with log_time(logger.debug) as log_info: + + result = self._build_window(message) + + if (result is not None): + + log_info.set_log( + ("Rolling window complete for %s in {duration:0.2f} ms. " + "Input: %s rows from %s to %s.
Output: %s rows from %s to %s"), + message.user_id, + len(message.df), + message.df[self._config.ae.timestamp_column_name].min(), + message.df[self._config.ae.timestamp_column_name].max(), + result.mess_count, + result.get_meta(self._config.ae.timestamp_column_name).min(), + result.get_meta(self._config.ae.timestamp_column_name).max(), + ) + else: + # Dont print anything + log_info.disable() + + return result + + def _build_single(self, builder: srf.Builder, input_stream: StreamPair) -> StreamPair: + + def node_fn(obs: srf.Observable, sub: srf.Subscriber): + obs.pipe(ops.map(self.on_data), ops.filter(lambda x: x is not None)).subscribe(sub) + + stream = builder.make_node_full(self.unique_name, node_fn) + builder.make_edge(input_stream[0], stream) + + return stream, MultiDFPMessage diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_split_users_stage.py b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_split_users_stage.py new file mode 100644 index 0000000000..f35f3b6f94 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_split_users_stage.py @@ -0,0 +1,139 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import typing + +import numpy as np +import srf +from srf.core import operators as ops + +import cudf + +from morpheus.config import Config +from morpheus.pipeline.single_port_stage import SinglePortStage +from morpheus.pipeline.stream_pair import StreamPair + +from ..messages.multi_dfp_message import DFPMessageMeta +from ..utils.logging_timer import log_time + +logger = logging.getLogger("morpheus.{}".format(__name__)) + + +class DFPSplitUsersStage(SinglePortStage): + + def __init__(self, + c: Config, + include_generic: bool, + include_individual: bool, + skip_users: typing.List[str] = None, + only_users: typing.List[str] = None): + super().__init__(c) + + self._include_generic = include_generic + self._include_individual = include_individual + self._skip_users = skip_users if skip_users is not None else [] + self._only_users = only_users if only_users is not None else [] + + # Map of user ids to total number of messages. 
Keeps indexes monotonic and increasing per user + self._user_index_map: typing.Dict[str, int] = {} + + @property + def name(self) -> str: + return "dfp-split-users" + + def supports_cpp_node(self): + return False + + def accepted_types(self) -> typing.Tuple: + return (cudf.DataFrame, ) + + def extract_users(self, message: cudf.DataFrame): + if (message is None): + return [] + + with log_time(logger.debug) as log_info: + + if (isinstance(message, cudf.DataFrame)): + # Convert to pandas because cudf is slow at this + message = message.to_pandas() + + split_dataframes: typing.Dict[str, cudf.DataFrame] = {} + + # If we are skipping users, do that here + if (len(self._skip_users) > 0): + message = message[~message[self._config.ae.userid_column_name].isin(self._skip_users)] + + if (len(self._only_users) > 0): + message = message[message[self._config.ae.userid_column_name].isin(self._only_users)] + + # Split up the dataframes + if (self._include_generic): + split_dataframes[self._config.ae.fallback_username] = message + + if (self._include_individual): + + split_dataframes.update( + {username: user_df + for username, user_df in message.groupby("username", sort=False)}) + + output_messages: typing.List[DFPMessageMeta] = [] + + for user_id in sorted(split_dataframes.keys()): + + if (user_id in self._skip_users): + continue + + user_df = split_dataframes[user_id] + + current_user_count = self._user_index_map.get(user_id, 0) + + # Reset the index so that users see monotonically increasing indexes + user_df.index = range(current_user_count, current_user_count + len(user_df)) + self._user_index_map[user_id] = current_user_count + len(user_df) + + output_messages.append(DFPMessageMeta(df=user_df, user_id=user_id)) + + # logger.debug("Emitting dataframe for user '%s'. Start: %s, End: %s, Count: %s", + # user, + # df_user[self._config.ae.timestamp_column_name].min(), + # df_user[self._config.ae.timestamp_column_name].max(), + # df_user[self._config.ae.timestamp_column_name].count()) + + rows_per_user = [len(x.df) for x in output_messages] + + if (len(output_messages) > 0): + log_info.set_log( + ("Batch split users complete. Input: %s rows from %s to %s. " + "Output: %s users, rows/user min: %s, max: %s, avg: %.2f. Duration: {duration:.2f} ms"), + len(message), + message[self._config.ae.timestamp_column_name].min(), + message[self._config.ae.timestamp_column_name].max(), + len(rows_per_user), + np.min(rows_per_user), + np.max(rows_per_user), + np.mean(rows_per_user), + ) + + return output_messages + + def _build_single(self, builder: srf.Builder, input_stream: StreamPair) -> StreamPair: + + def node_fn(obs: srf.Observable, sub: srf.Subscriber): + obs.pipe(ops.map(self.extract_users), ops.flatten()).subscribe(sub) + + stream = builder.make_node_full(self.unique_name, node_fn) + builder.make_edge(input_stream[0], stream) + + return stream, DFPMessageMeta diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_training.py b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_training.py new file mode 100644 index 0000000000..1852be263c --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_training.py @@ -0,0 +1,98 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import typing + +import srf +from dfencoder import AutoEncoder +from srf.core import operators as ops + +from morpheus.config import Config +from morpheus.messages.multi_ae_message import MultiAEMessage +from morpheus.pipeline.single_port_stage import SinglePortStage +from morpheus.pipeline.stream_pair import StreamPair + +from ..messages.multi_dfp_message import MultiDFPMessage + +logger = logging.getLogger("morpheus.{}".format(__name__)) + + +class DFPTraining(SinglePortStage): + + def __init__(self, c: Config, model_kwargs: dict = None): + super().__init__(c) + + self._model_kwargs = { + "encoder_layers": [512, 500], # layers of the encoding part + "decoder_layers": [512], # layers of the decoding part + "activation": 'relu', # activation function + "swap_p": 0.2, # noise parameter + "lr": 0.001, # learning rate + "lr_decay": .99, # learning decay + "batch_size": 512, + "verbose": False, + "optimizer": 'sgd', # SGD optimizer is selected(Stochastic gradient descent) + "scaler": 'standard', # feature scaling method + "min_cats": 1, # cut off for minority categories + "progress_bar": False, + "device": "cuda" + } + + # Update the defaults + self._model_kwargs.update(model_kwargs if model_kwargs is not None else {}) + + @property + def name(self) -> str: + return "dfp-training" + + def supports_cpp_node(self): + return False + + def accepted_types(self) -> typing.Tuple: + return (MultiDFPMessage, ) + + def on_data(self, message: MultiDFPMessage): + if (message is None or message.mess_count == 0): + return None + + user_id = message.user_id + + model = AutoEncoder(**self._model_kwargs) + + final_df = message.get_meta_dataframe() + + # Only train on the feature columns + final_df = final_df[final_df.columns.intersection(self._config.ae.feature_columns)] + + logger.debug("Training AE model for user: '%s'...", user_id) + model.fit(final_df, epochs=30) + logger.debug("Training AE model for user: '%s'... Complete.", user_id) + + output_message = MultiAEMessage(message.meta, + mess_offset=message.mess_offset, + mess_count=message.mess_count, + model=model) + + return output_message + + def _build_single(self, builder: srf.Builder, input_stream: StreamPair) -> StreamPair: + + def node_fn(obs: srf.Observable, sub: srf.Subscriber): + obs.pipe(ops.map(self.on_data), ops.filter(lambda x: x is not None)).subscribe(sub) + + stream = builder.make_node_full(self.unique_name, node_fn) + builder.make_edge(input_stream[0], stream) + + return stream, MultiAEMessage diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/stages/multi_file_source.py b/examples/digital_fingerprinting/production/morpheus/dfp/stages/multi_file_source.py new file mode 100644 index 0000000000..abeb08af88 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp/stages/multi_file_source.py @@ -0,0 +1,87 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import typing + +import fsspec +import fsspec.utils +import srf + +from morpheus.config import Config +from morpheus.pipeline.single_output_source import SingleOutputSource +from morpheus.pipeline.stream_pair import StreamPair + +logger = logging.getLogger("morpheus.{}".format(__name__)) + + +class MultiFileSource(SingleOutputSource): + """ + Source stage used to load messages from files and dump the contents into the pipeline immediately. Useful for + testing the performance and accuracy of a pipeline. + + Parameters + ---------- + c : `morpheus.config.Config` + Pipeline configuration instance. + filenames : List[str] + List of paths to be read from, can be a list of S3 URLs (`s3://path`) and can include wildcard characters `*` + as defined by `fsspec`: + https://filesystem-spec.readthedocs.io/en/latest/api.html?highlight=open_files#fsspec.open_files + """ + + def __init__( + self, + c: Config, + filenames: typing.List[str], + ): + super().__init__(c) + + self._batch_size = c.pipeline_batch_size + + self._filenames = filenames + + self._input_count = None + self._max_concurrent = c.num_threads + + @property + def name(self) -> str: + return "from-multi-file" + + @property + def input_count(self) -> int: + """Return None for no max input count""" + return self._input_count + + def supports_cpp_node(self): + return False + + def _generate_frames_fsspec(self): + + files: fsspec.core.OpenFiles = fsspec.open_files(self._filenames, filecache={'cache_storage': './.cache/s3tmp'}) + + if (len(files) == 0): + raise RuntimeError(f"No files matched input strings: '{self._filenames}'. " + "Check your input pattern and ensure any credentials are correct") + + yield files + + def _build_source(self, builder: srf.Builder) -> StreamPair: + + if self._build_cpp_node(): + raise RuntimeError("Does not support C++ nodes") + else: + out_stream = builder.make_source(self.unique_name, self._generate_frames_fsspec()) + + return out_stream, fsspec.core.OpenFiles diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/stages/write_to_s3_stage.py b/examples/digital_fingerprinting/production/morpheus/dfp/stages/write_to_s3_stage.py new file mode 100644 index 0000000000..86f6123a64 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp/stages/write_to_s3_stage.py @@ -0,0 +1,70 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
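Editor's note on `MultiFileSource` above: as a quick illustration of the `filenames` contract described in its docstring, `fsspec.open_files` expands glob patterns for both local paths and remote URLs (an `s3fs` install and valid credentials are assumed for `s3://` paths). The path below is invented.

```python
# Illustrative expansion of a wildcard input; mirrors _generate_frames_fsspec above.
import fsspec

files = fsspec.open_files(["./sample-data/AZUREAD_2022-08-*.json"])

if (len(files) == 0):
    raise RuntimeError("No files matched input strings")

for file in files:
    print(file.path)
```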
+ +import typing + +import srf + +from morpheus.config import Config +from morpheus.pipeline.single_port_stage import SinglePortStage +from morpheus.pipeline.stream_pair import StreamPair + + +class WriteToS3Stage(SinglePortStage): + """ + This class writes messages to an s3 bucket. + + Parameters + ---------- + c : `morpheus.config.Config` + Pipeline configuration instance. + bucket: str + Name of the s3 bucket to write to. + + """ + + def __init__(self, c: Config, s3_writer): + super().__init__(c) + + self._s3_writer = s3_writer + + @property + def name(self) -> str: + return "to-s3-bucket" + + def accepted_types(self) -> typing.Tuple: + """ + Returns accepted input types for this stage. + + Returns + ------- + typing.Tuple(`morpheus.messages.message_meta.MessageMeta`, ) + Accepted input types. + + """ + return (typing.Any, ) + + def supports_cpp_node(self): + return False + + def _build_single(self, builder: srf.Builder, input_stream: StreamPair) -> StreamPair: + stream = input_stream[0] + + node = builder.make_node(self.unique_name, self._s3_writer) + builder.make_edge(stream, node) + + stream = node + + # Return input unchanged to allow passthrough + return stream, input_stream[1] diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/utils/__init__.py b/examples/digital_fingerprinting/production/morpheus/dfp/utils/__init__.py new file mode 100644 index 0000000000..d11ef3c507 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp/utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/utils/column_info.py b/examples/digital_fingerprinting/production/morpheus/dfp/utils/column_info.py new file mode 100644 index 0000000000..3209d2b27e --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp/utils/column_info.py @@ -0,0 +1,280 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
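Editor's note on `WriteToS3Stage` above: the stage delegates all of the actual S3 work to the injected `s3_writer` callable, which receives each message and should return it so the stage stays a passthrough. A hypothetical writer using `boto3` (the bucket name and key layout are invented, and boto3 with configured AWS credentials is assumed) could look like this:

```python
# Hypothetical s3_writer callable for WriteToS3Stage; not part of this change.
import boto3

s3_client = boto3.client("s3")

def example_s3_writer(message):
    # DFPMessageMeta (added in this change) exposes the dataframe and the user id
    body = message.df.to_json(orient="records", lines=True).encode("utf-8")

    s3_client.put_object(Bucket="example-dfp-detections",
                         Key=f"{message.user_id}/events.jsonl",
                         Body=body)

    # Return the message unchanged so downstream stages still receive it
    return message
```

The stage would then be constructed as `WriteToS3Stage(config, s3_writer=example_s3_writer)`.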
+ +import dataclasses +import logging +import re +import typing +from datetime import datetime + +import pandas as pd + +import cudf + +logger = logging.getLogger("morpheus.{}".format(__name__)) + + +def create_increment_col(df, column_name: str, groupby_column="username", timestamp_column="timestamp"): + DEFAULT_DATE = '1970-01-01T00:00:00.000000+00:00' + + # Ensure we are pandas for this + if (isinstance(df, cudf.DataFrame)): + df = df.to_pandas() + + time_col = pd.to_datetime(df[timestamp_column], errors='coerce', utc=True).fillna(pd.to_datetime(DEFAULT_DATE)) + + per_day = time_col.dt.to_period("D") + + cat_col: pd.Series = df.groupby([per_day, groupby_column + ])[column_name].transform(lambda x: pd.factorize(x.fillna("nan"))[0] + 1) + + increment_col = pd.concat([cat_col, df[groupby_column]], + axis=1).groupby([per_day, groupby_column + ])[column_name].expanding(1).max().droplevel(0).droplevel(0) + + return increment_col + + +def column_listjoin(df, col_name): + if col_name in df: + return df[col_name].transform(lambda x: ",".join(x)).astype('string') + else: + return pd.Series(None, dtype='string') + + +@dataclasses.dataclass +class ColumnInfo: + name: str + dtype: str # The final type + + def get_pandas_dtype(self): + + if (issubclass(self.dtype, datetime)): + return "datetime64[ns]" + else: + return self.dtype + + def process_column(self, df: pd.DataFrame) -> pd.Series: + if (self.name not in df.columns): + return pd.Series(None, index=df.index, dtype=self.get_pandas_dtype()) + + return df[self.name] + + +@dataclasses.dataclass +class CustomColumn(ColumnInfo): + process_column_fn: typing.Callable + + def process_column(self, df: pd.DataFrame) -> pd.Series: + return self.process_column_fn(df) + + +@dataclasses.dataclass +class RenameColumn(ColumnInfo): + input_name: str + + def process_column(self, df: pd.DataFrame) -> pd.Series: + + if (self.input_name not in df.columns): + return pd.Series(None, index=df.index, dtype=self.get_pandas_dtype()) + + return df[self.input_name] + + +@dataclasses.dataclass +class BoolColumn(RenameColumn): + value_map: typing.Dict[str, bool] = dataclasses.field(init=False, default_factory=dict) + + true_value: dataclasses.InitVar[str] = None + false_value: dataclasses.InitVar[str] = None + + true_values: dataclasses.InitVar[typing.List[str]] = None + false_values: dataclasses.InitVar[typing.List[str]] = None + + def __post_init__(self, + true_value: str, + false_value: str, + true_values: typing.List[str], + false_values: typing.List[str]): + if (true_value is not None): + self.value_map.update({true_value: True}) + + if (false_value is not None): + self.value_map.update({false_value: False}) + + if (true_values is not None): + self.value_map.update({v: True for v in true_values}) + + if (false_values is not None): + self.value_map.update({v: False for v in false_values}) + + def process_column(self, df: pd.DataFrame) -> pd.Series: + return super().process_column(df).map(self.value_map).astype(bool) + + +@dataclasses.dataclass +class DateTimeColumn(RenameColumn): + + def process_column(self, df: pd.DataFrame) -> pd.Series: + return pd.to_datetime(super().process_column(df), infer_datetime_format=True, utc=True) + + +@dataclasses.dataclass +class StringJoinColumn(RenameColumn): + + sep: str + + def process_column(self, df: pd.DataFrame) -> pd.Series: + + return super().process_column(df).str.join(sep=self.sep) + + +@dataclasses.dataclass +class StringCatColumn(ColumnInfo): + + input_columns: typing.List[str] + sep: str + + def process_column(self, df: 
pd.DataFrame) -> pd.Series: + + first_col = df[self.input_columns[0]] + + return first_col.str.cat(others=df[self.input_columns[1:]], sep=self.sep) + + +@dataclasses.dataclass +class IncrementColumn(DateTimeColumn): + groupby_column: str + period: str = "D" + + def process_column(self, df: pd.DataFrame) -> pd.Series: + per_day = super().process_column(df).dt.to_period(self.period) + + # Create the per-user, per-day log count + return df.groupby([self.groupby_column, per_day]).cumcount() + + +@dataclasses.dataclass +class DataFrameInputSchema: + json_columns: typing.List[str] = dataclasses.field(default_factory=list) + column_info: typing.List[ColumnInfo] = dataclasses.field(default_factory=list) + preserve_columns: re.Pattern = dataclasses.field(default_factory=list) + row_filter: typing.Callable[[pd.DataFrame], pd.DataFrame] = None + + def __post_init__(self): + + input_preserve_columns = self.preserve_columns + + # Ensure preserve_columns is a list + if (not isinstance(input_preserve_columns, list)): + input_preserve_columns = [input_preserve_columns] + + # Compile the regex + if (input_preserve_columns is not None and len(input_preserve_columns) > 0): + input_preserve_columns = re.compile("({})".format("|".join(input_preserve_columns))) + else: + input_preserve_columns = None + + self.preserve_columns = input_preserve_columns + + +def _process_columns(df_in: pd.DataFrame, input_schema: DataFrameInputSchema): + + # TODO(MDD): See what causes this to have such a perf impact over using df_in + output_df = pd.DataFrame() + + # Iterate over the column info + for ci in input_schema.column_info: + try: + output_df[ci.name] = ci.process_column(df_in) + except Exception: + logger.exception("Failed to process column '%s'. Dataframe: \n%s", ci.name, df_in, exc_info=True) + raise + + if (input_schema.preserve_columns is not None): + # Get the list of remaining columns not already added + df_in_columns = set(df_in.columns) - set(output_df.columns) + + # Finally, keep any columns that match the preserve filters + match_columns = [y for y in df_in_columns if input_schema.preserve_columns.match(y)] + + output_df[match_columns] = df_in[match_columns] + + return output_df + + +def _normalize_dataframe(df_in: pd.DataFrame, input_schema: DataFrameInputSchema): + + if (input_schema.json_columns is None or len(input_schema.json_columns) == 0): + return df_in + + convert_to_cudf = False + + # Check if we are cudf + if (isinstance(df_in, cudf.DataFrame)): + df_in = df_in.to_pandas() + convert_to_cudf = True + + json_normalized = [] + remaining_columns = list(df_in.columns) + + for j_column in input_schema.json_columns: + + if (j_column not in remaining_columns): + continue + + normalized = pd.json_normalize(df_in[j_column]) + + # Prefix the columns + normalized.rename(columns={n: f"{j_column}.{n}" for n in normalized.columns}, inplace=True) + + # Reset the index otherwise there is a conflict + normalized.reset_index(drop=True, inplace=True) + + json_normalized.append(normalized) + + # Remove from the list of remaining columns + remaining_columns.remove(j_column) + + # Also need to reset the original index + df_in.reset_index(drop=True, inplace=True) + + df_normalized = pd.concat([df_in[remaining_columns]] + json_normalized, axis=1) + + if (convert_to_cudf): + return cudf.from_pandas(df_normalized) + + return df_normalized + + +def _filter_rows(df_in: pd.DataFrame, input_schema: DataFrameInputSchema): + + if (input_schema.row_filter is None): + return df_in + + return input_schema.row_filter(df_in) + + +def 
process_dataframe(df_in: pd.DataFrame, input_schema: DataFrameInputSchema): + + # Step 1 is to normalize any columns + df_processed = _normalize_dataframe(df_in, input_schema) + + # Step 2 is to process columns + df_processed = _process_columns(df_processed, input_schema) + + # Step 3 is to run the row filter if needed + df_processed = _filter_rows(df_processed, input_schema) + + return df_processed diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/utils/file_utils.py b/examples/digital_fingerprinting/production/morpheus/dfp/utils/file_utils.py new file mode 100644 index 0000000000..ea07771033 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp/utils/file_utils.py @@ -0,0 +1,51 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +from datetime import datetime +from datetime import timezone + +import fsspec + +iso_date_regex = re.compile(r"(?P<year>\d{4})-(?P<month>\d{1,2})-(?P<day>\d{1,2})" + r"T(?P<hour>\d{1,2}):(?P<minute>\d{1,2}):(?P<second>\d{1,2})(?P<microsecond>\.\d{1,6})?Z") + + +def date_extractor(file_object: fsspec.core.OpenFile, filename_regex: re.Pattern): + + assert isinstance(file_object, fsspec.core.OpenFile) + + file_path = file_object.path + + # Match regex with the pathname since that can be more accurate + match = filename_regex.search(file_path) + + if (match): + # Convert the regex match + groups = match.groupdict() + + if ("microsecond" in groups): + groups["microsecond"] = int(float(groups["microsecond"]) * 1000000) + + groups = {key: int(value) for key, value in groups.items()} + + groups["tzinfo"] = timezone.utc + + ts_object = datetime(**groups) + else: + # Otherwise, fallback to the file modified (created?) time + ts_object = file_object.fs.modified(file_object.path) + + return ts_object diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/utils/logging_timer.py b/examples/digital_fingerprinting/production/morpheus/dfp/utils/logging_timer.py new file mode 100644 index 0000000000..f673dfe2a7 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp/utils/logging_timer.py @@ -0,0 +1,64 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
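Editor's note on `file_utils.py` above: for reference, here is a small standalone check of how `iso_date_regex` (named groups restored) recovers a timezone-aware `datetime` from a log file name. The file name is invented; the steps mirror `date_extractor`.

```python
# Standalone check of the filename-to-datetime logic in date_extractor above.
import re
from datetime import datetime
from datetime import timezone

iso_date_regex = re.compile(r"(?P<year>\d{4})-(?P<month>\d{1,2})-(?P<day>\d{1,2})"
                            r"T(?P<hour>\d{1,2}):(?P<minute>\d{1,2}):(?P<second>\d{1,2})(?P<microsecond>\.\d{1,6})?Z")

match = iso_date_regex.search("AZUREAD_2022-08-01T03:02:01.123456Z.json")

groups = match.groupdict()
groups["microsecond"] = int(float(groups["microsecond"]) * 1000000)
groups = {key: int(value) for key, value in groups.items()}
groups["tzinfo"] = timezone.utc

print(datetime(**groups))  # 2022-08-01 03:02:01.123456+00:00
```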
+ +import dataclasses +import typing +import warnings +from contextlib import contextmanager + + +@dataclasses.dataclass +class LogTimeInfo(): + log_fn: typing.Callable + msg: str + args: typing.Tuple + kwargs: typing.Dict + disabled: bool = False + + def disable(self): + self.disabled = True + + def set_log(self, msg: str, *args, **kwargs): + self.msg = msg + self.args = args + self.kwargs = kwargs + + def _do_log_message(self, duration_ms: float): + + if (self.disabled): + return + + if (self.msg is None): + warnings.warn("Must set log msg before end of context! Skipping log") + return + + # Call the log function + self.log_fn(self.msg.format(**{"duration": duration_ms}), *self.args, **self.kwargs) + + +@contextmanager +def log_time(log_fn, msg: str = None, *args, **kwargs): + + # Create an info object to allow users to set the message in the context block + info = LogTimeInfo(log_fn=log_fn, msg=msg, args=args, kwargs=kwargs) + + import time + + start_time = time.time() + + yield info + + duration = (time.time() - start_time) * 1000.0 + + info._do_log_message(duration) diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/utils/model_cache.py b/examples/digital_fingerprinting/production/morpheus/dfp/utils/model_cache.py new file mode 100644 index 0000000000..7441f68f64 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp/utils/model_cache.py @@ -0,0 +1,335 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import hashlib +import logging +import threading +import typing +from contextlib import contextmanager +from datetime import datetime + +import mlflow +from dfencoder import AutoEncoder +from mlflow.entities.model_registry import RegisteredModel +from mlflow.exceptions import MlflowException +from mlflow.store.entities.paged_list import PagedList +from mlflow.tracking.client import MlflowClient + +from .logging_timer import log_time + +logger = logging.getLogger("morpheus.{}".format(__name__)) + + +@contextmanager +def timed_acquire(lock: threading.Lock, timeout: float): + result = lock.acquire(timeout=timeout) + + if (not result): + # Did not get the lock. 
Raise + raise TimeoutError() + + # Got the lock + try: + yield + + finally: + lock.release() + + +def user_to_model_name(user_id: str, model_name_formatter: str): + + kwargs = { + "user_id": user_id, + "user_md5": hashlib.md5(user_id.encode('utf-8')).hexdigest(), + } + + return model_name_formatter.format(**kwargs) + + +class ModelCache: + + def __init__(self, reg_model_name: str, reg_model_version: str, model_uri: str) -> None: + + self._reg_model_name = reg_model_name + self._reg_model_version = reg_model_version + self._model_uri = model_uri + + self._last_checked: datetime = datetime.now() + self._last_used: datetime = self._last_checked + + self._lock = threading.Lock() + self._model: AutoEncoder = None + + @property + def reg_model_name(self): + return self._reg_model_name + + @property + def reg_model_version(self): + return self._reg_model_version + + @property + def model_uri(self): + return self._model_uri + + @property + def last_used(self): + return self._last_used + + @property + def last_checked(self): + return self._last_checked + + def load_model(self, client) -> AutoEncoder: + + now = datetime.now() + + # Ensure multiple people do not try to load at the same time + with self._lock: + + if (self._model is None): + + # Cache miss. Release the lock while we check + try: + with log_time( + logger.debug, + f"Downloaded model '{self.reg_model_name}:{self.reg_model_version}' in {{duration}} ms"): + self._model = mlflow.pytorch.load_model(model_uri=self._model_uri) + + except MlflowException: + logger.error("Error downloading model for URI: %s", self._model_uri, exc_info=True) + raise + + # Update the last time this was used + self._last_used = now + + return self._model + + +class UserModelMap: + + def __init__(self, manager: "ModelManager", user_id: str, fallback_user_ids: typing.List[str]): + + self._manager = manager + self._user_id = user_id + self._fallback_user_ids = fallback_user_ids + self._reg_model_name = manager.user_id_to_model(user_id) + self._last_checked = None + + self._lock = threading.RLock() + self._child_user_model_cache: UserModelMap = None + + def load_model_cache(self, client) -> ModelCache: + + now = datetime.now() + + # Lock to prevent additional access + try: + with timed_acquire(self._lock, timeout=1.0): + + # Check if we have checked before or if we need to check again + if (self._last_checked is None or (now - self._last_checked).seconds < self._manager.cache_timeout_sec): + + # Save the last checked time + self._last_checked = now + + # Try to load from the manager + model_cache = self._manager.load_model_cache(client=client, reg_model_name=self._reg_model_name) + + # If we have a hit, there is nothing else to do + if (model_cache is None and len(self._fallback_user_ids) > 0): + # Our model does not exist, use fallback + self._child_user_model_cache = self._manager.load_user_model_cache( + self._fallback_user_ids[0], fallback_user_ids=self._fallback_user_ids[1:]) + else: + return model_cache + + # See if we have a child cache and use that + if (self._child_user_model_cache is not None): + return self._child_user_model_cache.load_model_cache(client=client) + + # Otherwise load the model + model_cache = self._manager.load_model_cache(client=client, reg_model_name=self._reg_model_name) + + if (model_cache is None): + raise RuntimeError("Model was found but now no longer exists. Model: {}".format( + self._reg_model_name)) + + return model_cache + except TimeoutError: + logger.error("Deadlock detected while loading model cache. 
Please report this to the developers.") + raise RuntimeError("Deadlock detected while loading model cache") + + +class ModelManager: + + def __init__(self, model_name_formatter: str) -> None: + self._model_name_formatter = model_name_formatter + + self._user_model_cache: typing.Dict[str, UserModelMap] = {} + + self._model_cache: typing.Dict[str, ModelCache] = {} + self._model_cache_size_max = 100 + + self._cache_timeout_sec = 600 + + self._user_model_cache_lock = threading.RLock() + self._model_cache_lock = threading.RLock() + + self._existing_models: typing.Set[str] = set() + self._existing_models_updated = datetime(1970, 1, 1) + + # Force an update of the existing models + self._model_exists("") + + @property + def cache_timeout_sec(self): + return self._cache_timeout_sec + + def _model_exists(self, reg_model_name: str) -> bool: + + now = datetime.now() + + # See if the list of models needs to be updated + if ((now - self._existing_models_updated).seconds > self._cache_timeout_sec): + + try: + with timed_acquire(self._model_cache_lock, timeout=1.0): + + logger.debug("Updating list of available models...") + client = MlflowClient() + + results: PagedList[RegisteredModel] = PagedList([], token=None) + + # Clear the set to handle the case where a model has been removed + self._existing_models.clear() + + # Loop over the registered models with the pagination + while ((results := client.list_registered_models(max_results=1000, page_token=results.token)) + is not None): + + self._existing_models.update(model.name for model in results) + + if (len(results.token) == 0): + break + + logger.debug("Updating list of available models... Done.") + + # Save the update time + self._existing_models_updated = now + + except TimeoutError: + logger.error("Deadlock detected checking for new models. Please report this to the developers.") + raise RuntimeError("Deadlock detected checking for new models") + except Exception: + logger.exception("Exception occurred when querying the list of available models", exc_info=True) + raise + + return reg_model_name in self._existing_models + + def user_id_to_model(self, user_id: str): + return user_to_model_name(user_id=user_id, model_name_formatter=self._model_name_formatter) + + def load_user_model(self, client, user_id: str, fallback_user_ids: typing.List[str] = []) -> ModelCache: + + # First get the UserModel + user_model_cache = self.load_user_model_cache(user_id=user_id, fallback_user_ids=fallback_user_ids) + + return user_model_cache.load_model_cache(client=client) + + def load_model_cache(self, client: MlflowClient, reg_model_name: str) -> ModelCache: + + now = datetime.now() + + try: + with timed_acquire(self._model_cache_lock, timeout=1.0): + + model_cache = self._model_cache.get(reg_model_name, None) + + # Make sure it hasn't been too long since we checked + if (model_cache is not None and (now - model_cache.last_checked).seconds < self._cache_timeout_sec): + + return model_cache + + # Cache miss. Try to check for a model + try: + if (not self._model_exists(reg_model_name)): + # Break early + return None + + latest_versions = client.get_latest_versions(reg_model_name) + + if (len(latest_versions) == 0): + # Databricks doesn't like the `get_latest_versions` method for some reason.
Before failing, try + # to just get the model and then use latest versions + reg_model_obj = client.get_registered_model(reg_model_name) + + latest_versions = None if reg_model_obj is None else reg_model_obj.latest_versions + + if (len(latest_versions) == 0): + logger.warning( + ("Registered model with no versions detected. Consider deleting this registered model." + "Using fallback model. Model: %s, "), + reg_model_name) + return None + + # Default to the first returned one + latest_model_version = latest_versions[0] + + if (len(latest_versions) > 1): + logger.warning(("Multiple models in different stages detected. " + "Defaulting to first returned. Model: %s, Version: %s, Stage: %s"), + reg_model_name, + latest_model_version.version, + latest_model_version.current_stage) + + model_cache = ModelCache(reg_model_name=reg_model_name, + reg_model_version=latest_model_version.version, + model_uri=latest_model_version.source) + + except MlflowException as e: + if e.error_code == 'RESOURCE_DOES_NOT_EXIST': + # No user found + return None + + raise + + # Save the cache + self._model_cache[reg_model_name] = model_cache + + # Check if we need to push out a cache entry + if (len(self._model_cache) > self._model_cache_size_max): + time_sorted = sorted([(k, v) for k, v in self._model_cache.items()], key=lambda x: x[1].last_used) + to_delete = time_sorted[0][0] + self._model_cache.pop(to_delete) + + return model_cache + + except TimeoutError: + logger.error("Deadlock when trying to acquire model cache lock") + raise RuntimeError("Deadlock when trying to acquire model cache lock") + + def load_user_model_cache(self, user_id: str, fallback_user_ids: typing.List[str] = []) -> UserModelMap: + try: + with timed_acquire(self._user_model_cache_lock, timeout=1.0): + + if (user_id not in self._user_model_cache): + self._user_model_cache[user_id] = UserModelMap(manager=self, + user_id=user_id, + fallback_user_ids=fallback_user_ids) + + return self._user_model_cache[user_id] + except TimeoutError: + logger.error("Deadlock when trying to acquire user model cache lock") + raise RuntimeError("Deadlock when trying to acquire user model cache lock") diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/utils/user_model_manager.py b/examples/digital_fingerprinting/production/morpheus/dfp/utils/user_model_manager.py new file mode 100644 index 0000000000..dc766e7d54 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp/utils/user_model_manager.py @@ -0,0 +1,251 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
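Editor's note on `ModelManager` above: the `model_name_formatter` it consumes may reference two placeholders, `{user_id}` and `{user_md5}`, as implemented by `user_to_model_name`. A quick illustration with an invented formatter and user id (neither is a project default):

```python
# Illustrates the two placeholders supported by user_to_model_name above.
import hashlib

user_id = "alice@example.com"
user_md5 = hashlib.md5(user_id.encode('utf-8')).hexdigest()

print("DFP-azure-{user_id}".format(user_id=user_id, user_md5=user_md5))
print("DFP-azure-{user_md5}".format(user_id=user_id, user_md5=user_md5))
```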
+ +import logging +import typing + +import numpy as np +import pandas as pd +import torch +from dfencoder import AutoEncoder +from tqdm import tqdm + +from morpheus.config import Config + +logger = logging.getLogger("morpheus.{}".format(__name__)) + + +class DFPDataLoader: + + def __init__(self, batch_frames, filter_func, max_rows_per_batch=50000): + self._aggregate_cache = None + self._batch_frames = batch_frames + self._current_index = 0 + self._filter_func = filter_func + self._frame_count = len(self._batch_frames) + self._max_rows_per_batch = max_rows_per_batch + self._sample_frame = None + + def reset(self): + self._current_index = 0 + + def get_sample_frame(self): + return self._sample_frame + + def get_next_frame(self): + if (self._current_index == self._frame_count): + return None + + if (self._aggregate_cache is not None): + self._current_index = self._frame_count + return self._aggregate_cache + + total_frames = 0 + aggregate_rows = 0 + aggregate_frame = pd.DataFrame() + while (True): + df_frame = self._filter_func(pd.read_pickle(self._batch_frames[self._current_index])) + + # Save the first row and the last row from every batch. Helps with statistics down the line + if (self._sample_frame is None): + self._sample_frame = df_frame.head(1) + + self._sample_frame = self._sample_frame.append(df_frame.tail(1)) + + rows = df_frame.shape[0] + + if (aggregate_rows + rows < self._max_rows_per_batch): + aggregate_frame = pd.concat([aggregate_frame, df_frame]) + aggregate_rows += rows + total_frames += 1 + + self._current_index = min((self._current_index + 1), self._frame_count) + else: # Adding another frame would exceed our memory limit, return + if (total_frames == self._frame_count): + logger.debug("Caching full training set.") + self._aggregate_cache = aggregate_frame + + return aggregate_frame + + if (self._current_index != self._frame_count): + continue + + # Epoch rolled, return what we have + if (total_frames == self._frame_count): + logger.debug("Caching full training set.") + self._aggregate_cache = aggregate_frame + + return aggregate_frame + + +class InsufficientDataError(RuntimeError): + pass + + +class UserModelManager(object): + + def __init__(self, + c: Config, + user_id: str, + save_model: bool, + epochs: int, + min_history: int, + max_history: int, + seed: int = None, + batch_files: typing.List = [], + model_class=AutoEncoder) -> None: + super().__init__() + + self._user_id = user_id + self._history: pd.DataFrame = None + self._min_history: int = min_history + self._max_history: int = max_history + self._seed: int = seed + self._feature_columns = c.ae.feature_columns + self._epochs = epochs + self._save_model = save_model + self._model_class = model_class + self._batch_files = batch_files + + self._model: AutoEncoder = None + + self._last_train_count = 0 + + @property + def model(self): + return self._model + + def train_from_batch(self, filter_func=lambda df: df): + if (not self._batch_files): + return None + + # If the seed is set, enforce that here + if (self._seed is not None): + torch.manual_seed(self._seed) + torch.cuda.manual_seed(self._seed) + np.random.seed(self._seed) + torch.backends.cudnn.deterministic = True + + model = self._model_class( + encoder_layers=[512, 500], # layers of the encoding part + decoder_layers=[512], # layers of the decoding part + activation='relu', # activation function + swap_p=0.2, # noise parameter + lr=0.001, # learning rate + lr_decay=.99, # learning decay + batch_size=512, + # logger='ipynb', + verbose=False, + optimizer='sgd', # 
SGD optimizer is selected(Stochastic gradient descent) + scaler='standard', # feature scaling method + min_cats=1, # cut off for minority categories + progress_bar=False, + device="cuda") + + # Loop each epoch + logger.debug("Training AE model for user: '%s'...", self._user_id) + loader = DFPDataLoader(self._batch_files, filter_func) + try: + for _ in tqdm(range(self._epochs), desc="Training"): + batches = 0 + while (True): + df_batch = loader.get_next_frame() + if (df_batch is None): + break + + if (batches == 0 and (df_batch.shape[0] < self._min_history)): + raise InsufficientDataError("Insuffient training data.") + + if (df_batch.shape[0] < 10): # If we've already trained on some data, make sure we can tts this. + break + + model.fit(df_batch) + batches += 1 + + loader.reset() + + if (self._save_model): + self._model = model + + logger.debug("Training AE model for user: '%s'... Complete.", self._user_id) + + return model, loader.get_sample_frame() + except InsufficientDataError: + logger.debug(f"Training AE model for user: '{self._user_id}... Skipped") + return None, None + except Exception: + logger.exception("Error during training for user: %s", self._user_id, exc_info=True) + return None, None + + def train(self, df: pd.DataFrame) -> AutoEncoder: + + # Determine how much history to save + if (self._history is not None): + if (self._max_history > 0): + to_drop = max(len(df) + len(self._history) - self._max_history, 0) + else: + to_drop = 0 + + history = self._history.iloc[to_drop:, :] + + combined_df = pd.concat([history, df]) + else: + combined_df = df + + # Save the history for next time + if (self._max_history > 0): + self._history = combined_df.iloc[max(0, len(combined_df) - self._max_history):, :] + else: + self._history = combined_df + + # Ensure we have enough data + if (len(combined_df) < self._last_train_count + self._min_history): + return None + + # If the seed is set, enforce that here + if (self._seed is not None): + torch.manual_seed(self._seed) + torch.cuda.manual_seed(self._seed) + np.random.seed(self._seed) + torch.backends.cudnn.deterministic = True + + model = self._model_class( + encoder_layers=[512, 500], # layers of the encoding part + decoder_layers=[512], # layers of the decoding part + activation='relu', # activation function + swap_p=0.2, # noise parameter + lr=0.001, # learning rate + lr_decay=.99, # learning decay + batch_size=4096, + # logger='ipynb', + verbose=False, + optimizer='sgd', # SGD optimizer is selected(Stochastic gradient descent) + scaler='standard', # feature scaling method + min_cats=1, # cut off for minority categories + progress_bar=False, + device="cuda") + + final_df = combined_df[combined_df.columns.intersection(self._feature_columns)] + + logger.debug("Training AE model for user: '%s'...", self._user_id) + model.fit(final_df, epochs=self._epochs) + logger.debug("Training AE model for user: '%s'... 
Complete.", self._user_id) + + # Save the train count to prevent retrains + self._last_train_count = len(final_df) + + if (self._save_model): + self._model = model + + return model diff --git a/examples/digital_fingerprinting/production/morpheus/dfp_azure_inference.ipynb b/examples/digital_fingerprinting/production/morpheus/dfp_azure_inference.ipynb new file mode 100644 index 0000000000..06f9ab0352 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp_azure_inference.ipynb @@ -0,0 +1,453 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2941e94f-db20-44a5-ab87-2cab499825f7", + "metadata": {}, + "source": [ + "# Digital Finger Printing (DFP) with Morpheus - Azure Inference\n", + "## Introduction\n", + "\n", + "In this notebook, we will be building and running a DFP pipeline that performs inference on Azure logs. The goal is to use the pretrained models generated in the Duo Training notebook to generate anomaly scores for each log. These anomaly scores can be used by security teams to detect abnormal behavior when it happens so the proper action can be taken.\n", + "\n", + "
\n", + "Note: For more information on DFP, the Morpheus pipeline, and setup steps to run this notebook, please see the coresponding DFP training materials.\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6c1cb50-74f2-445d-b865-8c22c3b3798b", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "# Ensure that the morpheus directory is in the python path. This may not need to be run depending on the environment setup\n", + "import sys\n", + "import os\n", + "sys.path.insert(0, os.path.abspath(\"./morpheus\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "102ce011-3ca3-4f96-a72d-de28fad32003", + "metadata": {}, + "outputs": [], + "source": [ + "import functools\n", + "import logging\n", + "import os\n", + "import typing\n", + "from datetime import datetime\n", + "from functools import partial\n", + "\n", + "import click\n", + "import mlflow\n", + "from dfp.stages.dfp_file_batcher_stage import DFPFileBatcherStage\n", + "from dfp.stages.dfp_file_to_df import DFPFileToDataFrameStage\n", + "from dfp.stages.dfp_inference_stage import DFPInferenceStage\n", + "from dfp.stages.dfp_mlflow_model_writer import DFPMLFlowModelWriterStage\n", + "from dfp.stages.dfp_postprocessing_stage import DFPPostprocessingStage\n", + "from dfp.stages.dfp_preprocessing_stage import DFPPreprocessingStage\n", + "from dfp.stages.dfp_rolling_window_stage import DFPRollingWindowStage\n", + "from dfp.stages.dfp_split_users_stage import DFPSplitUsersStage\n", + "from dfp.stages.dfp_training import DFPTraining\n", + "from dfp.stages.multi_file_source import MultiFileSource\n", + "from dfp.utils.column_info import BoolColumn\n", + "from dfp.utils.column_info import ColumnInfo\n", + "from dfp.utils.column_info import CustomColumn\n", + "from dfp.utils.column_info import DataFrameInputSchema\n", + "from dfp.utils.column_info import DateTimeColumn\n", + "from dfp.utils.column_info import IncrementColumn\n", + "from dfp.utils.column_info import RenameColumn\n", + "from dfp.utils.column_info import StringCatColumn\n", + "from dfp.utils.column_info import create_increment_col\n", + "from dfp.utils.file_utils import date_extractor\n", + "from dfp.utils.file_utils import iso_date_regex\n", + "\n", + "from morpheus._lib.file_types import FileTypes\n", + "from morpheus.cli.utils import get_package_relative_file\n", + "from morpheus.cli.utils import load_labels_file\n", + "from morpheus.config import Config\n", + "from morpheus.config import ConfigAutoEncoder\n", + "from morpheus.config import CppConfig\n", + "from morpheus.pipeline import LinearPipeline\n", + "from morpheus.stages.general.monitor_stage import MonitorStage\n", + "from morpheus.stages.output.write_to_file_stage import WriteToFileStage\n", + "from morpheus.utils.logger import configure_logging\n", + "from morpheus.utils.logger import get_log_levels\n", + "from morpheus.utils.logger import parse_log_level\n", + "\n", + "# Left align all tables\n", + "from IPython.core.display import HTML\n", + "table_css = 'table {align:left;display:block}'\n", + "HTML(''.format(table_css))" + ] + }, + { + "cell_type": "markdown", + "id": "ca38c1b7-ce84-43e0-ac53-280562dc1642", + "metadata": {}, + "source": [ + "## High Level Configuration\n", + "\n", + "The following options significantly alter the functionality of the pipeline. These options are separated from the individual stage options since they may effect more than one stage. 
Additionally, the matching python script to this notebook, `dfp_azure_pipeline.py`, configures these options via command line arguments.\n", + "\n", + "### Options\n", + "\n", + "| Name | Type | Description |\n", + "| --- | --- | :-- |\n", + "| `train_users` | One of `[\"none\"]` | For inference, this option should always be `\"none\"` |\n", + "| `skip_users` | List of strings | Any user in this list will be dropped from the pipeline. Useful for debugging to remove automated accounts with many logs. |\n", + "| `cache_dir` | string | The location to store cached files. To aid with development and reduce bandwidth, the Morpheus pipeline will cache data from several stages of the pipeline. This option configures the location for those caches. |\n", + "| `input_files` | List of strings | List of files to process. Can specificy multiple arguments for multiple files. Also accepts glob (\\*) wildcards and schema prefixes such as `s3://`. For example, to make a local cache of an s3 bucket, use `filecache::s3://mybucket/*`. See `fsspec` documentation for list of possible options. |\n", + "| `model_name_formatter` | string | A format string to use when building the model name. The model name is constructed by calling `model_name_formatter.format(user_id=user_id)`. For example, with `model_name_formatter=\"my_model-{user_id}\"` and a user ID of `\"first:last\"` would result in the model name of `\"my_model-first:last\"`. This should match the value used in `DFPMLFlowModelWriterStage`. Available keyword arguments: `user_id`, `user_md5`. |\n", + "| `experiment_name_formatter` | string | A format string (without the `f`) that will be used when creating an experiment in ML Flow. Available keyword arguments: `user_id`, `user_md5`, `reg_model_name`. |\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ee00703-75c5-46fc-890c-86733da906c4", + "metadata": {}, + "outputs": [], + "source": [ + "# Global options\n", + "train_users = \"none\"\n", + "\n", + "# Enter any users to skip here\n", + "skip_users: typing.List[str] = []\n", + "\n", + "# Location where cache objects will be saved\n", + "cache_dir = \"./.cache/dfp\"\n", + "\n", + "# Input files to read from\n", + "input_files = [\n", + " \"/workspace/examples/data/dfp/azure/public_preview_final/s3/AZUREAD_2022-08-3*.json\",\n", + "]\n", + "\n", + "# The format to use for models\n", + "model_name_formatter = \"DFP-azure-{user_id}\"\n", + "\n", + "# === Derived Options ===\n", + "# To include the generic, we must be training all or generic\n", + "include_generic = train_users == \"all\" or train_users == \"generic\"\n", + "\n", + "# To include individual, we must be either training or inferring\n", + "include_individual = train_users != \"generic\"\n", + "\n", + "# None indicates we arent training anything\n", + "is_training = train_users != \"none\"" + ] + }, + { + "cell_type": "markdown", + "id": "1cfc24c9-c85e-4977-a348-692c8f0aceaa", + "metadata": {}, + "source": [ + "### Global Config Object\n", + "Before creating the pipeline, we need to setup logging and set the parameters for the Morpheus config object. This config object is responsible for the following:\n", + " - Indicating whether to use C++ or Python stages\n", + " - C++ stages are not supported for the DFP pipeline. This should always be `False`\n", + " - Setting the number of threads to use in the pipeline. 
Defaults to the thread count of the OS.\n", + " - Sets the feature column names that will be used in model training\n", + " - This option allows extra columns to be used in the pipeline that will not be part of the training algorithm.\n", + " - The final features that the model will be trained on will be an intersection of this list with the log columns.\n", + " - The column name that indicates the user's unique identifier\n", + " - It is required for DFP to have a user ID column\n", + " - The column name that indicates the timestamp for the log\n", + " - It is required for DFP to know when each log occurred" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "01abd537-9162-49dc-8e83-d9465592f1d5", + "metadata": {}, + "outputs": [], + "source": [ + "# Enable the Morpheus logger\n", + "configure_logging(log_level=logging.DEBUG)\n", + "\n", + "config = Config()\n", + "\n", + "CppConfig.set_should_use_cpp(False)\n", + "\n", + "config.num_threads = os.cpu_count()\n", + "\n", + "config.ae = ConfigAutoEncoder()\n", + "\n", + "config.ae.feature_columns = [\n", + " \"appDisplayName\", \"clientAppUsed\", \"deviceDetailbrowser\", \"deviceDetaildisplayName\", \"deviceDetailoperatingSystem\", \"statusfailureReason\", \"appincrement\", \"locincrement\", \"logcount\", \n", + "]\n", + "config.ae.userid_column_name = \"username\"\n", + "config.ae.timestamp_column_name = \"timestamp\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a73a4d53-32b6-4ab8-a5d7-c0104b31c69b", + "metadata": {}, + "outputs": [], + "source": [ + "# Specify the column names to ensure all data is uniform\n", + "source_column_info = [\n", + " DateTimeColumn(name=config.ae.timestamp_column_name, dtype=datetime, input_name=\"time\"),\n", + " RenameColumn(name=config.ae.userid_column_name, dtype=str, input_name=\"properties.userPrincipalName\"),\n", + " RenameColumn(name=\"appDisplayName\", dtype=str, input_name=\"properties.appDisplayName\"),\n", + " ColumnInfo(name=\"category\", dtype=str),\n", + " RenameColumn(name=\"clientAppUsed\", dtype=str, input_name=\"properties.clientAppUsed\"),\n", + " RenameColumn(name=\"deviceDetailbrowser\", dtype=str, input_name=\"properties.deviceDetail.browser\"),\n", + " RenameColumn(name=\"deviceDetaildisplayName\", dtype=str, input_name=\"properties.deviceDetail.displayName\"),\n", + " RenameColumn(name=\"deviceDetailoperatingSystem\",\n", + " dtype=str,\n", + " input_name=\"properties.deviceDetail.operatingSystem\"),\n", + " StringCatColumn(name=\"location\",\n", + " dtype=str,\n", + " input_columns=[\n", + " \"properties.location.city\",\n", + " \"properties.location.countryOrRegion\",\n", + " ],\n", + " sep=\", \"),\n", + " RenameColumn(name=\"statusfailureReason\", dtype=str, input_name=\"properties.status.failureReason\"),\n", + "]\n", + "\n", + "source_schema = DataFrameInputSchema(json_columns=[\"properties\"], column_info=source_column_info)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7a0cb0a-e65a-444a-a06c-a4525d543790", + "metadata": {}, + "outputs": [], + "source": [ + "# Preprocessing schema\n", + "preprocess_column_info = [\n", + " ColumnInfo(name=config.ae.timestamp_column_name, dtype=datetime),\n", + " ColumnInfo(name=config.ae.userid_column_name, dtype=str),\n", + " ColumnInfo(name=\"appDisplayName\", dtype=str),\n", + " ColumnInfo(name=\"clientAppUsed\", dtype=str),\n", + " ColumnInfo(name=\"deviceDetailbrowser\", dtype=str),\n", + " ColumnInfo(name=\"deviceDetaildisplayName\", dtype=str),\n", + " 
ColumnInfo(name=\"deviceDetailoperatingSystem\", dtype=str),\n", + " ColumnInfo(name=\"statusfailureReason\", dtype=str),\n", + "\n", + " # Derived columns\n", + " IncrementColumn(name=\"logcount\",\n", + " dtype=int,\n", + " input_name=config.ae.timestamp_column_name,\n", + " groupby_column=config.ae.userid_column_name),\n", + " CustomColumn(name=\"locincrement\",\n", + " dtype=int,\n", + " process_column_fn=partial(create_increment_col, column_name=\"location\")),\n", + " CustomColumn(name=\"appincrement\",\n", + " dtype=int,\n", + " process_column_fn=partial(create_increment_col, column_name=\"appDisplayName\")),\n", + "]\n", + "\n", + "preprocess_schema = DataFrameInputSchema(column_info=preprocess_column_info, preserve_columns=[\"_batch_id\"])\n" + ] + }, + { + "cell_type": "markdown", + "id": "bdfc59de-dea8-4e5f-98e3-98eba1e4621d", + "metadata": {}, + "source": [ + "## Pipeline Construction\n", + "From this point on we begin constructing the stages that will make up the pipeline. To make testing easier, constructing the pipeline object, adding the stages, and running the pipeline, is provided as a single cell. The below cell can be rerun multiple times as needed for debugging.\n", + "\n", + "### Source Stage (`MultiFileSource`)\n", + "\n", + "This pipeline read input logs from one or more input files. This source stage will construct a list of files to be processed and pass to downstream stages. It is capable of reading files from many different source types, both local and remote. This is possible by utilizing the `fsspec` library (similar to `pandas`). See the [`fsspec`](https://filesystem-spec.readthedocs.io/) documentation for more information on the supported file types. Once all of the logs have been read, the source completes. \n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `filenames` | List of strings | | Any files to read into the pipeline. All files will be combined into a single `DataFrame` |\n", + "\n", + "### File Batcher Stage (`DFPFileBatcherStage`)\n", + "\n", + "To improve performance, multiple small input files can be batched together into a single DataFrame for processing. This stage is responsible for determining the timestamp of input files, grouping input files into batches by time, and sending the batches to be processed into a single DataFrame. Batches of files that have been seen before will be loaded from cache resulting in increased performance. For example, when performaing a 60 day training run, 59 days can be cached with a period of `\"D\"` and retraining once per day.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `period` | `str` | `\"D\"` | The period to create batches. See `pandas` windowing frequency documentation for available options. |\n", + "| `date_conversion_func` | Function of `typing.Callable[[fsspec.core.OpenFile], datetime]` | | A callback which is responsible for determining the date for a specified file. |\n", + "\n", + "### File to DataFrame Stage (`DFPFileToDataFrameStage`)\n", + "\n", + "After files have been batched into groups, this stage is responsible for reading the files and converting into a DataFrame. The specified input schema converts the raw DataFrame into one suitable for caching and processing. 
Any columns that are not needed should be excluded from the schema.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `schema` | `DataFrameInputSchema` | | After the raw `DataFrame` is read from each file, this schema will be applied to ensure a consisten output from the source. |\n", + "| `file_type` | `FileTypes` | `FileTypes.Auto` | Allows overriding the file type. When set to `Auto`, the file extension will be used. Options are `CSV`, `JSON`, `Auto`. |\n", + "| `parser_kwargs` | `dict` | `{}` | This dictionary will be passed to the `DataFrame` parser class. Allows for customization of log parsing. |\n", + "| `cache_dir` | `str` | `./.cache/dfp` | The location to write cached input files to. |\n", + "\n", + "### Split Users Stage (`DFPSplitUsersStage`)\n", + "\n", + "Once the input logs have been read into a `DataFrame`, this stage is responsible for breaking that single `DataFrame` with many users into multiple `DataFrame`s for each user. This is also where the pipeline chooses whether to train individual users or the generic user (or both).\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `include_generic` | `bool` | | Whether or not to combine all user logs into a single `DataFrame` with the username 'generic_user' |\n", + "| `include_individual` | `bool` | | Whether or not to output individual `DataFrame` objects for each user |\n", + "| `skip_users` | List of `str` | `[]` | Any users to remove from the `DataFrame`. Useful for debugging to remove automated accounts with many logs. Mutually exclusive with `only_users`. |\n", + "| `only_users` | List of `str` | `[]` | Only allow these users in the final `DataFrame`. Useful for debugging to focus on specific users. Mutually exclusive with `skip_users`. |\n", + "\n", + "### Rolling Window Stage (`DFPRollingWindowStage`)\n", + "\n", + "The Rolling Window Stage performs several key pieces of functionality for DFP.\n", + "1. This stage keeps a moving window of logs on a per user basis\n", + " 1. These logs are saved to disk to reduce memory requirements between logs from the same user\n", + "1. It only emits logs when the window history requirements are met\n", + " 1. Until all of the window history requirements are met, no messages will be sent to the rest of the pipeline.\n", + " 1. See the below options for configuring the window history requirements\n", + "1. It repeats the necessary logs to properly calculate log dependent features.\n", + " 1. To support all column feature types, incoming log messages can be combined with existing history and sent to downstream stages.\n", + " 1. For example, to calculate a feature that increments a counter for the number of logs a particular user has generated in a single day, we must have the user's log history for the past 24 hours. To support this, this stage will combine new logs with existing history into a single `DataFrame`.\n", + " 1. It is the responsibility of downstream stages to distinguish between new logs and existing history.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `min_history` | `int` | `1` | The minimum number of logs a user must have before emitting any messages. Logs below this threshold will be saved to disk. |\n", + "| `min_increment` | `int` or `str` | `0` | Once the min history requirement is met, this stage must receive `min_increment` *new* logs before emmitting another message. 
Logs received before this threshold is met will be saved to disk. Can be specified as an integer count or a string duration. |\n", + "| `max_history` | `int` or `str` | `\"1d\"` | Once `min_history` and `min_increment` requirements have been met, this puts an upper bound on the maximum number of messages to forward into the pipeline and also the maximum amount of messages to retain in the history. Can be specified as an integer count or a string duration. |\n", + "| `cache_dir` | `str` | `./.cache/dfp` | The location to write log history to disk. |\n", + "\n", + "### Preprocessing Stage (`DFPPreprocessingStage`)\n", + "\n", + "This stage performs the final, row dependent, feature calculations as specified by the input schema object. Once calculated, this stage can forward on all received logs, or optionally can only forward on new logs, removing any history information.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `input_schema` | `DataFrameInputSchema` | | The final, row dependent, schema to apply to the incoming columns |\n", + "| `only_new_batches` | `bool` | | Whether or not to foward on all received logs, or just new logs. |\n", + "\n", + "### Inference Stage (`DFPInference`)\n", + "\n", + "This stage performs several tasks to aid in performing inference. This stage will:\n", + "1. Download models as needed from MLFlow\n", + "1. Cache previously downloaded models to improve performance\n", + " 1. Models in the cache will be periodically refreshed from MLFlow at a configured rate\n", + "1. Perform inference using the downloaded model\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `model_name_formatter` | `str` | `\"\"` | A format string to use when building the model name. The model name is constructed by calling `model_name_formatter.format(user_id=user_id)`. For example, with `model_name_formatter=\"my_model-{user_id}\"` and a user ID of `\"first:last\"` would result in the model name of `\"my_model-first:last\"`. This should match the value used in `DFPMLFlowModelWriterStage` |\n", + "\n", + "### Post Processing Stage (`DFPPostprocessingStage`)\n", + "\n", + "This stage filters the output from the inference stage for any anomalous messages. Logs which exceed the specified Z-Score will be passed onto the next stage. All remaining logs which are below the threshold will be dropped.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `z_score_threshold` | `float` | `2.0` | The Z-Score used to separate anomalous logs from normal logs. All normal logs will be filterd out and anomalous logs will be passed on. |\n", + "\n", + "### Write to File Stage (`WriteToFileStage`)\n", + "\n", + "This final stage will write all received messages to a single output file in either CSV or JSON format.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `filename` | `str` | | The file to write anomalous log messages to. 
|\n", + "| `overwrite` | `bool` | `False` | If the file specified in `filename` already exists, it will be overwritten if this option is set to `True` |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "825390ad-ce64-4949-b324-33039ffdf264", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a linear pipeline object\n", + "pipeline = LinearPipeline(config)\n", + "\n", + "# Source stage\n", + "pipeline.set_source(MultiFileSource(config, filenames=input_files))\n", + "\n", + "# Batch files into buckets by time. Use the default ISO date extractor from the filename\n", + "pipeline.add_stage(\n", + " DFPFileBatcherStage(config,\n", + " period=\"D\",\n", + " date_conversion_func=functools.partial(date_extractor, filename_regex=iso_date_regex)))\n", + "\n", + "# Output is S3 Buckets. Convert to DataFrames. This caches downloaded S3 data\n", + "pipeline.add_stage(\n", + " DFPFileToDataFrameStage(config,\n", + " schema=source_schema,\n", + " file_type=FileTypes.JSON,\n", + " parser_kwargs={\n", + " \"lines\": False, \"orient\": \"records\"\n", + " },\n", + " cache_dir=cache_dir))\n", + "\n", + "\n", + "# This will split users or just use one single user\n", + "pipeline.add_stage(\n", + " DFPSplitUsersStage(config,\n", + " include_generic=include_generic,\n", + " include_individual=include_individual,\n", + " skip_users=skip_users))\n", + "\n", + "# Next, have a stage that will create rolling windows\n", + "pipeline.add_stage(\n", + " DFPRollingWindowStage(\n", + " config,\n", + " min_history=300 if is_training else 1,\n", + " min_increment=300 if is_training else 0,\n", + " # For inference, we only ever want 1 day max\n", + " max_history=\"60d\" if is_training else \"1d\",\n", + " cache_dir=cache_dir))\n", + "\n", + "# Output is UserMessageMeta -- Cached frame set\n", + "pipeline.add_stage(DFPPreprocessingStage(config, input_schema=preprocess_schema, only_new_batches=not is_training))\n", + "\n", + "# Perform inference on the preprocessed data\n", + "pipeline.add_stage(DFPInferenceStage(config, model_name_formatter=model_name_formatter))\n", + "\n", + "# Filter for only the anomalous logs\n", + "pipeline.add_stage(DFPPostprocessingStage(config, z_score_threshold=2.0))\n", + "\n", + "# Write all anomalies to a CSV file\n", + "pipeline.add_stage(WriteToFileStage(config, filename=\"dfp_detections_azure.csv\", overwrite=True))\n", + "\n", + "# Run the pipeline\n", + "await pipeline._do_run()\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:morpheus] *", + "language": "python", + "name": "conda-env-morpheus-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + }, + "vscode": { + "interpreter": { + "hash": "e26783b24f020aa0bcaa00e6ba122db5d0e3da2d892d80be664969895e06a7e1" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/digital_fingerprinting/production/morpheus/dfp_azure_pipeline.py b/examples/digital_fingerprinting/production/morpheus/dfp_azure_pipeline.py new file mode 100644 index 0000000000..4999bad60b --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp_azure_pipeline.py @@ -0,0 +1,293 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import logging +import os +import typing +from datetime import datetime +from functools import partial + +import click +import mlflow +from dfp.stages.dfp_file_batcher_stage import DFPFileBatcherStage +from dfp.stages.dfp_file_to_df import DFPFileToDataFrameStage +from dfp.stages.dfp_inference_stage import DFPInferenceStage +from dfp.stages.dfp_mlflow_model_writer import DFPMLFlowModelWriterStage +from dfp.stages.dfp_postprocessing_stage import DFPPostprocessingStage +from dfp.stages.dfp_preprocessing_stage import DFPPreprocessingStage +from dfp.stages.dfp_rolling_window_stage import DFPRollingWindowStage +from dfp.stages.dfp_split_users_stage import DFPSplitUsersStage +from dfp.stages.dfp_training import DFPTraining +from dfp.stages.multi_file_source import MultiFileSource +from dfp.utils.column_info import ColumnInfo +from dfp.utils.column_info import CustomColumn +from dfp.utils.column_info import DataFrameInputSchema +from dfp.utils.column_info import DateTimeColumn +from dfp.utils.column_info import IncrementColumn +from dfp.utils.column_info import RenameColumn +from dfp.utils.column_info import StringCatColumn +from dfp.utils.column_info import create_increment_col +from dfp.utils.file_utils import date_extractor +from dfp.utils.file_utils import iso_date_regex + +from morpheus._lib.file_types import FileTypes +from morpheus.cli.utils import get_package_relative_file +from morpheus.cli.utils import load_labels_file +from morpheus.config import Config +from morpheus.config import ConfigAutoEncoder +from morpheus.config import CppConfig +from morpheus.pipeline import LinearPipeline +from morpheus.stages.general.monitor_stage import MonitorStage +from morpheus.stages.output.write_to_file_stage import WriteToFileStage +from morpheus.utils.logger import configure_logging +from morpheus.utils.logger import get_log_levels +from morpheus.utils.logger import parse_log_level + + +@click.command() +@click.option( + "--train_users", + type=click.Choice(["all", "generic", "individual", "none"], case_sensitive=False), + help="Indicates whether or not to train per user or a generic model for all users", +) +@click.option( + "--skip_user", + multiple=True, + type=str, + help="User IDs to skip. Mutually exclusive with only_user", +) +@click.option( + "--only_user", + multiple=True, + type=str, + help="Only users specified by this option will be included. 
Mutually exclusive with skip_user", +) +@click.option( + "--duration", + type=str, + default="60d", + help="The duration to run starting from now", +) +@click.option( + "--cache_dir", + type=str, + default="./.cache/dfp", + show_envvar=True, + help="The location to cache data such as S3 downloads and pre-processed data", +) +@click.option("--log_level", + default=logging.getLevelName(Config().log_level), + type=click.Choice(get_log_levels(), case_sensitive=False), + callback=parse_log_level, + help="Specify the logging level to use.") +@click.option("--sample_rate_s", + type=int, + default=0, + show_envvar=True, + help="Minimum time step, in milliseconds, between object logs.") +@click.option( + "--input_file", + "-f", + type=str, + multiple=True, + help=("List of files to process. Can specificy multiple arguments for multiple files. " + "Also accepts glob (*) wildcards and schema prefixes such as `s3://`. " + "For example, to make a local cache of an s3 bucket, use `filecache::s3://mybucket/*`. " + "See fsspec documentation for list of possible options."), +) +@click.option('--tracking_uri', + type=str, + default="http://localhost:5000", + help=("The ML Flow tracking URI to connect to the tracking backend. If not speficied, MF Flow will use " + "'file:///mlruns' relative to the current directory")) +def run_pipeline(train_users, + skip_user: typing.Tuple[str], + only_user: typing.Tuple[str], + duration, + cache_dir, + log_level, + sample_rate_s, + **kwargs): + # To include the generic, we must be training all or generic + include_generic = train_users == "all" or train_users == "generic" + + # To include individual, we must be either training or inferring + include_individual = train_users != "generic" + + # None indicates we arent training anything + is_training = train_users != "none" + + skip_users = list(skip_user) + only_users = list(only_user) + + # Enable the Morpheus logger + configure_logging(log_level=log_level) + + if (len(skip_users) > 0 and len(only_users) > 0): + logging.error("Option --skip_user and --only_user are mutually exclusive. 
Exiting") + + logger = logging.getLogger("morpheus.{}".format(__name__)) + + logger.info("Running training pipeline with the following options: ") + logger.info("Train generic_user: %s", include_generic) + logger.info("Skipping users: %s", skip_users) + logger.info("Duration: %s", duration) + logger.info("Cache Dir: %s", cache_dir) + + if ("tracking_uri" in kwargs): + # Initialize ML Flow + mlflow.set_tracking_uri(kwargs["tracking_uri"]) + logger.info("Tracking URI: %s", mlflow.get_tracking_uri()) + + config = Config() + + CppConfig.set_should_use_cpp(False) + + config.num_threads = os.cpu_count() + + config.ae = ConfigAutoEncoder() + + config.ae.feature_columns = load_labels_file(get_package_relative_file("data/columns_ae_azure.txt")) + config.ae.userid_column_name = "username" + config.ae.timestamp_column_name = "timestamp" + + # Specify the column names to ensure all data is uniform + source_column_info = [ + DateTimeColumn(name=config.ae.timestamp_column_name, dtype=datetime, input_name="time"), + RenameColumn(name=config.ae.userid_column_name, dtype=str, input_name="properties.userPrincipalName"), + RenameColumn(name="appDisplayName", dtype=str, input_name="properties.appDisplayName"), + ColumnInfo(name="category", dtype=str), + RenameColumn(name="clientAppUsed", dtype=str, input_name="properties.clientAppUsed"), + RenameColumn(name="deviceDetailbrowser", dtype=str, input_name="properties.deviceDetail.browser"), + RenameColumn(name="deviceDetaildisplayName", dtype=str, input_name="properties.deviceDetail.displayName"), + RenameColumn(name="deviceDetailoperatingSystem", + dtype=str, + input_name="properties.deviceDetail.operatingSystem"), + StringCatColumn(name="location", + dtype=str, + input_columns=[ + "properties.location.city", + "properties.location.countryOrRegion", + ], + sep=", "), + RenameColumn(name="statusfailureReason", dtype=str, input_name="properties.status.failureReason"), + ] + + source_schema = DataFrameInputSchema(json_columns=["properties"], column_info=source_column_info) + + # Preprocessing schema + preprocess_column_info = [ + ColumnInfo(name=config.ae.timestamp_column_name, dtype=datetime), + ColumnInfo(name=config.ae.userid_column_name, dtype=str), + ColumnInfo(name="appDisplayName", dtype=str), + ColumnInfo(name="clientAppUsed", dtype=str), + ColumnInfo(name="deviceDetailbrowser", dtype=str), + ColumnInfo(name="deviceDetaildisplayName", dtype=str), + ColumnInfo(name="deviceDetailoperatingSystem", dtype=str), + ColumnInfo(name="statusfailureReason", dtype=str), + + # Derived columns + IncrementColumn(name="logcount", + dtype=int, + input_name=config.ae.timestamp_column_name, + groupby_column=config.ae.userid_column_name), + CustomColumn(name="locincrement", + dtype=int, + process_column_fn=partial(create_increment_col, column_name="location")), + CustomColumn(name="appincrement", + dtype=int, + process_column_fn=partial(create_increment_col, column_name="appDisplayName")), + ] + + preprocess_schema = DataFrameInputSchema(column_info=preprocess_column_info, preserve_columns=["_batch_id"]) + + # Create a linear pipeline object + pipeline = LinearPipeline(config) + + pipeline.set_source(MultiFileSource(config, filenames=list(kwargs["input_file"]))) + + # Batch files into buckets by time. Use the default ISO date extractor from the filename + pipeline.add_stage( + DFPFileBatcherStage(config, + period="D", + sampling_rate_s=sample_rate_s, + date_conversion_func=functools.partial(date_extractor, filename_regex=iso_date_regex))) + + # Output is S3 Buckets. 
Convert to DataFrames. This caches downloaded S3 data + pipeline.add_stage( + DFPFileToDataFrameStage(config, + schema=source_schema, + file_type=FileTypes.JSON, + parser_kwargs={ + "lines": False, "orient": "records" + }, + cache_dir=cache_dir)) + + pipeline.add_stage(MonitorStage(config, description="Input data rate")) + + # This will split users or just use one single user + pipeline.add_stage( + DFPSplitUsersStage(config, + include_generic=include_generic, + include_individual=include_individual, + skip_users=skip_users, + only_users=only_users)) + + # Next, have a stage that will create rolling windows + pipeline.add_stage( + DFPRollingWindowStage( + config, + min_history=300 if is_training else 1, + min_increment=300 if is_training else 0, + # For inference, we only ever want 1 day max + max_history="60d" if is_training else "1d", + cache_dir=cache_dir)) + + # Output is UserMessageMeta -- Cached frame set + pipeline.add_stage(DFPPreprocessingStage(config, input_schema=preprocess_schema, only_new_batches=not is_training)) + + model_name_formatter = "DFP-azure-{user_id}" + experiment_name_formatter = "dfp/azure/training/{reg_model_name}" + + if (is_training): + + # Finally, perform training which will output a model + pipeline.add_stage(DFPTraining(config)) + + pipeline.add_stage(MonitorStage(config, description="Training rate", smoothing=0.001)) + + # Write that model to MLFlow + pipeline.add_stage( + DFPMLFlowModelWriterStage(config, + model_name_formatter=model_name_formatter, + experiment_name_formatter=experiment_name_formatter)) + else: + # Perform inference on the preprocessed data + pipeline.add_stage(DFPInferenceStage(config, model_name_formatter=model_name_formatter)) + + pipeline.add_stage(MonitorStage(config, description="Inference rate", smoothing=0.001)) + + # Filter for only the anomalous logs + pipeline.add_stage(DFPPostprocessingStage(config, z_score_threshold=2.0)) + + # Write all anomalies to a CSV file + pipeline.add_stage(WriteToFileStage(config, filename="dfp_detections_azure.csv", overwrite=True)) + + # Run the pipeline + pipeline.run() + + +if __name__ == "__main__": + run_pipeline(obj={}, auto_envvar_prefix='DFP', show_default=True, prog_name="dfp") diff --git a/examples/digital_fingerprinting/production/morpheus/dfp_azure_training.ipynb b/examples/digital_fingerprinting/production/morpheus/dfp_azure_training.ipynb new file mode 100644 index 0000000000..50e2a2865e --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp_azure_training.ipynb @@ -0,0 +1,446 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2941e94f-db20-44a5-ab87-2cab499825f7", + "metadata": {}, + "source": [ + "# Digital Finger Printing (DFP) with Morpheus - Azure Training\n", + "## Introduction\n", + "\n", + "In this notebook, we will be building and running a DFP pipeline that performs training on Azure logs. The goal is to train an autoencoder PyTorch model to recogize the patterns of users in the sample data. The model will then be used by a second Morpheus pipeline to generate anomaly scores for each individual log. These anomaly scores can be used by security teams to detect abnormal behavior when it happens so the proper action can be taken.\n", + "\n", + "
\n", + "Note: For more information on DFP, the Morpheus pipeline, and setup steps to run this notebook, please see the coresponding DFP training materials.\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6c1cb50-74f2-445d-b865-8c22c3b3798b", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "# Ensure that the morpheus directory is in the python path. This may not need to be run depending on the environment setup\n", + "import sys\n", + "import os\n", + "sys.path.insert(0, os.path.abspath(\"./morpheus\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "102ce011-3ca3-4f96-a72d-de28fad32003", + "metadata": {}, + "outputs": [], + "source": [ + "import functools\n", + "import logging\n", + "import os\n", + "import typing\n", + "from datetime import datetime\n", + "from functools import partial\n", + "\n", + "import click\n", + "import mlflow\n", + "from dfp.stages.dfp_file_batcher_stage import DFPFileBatcherStage\n", + "from dfp.stages.dfp_file_to_df import DFPFileToDataFrameStage\n", + "from dfp.stages.dfp_inference_stage import DFPInferenceStage\n", + "from dfp.stages.dfp_mlflow_model_writer import DFPMLFlowModelWriterStage\n", + "from dfp.stages.dfp_postprocessing_stage import DFPPostprocessingStage\n", + "from dfp.stages.dfp_preprocessing_stage import DFPPreprocessingStage\n", + "from dfp.stages.dfp_rolling_window_stage import DFPRollingWindowStage\n", + "from dfp.stages.dfp_split_users_stage import DFPSplitUsersStage\n", + "from dfp.stages.dfp_training import DFPTraining\n", + "from dfp.stages.multi_file_source import MultiFileSource\n", + "from dfp.utils.column_info import BoolColumn\n", + "from dfp.utils.column_info import ColumnInfo\n", + "from dfp.utils.column_info import CustomColumn\n", + "from dfp.utils.column_info import DataFrameInputSchema\n", + "from dfp.utils.column_info import DateTimeColumn\n", + "from dfp.utils.column_info import IncrementColumn\n", + "from dfp.utils.column_info import RenameColumn\n", + "from dfp.utils.column_info import StringCatColumn\n", + "from dfp.utils.column_info import create_increment_col\n", + "from dfp.utils.file_utils import date_extractor\n", + "from dfp.utils.file_utils import iso_date_regex\n", + "\n", + "from morpheus._lib.file_types import FileTypes\n", + "from morpheus.cli.utils import get_package_relative_file\n", + "from morpheus.cli.utils import load_labels_file\n", + "from morpheus.config import Config\n", + "from morpheus.config import ConfigAutoEncoder\n", + "from morpheus.config import CppConfig\n", + "from morpheus.pipeline import LinearPipeline\n", + "from morpheus.stages.general.monitor_stage import MonitorStage\n", + "from morpheus.stages.output.write_to_file_stage import WriteToFileStage\n", + "from morpheus.utils.logger import configure_logging\n", + "from morpheus.utils.logger import get_log_levels\n", + "from morpheus.utils.logger import parse_log_level\n", + "\n", + "# Left align all tables\n", + "from IPython.core.display import HTML\n", + "table_css = 'table {align:left;display:block}'\n", + "HTML(''.format(table_css))" + ] + }, + { + "cell_type": "markdown", + "id": "ca38c1b7-ce84-43e0-ac53-280562dc1642", + "metadata": {}, + "source": [ + "## High Level Configuration\n", + "\n", + "The following options significantly alter the functionality of the pipeline. These options are separated from the individual stage options since they may effect more than one stage. 
Additionally, the matching python script to this notebook, `dfp_azure_pipeline.py`, configures these options via command line arguments.\n", + "\n", + "### Options\n", + "\n", + "| Name | Type | Description |\n", + "| --- | --- | :-- |\n", + "| `train_users` | One of `[\"all\", \"generic\", \"individual\"]` | This indicates which users to train for this pipeline:
  • `\"generic\"`: Combine all users into a single model with the username 'generic_user'. Skips individual users.
  • `\"individual\"`: Trains a separate model for each individual user. Skips 'generic_user'.
  • `\"all\"`: Combination of `\"generic\"` and `\"individual\"`. Both the 'generic_user' and individual users are trained in the same pipeline.
|\n", + "| `skip_users` | List of strings | Any user in this list will be dropped from the pipeline. Useful for debugging to remove automated accounts with many logs. |\n", + "| `cache_dir` | string | The location to store cached files. To aid with development and reduce bandwidth, the Morpheus pipeline will cache data from several stages of the pipeline. This option configures the location for those caches. |\n", + "| `input_files` | List of strings | List of files to process. Can specificy multiple arguments for multiple files. Also accepts glob (\\*) wildcards and schema prefixes such as `s3://`. For example, to make a local cache of an s3 bucket, use `filecache::s3://mybucket/*`. See `fsspec` documentation for list of possible options. |\n", + "| `model_name_formatter` | string | A format string to use when building the model name. The model name is constructed by calling `model_name_formatter.format(user_id=user_id)`. For example, with `model_name_formatter=\"my_model-{user_id}\"` and a user ID of `\"first:last\"` would result in the model name of `\"my_model-first:last\"`. This should match the value used in `DFPMLFlowModelWriterStage`. Available keyword arguments: `user_id`, `user_md5`. |\n", + "| `experiment_name_formatter` | string | A format string (without the `f`) that will be used when creating an experiment in ML Flow. Available keyword arguments: `user_id`, `user_md5`, `reg_model_name`. |\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ee00703-75c5-46fc-890c-86733da906c4", + "metadata": {}, + "outputs": [], + "source": [ + "# Global options\n", + "train_users = \"all\"\n", + "\n", + "# Enter any users to skip here\n", + "skip_users: typing.List[str] = []\n", + "\n", + "# Location where cache objects will be saved\n", + "cache_dir = \"/workspace/.cache/dfp\"\n", + "\n", + "# Input files to read from\n", + "input_files = [\n", + " \"/workspace/examples/data/dfp/azure/public_preview_final/s3/AZUREAD_2022-08-0*.json\",\n", + " \"/workspace/examples/data/dfp/azure/public_preview_final/s3/AZUREAD_2022-08-1*.json\",\n", + " \"/workspace/examples/data/dfp/azure/public_preview_final/s3/AZUREAD_2022-08-2*.json\",\n", + "]\n", + "\n", + "# The format to use for models\n", + "model_name_formatter = \"DFP-azure-{user_id}\"\n", + "\n", + "# The format to use for experiment names\n", + "experiment_name_formatter = \"dfp/azure/training/{reg_model_name}\"\n", + "\n", + "# === Derived Options ===\n", + "# To include the generic, we must be training all or generic\n", + "include_generic = train_users == \"all\" or train_users == \"generic\"\n", + "\n", + "# To include individual, we must be either training or inferring\n", + "include_individual = train_users != \"generic\"\n", + "\n", + "# None indicates we arent training anything\n", + "is_training = train_users != \"none\"" + ] + }, + { + "cell_type": "markdown", + "id": "1cfc24c9-c85e-4977-a348-692c8f0aceaa", + "metadata": {}, + "source": [ + "### Global Config Object\n", + "Before creating the pipeline, we need to setup logging and set the parameters for the Morpheus config object. This config object is responsible for the following:\n", + " - Indicating whether to use C++ or Python stages\n", + " - C++ stages are not supported for the DFP pipeline. This should always be `False`\n", + " - Setting the number of threads to use in the pipeline. 
Defaults to the thread count of the OS.\n", + " - Sets the feature column names that will be used in model training\n", + " - This option allows extra columns to be used in the pipeline that will not be part of the training algorithm.\n", + " - The final features that the model will be trained on will be an intersection of this list with the log columns.\n", + " - The column name that indicates the user's unique identifier\n", + " - It is required for DFP to have a user ID column\n", + " - The column name that indicates the timestamp for the log\n", + " - It is required for DFP to know when each log occurred" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "01abd537-9162-49dc-8e83-d9465592f1d5", + "metadata": {}, + "outputs": [], + "source": [ + "# Enable the Morpheus logger\n", + "configure_logging(log_level=logging.DEBUG)\n", + "\n", + "config = Config()\n", + "\n", + "CppConfig.set_should_use_cpp(False)\n", + "\n", + "config.num_threads = os.cpu_count()\n", + "\n", + "config.ae = ConfigAutoEncoder()\n", + "\n", + "config.ae.feature_columns = [\n", + " \"appDisplayName\", \"clientAppUsed\", \"deviceDetailbrowser\", \"deviceDetaildisplayName\", \"deviceDetailoperatingSystem\", \"statusfailureReason\", \"appincrement\", \"locincrement\", \"logcount\", \n", + "]\n", + "config.ae.userid_column_name = \"username\"\n", + "config.ae.timestamp_column_name = \"timestamp\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a73a4d53-32b6-4ab8-a5d7-c0104b31c69b", + "metadata": {}, + "outputs": [], + "source": [ + "# Specify the column names to ensure all data is uniform\n", + "source_column_info = [\n", + " DateTimeColumn(name=config.ae.timestamp_column_name, dtype=datetime, input_name=\"time\"),\n", + " RenameColumn(name=config.ae.userid_column_name, dtype=str, input_name=\"properties.userPrincipalName\"),\n", + " RenameColumn(name=\"appDisplayName\", dtype=str, input_name=\"properties.appDisplayName\"),\n", + " ColumnInfo(name=\"category\", dtype=str),\n", + " RenameColumn(name=\"clientAppUsed\", dtype=str, input_name=\"properties.clientAppUsed\"),\n", + " RenameColumn(name=\"deviceDetailbrowser\", dtype=str, input_name=\"properties.deviceDetail.browser\"),\n", + " RenameColumn(name=\"deviceDetaildisplayName\", dtype=str, input_name=\"properties.deviceDetail.displayName\"),\n", + " RenameColumn(name=\"deviceDetailoperatingSystem\",\n", + " dtype=str,\n", + " input_name=\"properties.deviceDetail.operatingSystem\"),\n", + " StringCatColumn(name=\"location\",\n", + " dtype=str,\n", + " input_columns=[\n", + " \"properties.location.city\",\n", + " \"properties.location.countryOrRegion\",\n", + " ],\n", + " sep=\", \"),\n", + " RenameColumn(name=\"statusfailureReason\", dtype=str, input_name=\"properties.status.failureReason\"),\n", + "]\n", + "\n", + "source_schema = DataFrameInputSchema(json_columns=[\"properties\"], column_info=source_column_info)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7a0cb0a-e65a-444a-a06c-a4525d543790", + "metadata": {}, + "outputs": [], + "source": [ + "# Preprocessing schema\n", + "preprocess_column_info = [\n", + " ColumnInfo(name=config.ae.timestamp_column_name, dtype=datetime),\n", + " ColumnInfo(name=config.ae.userid_column_name, dtype=str),\n", + " ColumnInfo(name=\"appDisplayName\", dtype=str),\n", + " ColumnInfo(name=\"clientAppUsed\", dtype=str),\n", + " ColumnInfo(name=\"deviceDetailbrowser\", dtype=str),\n", + " ColumnInfo(name=\"deviceDetaildisplayName\", dtype=str),\n", + " 
ColumnInfo(name=\"deviceDetailoperatingSystem\", dtype=str),\n", + " ColumnInfo(name=\"statusfailureReason\", dtype=str),\n", + "\n", + " # Derived columns\n", + " IncrementColumn(name=\"logcount\",\n", + " dtype=int,\n", + " input_name=config.ae.timestamp_column_name,\n", + " groupby_column=config.ae.userid_column_name),\n", + " CustomColumn(name=\"locincrement\",\n", + " dtype=int,\n", + " process_column_fn=partial(create_increment_col, column_name=\"location\")),\n", + " CustomColumn(name=\"appincrement\",\n", + " dtype=int,\n", + " process_column_fn=partial(create_increment_col, column_name=\"appDisplayName\")),\n", + "]\n", + "\n", + "preprocess_schema = DataFrameInputSchema(column_info=preprocess_column_info, preserve_columns=[\"_batch_id\"])\n" + ] + }, + { + "cell_type": "markdown", + "id": "bdfc59de-dea8-4e5f-98e3-98eba1e4621d", + "metadata": {}, + "source": [ + "## Pipeline Construction\n", + "From this point on we begin constructing the stages that will make up the pipeline. To make testing easier, constructing the pipeline object, adding the stages, and running the pipeline, is provided as a single cell. The below cell can be rerun multiple times as needed for debugging.\n", + "\n", + "### Source Stage (`MultiFileSource`)\n", + "\n", + "This pipeline read input logs from one or more input files. This source stage will construct a list of files to be processed and pass to downstream stages. It is capable of reading files from many different source types, both local and remote. This is possible by utilizing the `fsspec` library (similar to `pandas`). See the [`fsspec`](https://filesystem-spec.readthedocs.io/) documentation for more information on the supported file types. Once all of the logs have been read, the source completes. \n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `filenames` | List of strings | | Any files to read into the pipeline. All files will be combined into a single `DataFrame` |\n", + "\n", + "### File Batcher Stage (`DFPFileBatcherStage`)\n", + "\n", + "To improve performance, multiple small input files can be batched together into a single DataFrame for processing. This stage is responsible for determining the timestamp of input files, grouping input files into batches by time, and sending the batches to be processed into a single DataFrame. Batches of files that have been seen before will be loaded from cache resulting in increased performance. For example, when performaing a 60 day training run, 59 days can be cached with a period of `\"D\"` and retraining once per day.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `period` | `str` | `\"D\"` | The period to create batches. See `pandas` windowing frequency documentation for available options. |\n", + "| `date_conversion_func` | Function of `typing.Callable[[fsspec.core.OpenFile], datetime]` | | A callback which is responsible for determining the date for a specified file. |\n", + "\n", + "### File to DataFrame Stage (`DFPFileToDataFrameStage`)\n", + "\n", + "After files have been batched into groups, this stage is responsible for reading the files and converting into a DataFrame. The specified input schema converts the raw DataFrame into one suitable for caching and processing. 
Any columns that are not needed should be excluded from the schema.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `schema` | `DataFrameInputSchema` | | After the raw `DataFrame` is read from each file, this schema will be applied to ensure a consisten output from the source. |\n", + "| `file_type` | `FileTypes` | `FileTypes.Auto` | Allows overriding the file type. When set to `Auto`, the file extension will be used. Options are `CSV`, `JSON`, `Auto`. |\n", + "| `parser_kwargs` | `dict` | `{}` | This dictionary will be passed to the `DataFrame` parser class. Allows for customization of log parsing. |\n", + "| `cache_dir` | `str` | `./.cache/dfp` | The location to write cached input files to. |\n", + "\n", + "### Split Users Stage (`DFPSplitUsersStage`)\n", + "\n", + "Once the input logs have been read into a `DataFrame`, this stage is responsible for breaking that single `DataFrame` with many users into multiple `DataFrame`s for each user. This is also where the pipeline chooses whether to train individual users or the generic user (or both).\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `include_generic` | `bool` | | Whether or not to combine all user logs into a single `DataFrame` with the username 'generic_user' |\n", + "| `include_individual` | `bool` | | Whether or not to output individual `DataFrame` objects for each user |\n", + "| `skip_users` | List of `str` | `[]` | Any users to remove from the `DataFrame`. Useful for debugging to remove automated accounts with many logs. Mutually exclusive with `only_users`. |\n", + "| `only_users` | List of `str` | `[]` | Only allow these users in the final `DataFrame`. Useful for debugging to focus on specific users. Mutually exclusive with `skip_users`. |\n", + "\n", + "### Rolling Window Stage (`DFPRollingWindowStage`)\n", + "\n", + "The Rolling Window Stage performs several key pieces of functionality for DFP.\n", + "1. This stage keeps a moving window of logs on a per user basis\n", + " 1. These logs are saved to disk to reduce memory requirements between logs from the same user\n", + "1. It only emits logs when the window history requirements are met\n", + " 1. Until all of the window history requirements are met, no messages will be sent to the rest of the pipeline.\n", + " 1. See the below options for configuring the window history requirements\n", + "1. It repeats the necessary logs to properly calculate log dependent features.\n", + " 1. To support all column feature types, incoming log messages can be combined with existing history and sent to downstream stages.\n", + " 1. For example, to calculate a feature that increments a counter for the number of logs a particular user has generated in a single day, we must have the user's log history for the past 24 hours. To support this, this stage will combine new logs with existing history into a single `DataFrame`.\n", + " 1. It is the responsibility of downstream stages to distinguish between new logs and existing history.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `min_history` | `int` | `300` | The minimum number of logs a user must have before emitting any messages. Logs below this threshold will be saved to disk. |\n", + "| `min_increment` | `int` or `str` | `300` | Once the min history requirement is met, this stage must receive `min_increment` *new* logs before emmitting another message. 
Logs received before this threshold is met will be saved to disk. Can be specified as an integer count or a string duration. |\n", + "| `max_history` | `int` or `str` | `\"60d\"` | Once `min_history` and `min_increment` requirements have been met, this puts an upper bound on the maximum number of messages to forward into the pipeline and also the maximum amount of messages to retain in the history. Can be specified as an integer count or a string duration. |\n", + "| `cache_dir` | `str` | `./.cache/dfp` | The location to write log history to disk. |\n", + "\n", + "### Preprocessing Stage (`DFPPreprocessingStage`)\n", + "\n", + "This stage performs the final, row dependent, feature calculations as specified by the input schema object. Once calculated, this stage can forward on all received logs, or optionally can only forward on new logs, removing any history information.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `input_schema` | `DataFrameInputSchema` | | The final, row dependent, schema to apply to the incoming columns |\n", + "| `only_new_batches` | `bool` | | Whether or not to foward on all received logs, or just new logs. |\n", + "\n", + "### Training Stage (`DFPTraining`)\n", + "\n", + "This stage is responsible for performing the actual training calculations. Training will be performed on all received data. Resulting message will contain the input data paired with the trained model.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `model_kwargs` | `dict` | `{}` | The options to use when creating a new model instance. See `DFPAutoEncoder` for information on the available options. |\n", + "\n", + "### MLFlow Model Writer Stage (`DFPMLFlowModelWriterStage`)\n", + "\n", + "This stage is the last step in training. It will upload the trained model from the previous stage to MLFlow. The tracking URI for which MLFlow instance to use is configured using the static method `mlflow.set_tracking_uri()`.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `model_name_formatter` | `str` | `\"\"` | A format string to use when building the model name. The model name is constructed by calling `model_name_formatter.format(user_id=user_id)`. For example, with `model_name_formatter=\"my_model-{user_id}\"` and a user ID of `\"first:last\"` would result in the model name of `\"my_model-first:last\"` |\n", + "| `experiment_name` | `str` | | All models are created inside of an experiment to allow metrics to be saved with each model. This option specifies the experiment name. The final experiment name for each model will be in the form of `{experiment_name}/{model_name}` |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "825390ad-ce64-4949-b324-33039ffdf264", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a linear pipeline object\n", + "pipeline = LinearPipeline(config)\n", + "\n", + "# Source stage\n", + "pipeline.set_source(MultiFileSource(config, filenames=input_files))\n", + "\n", + "# Batch files into buckets by time. Use the default ISO date extractor from the filename\n", + "pipeline.add_stage(\n", + " DFPFileBatcherStage(config,\n", + " period=\"D\",\n", + " date_conversion_func=functools.partial(date_extractor, filename_regex=iso_date_regex)))\n", + "\n", + "# Output is S3 Buckets. Convert to DataFrames. 
This caches downloaded S3 data\n", + "pipeline.add_stage(\n", + " DFPFileToDataFrameStage(config,\n", + " schema=source_schema,\n", + " file_type=FileTypes.JSON,\n", + " parser_kwargs={\n", + " \"lines\": False, \"orient\": \"records\"\n", + " },\n", + " cache_dir=cache_dir))\n", + "\n", + "\n", + "# This will split users or just use one single user\n", + "pipeline.add_stage(\n", + " DFPSplitUsersStage(config,\n", + " include_generic=include_generic,\n", + " include_individual=include_individual,\n", + " skip_users=skip_users))\n", + "\n", + "# Next, have a stage that will create rolling windows\n", + "pipeline.add_stage(\n", + " DFPRollingWindowStage(\n", + " config,\n", + " min_history=300 if is_training else 1,\n", + " min_increment=300 if is_training else 0,\n", + " # For inference, we only ever want 1 day max\n", + " max_history=\"60d\" if is_training else \"1d\",\n", + " cache_dir=cache_dir))\n", + "\n", + "# Output is UserMessageMeta -- Cached frame set\n", + "pipeline.add_stage(DFPPreprocessingStage(config, input_schema=preprocess_schema, only_new_batches=not is_training))\n", + "\n", + "# Finally, perform training which will output a model\n", + "pipeline.add_stage(DFPTraining(config))\n", + "\n", + "# Write that model to MLFlow\n", + "pipeline.add_stage(\n", + " DFPMLFlowModelWriterStage(config,\n", + " model_name_formatter=model_name_formatter,\n", + " experiment_name_formatter=experiment_name_formatter))\n", + "\n", + "# Run the pipeline\n", + "await pipeline._do_run()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + }, + "vscode": { + "interpreter": { + "hash": "f7a30172b4be85fcd6fc3717815fa43e2969e39e7c3ddd169e51bb2fb4d7b2e9" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/digital_fingerprinting/production/morpheus/dfp_duo_inference.ipynb b/examples/digital_fingerprinting/production/morpheus/dfp_duo_inference.ipynb new file mode 100644 index 0000000000..8683c4bc96 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp_duo_inference.ipynb @@ -0,0 +1,459 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2941e94f-db20-44a5-ab87-2cab499825f7", + "metadata": {}, + "source": [ + "# Digital Finger Printing (DFP) with Morpheus - DUO Inference\n", + "## Introduction\n", + "\n", + "In this notebook, we will be building and running a DFP pipeline that performs inference on Duo authentication logs. The goal is to use the pretrained models generated in the Duo Training notebook to generate anomaly scores for each log. These anomaly scores can be used by security teams to detect abnormal behavior when it happens so the proper action can be taken.\n", + "\n", + "
\n", + "Note: For more information on DFP, the Morpheus pipeline, and setup steps to run this notebook, please see the coresponding DFP training materials.\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6c1cb50-74f2-445d-b865-8c22c3b3798b", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "# Ensure that the morpheus directory is in the python path. This may not need to be run depending on the environment setup\n", + "import sys\n", + "import os\n", + "sys.path.insert(0, os.path.abspath(\"./morpheus\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "102ce011-3ca3-4f96-a72d-de28fad32003", + "metadata": {}, + "outputs": [], + "source": [ + "import functools\n", + "import logging\n", + "import os\n", + "import typing\n", + "from datetime import datetime\n", + "from functools import partial\n", + "\n", + "import click\n", + "import mlflow\n", + "from dfp.stages.dfp_file_batcher_stage import DFPFileBatcherStage\n", + "from dfp.stages.dfp_file_to_df import DFPFileToDataFrameStage\n", + "from dfp.stages.dfp_inference_stage import DFPInferenceStage\n", + "from dfp.stages.dfp_mlflow_model_writer import DFPMLFlowModelWriterStage\n", + "from dfp.stages.dfp_postprocessing_stage import DFPPostprocessingStage\n", + "from dfp.stages.dfp_preprocessing_stage import DFPPreprocessingStage\n", + "from dfp.stages.dfp_rolling_window_stage import DFPRollingWindowStage\n", + "from dfp.stages.dfp_split_users_stage import DFPSplitUsersStage\n", + "from dfp.stages.dfp_training import DFPTraining\n", + "from dfp.stages.multi_file_source import MultiFileSource\n", + "from dfp.utils.column_info import BoolColumn\n", + "from dfp.utils.column_info import ColumnInfo\n", + "from dfp.utils.column_info import CustomColumn\n", + "from dfp.utils.column_info import DataFrameInputSchema\n", + "from dfp.utils.column_info import DateTimeColumn\n", + "from dfp.utils.column_info import IncrementColumn\n", + "from dfp.utils.column_info import RenameColumn\n", + "from dfp.utils.column_info import StringCatColumn\n", + "from dfp.utils.column_info import create_increment_col\n", + "from dfp.utils.file_utils import date_extractor\n", + "from dfp.utils.file_utils import iso_date_regex\n", + "\n", + "from morpheus._lib.file_types import FileTypes\n", + "from morpheus.cli.utils import get_package_relative_file\n", + "from morpheus.cli.utils import load_labels_file\n", + "from morpheus.config import Config\n", + "from morpheus.config import ConfigAutoEncoder\n", + "from morpheus.config import CppConfig\n", + "from morpheus.pipeline import LinearPipeline\n", + "from morpheus.stages.general.monitor_stage import MonitorStage\n", + "from morpheus.stages.output.write_to_file_stage import WriteToFileStage\n", + "from morpheus.utils.logger import configure_logging\n", + "from morpheus.utils.logger import get_log_levels\n", + "from morpheus.utils.logger import parse_log_level\n", + "\n", + "# Left align all tables\n", + "from IPython.core.display import HTML\n", + "table_css = 'table {align:left;display:block}'\n", + "HTML(''.format(table_css))" + ] + }, + { + "cell_type": "markdown", + "id": "ca38c1b7-ce84-43e0-ac53-280562dc1642", + "metadata": {}, + "source": [ + "## High Level Configuration\n", + "\n", + "The following options significantly alter the functionality of the pipeline. These options are separated from the individual stage options since they may effect more than one stage. 
Additionally, the matching python script to this notebook, `dfp_duo_pipeline.py`, configures these options via command line arguments.\n", + "\n", + "### Options\n", + "\n", + "| Name | Type | Description |\n", + "| --- | --- | :-- |\n", + "| `train_users` | One of `[\"none\"]` | For inference, this option should always be `\"none\"` |\n", + "| `skip_users` | List of strings | Any user in this list will be dropped from the pipeline. Useful for debugging to remove automated accounts with many logs. |\n", + "| `cache_dir` | string | The location to store cached files. To aid with development and reduce bandwidth, the Morpheus pipeline will cache data from several stages of the pipeline. This option configures the location for those caches. |\n", + "| `input_files` | List of strings | List of files to process. Can specificy multiple arguments for multiple files. Also accepts glob (\\*) wildcards and schema prefixes such as `s3://`. For example, to make a local cache of an s3 bucket, use `filecache::s3://mybucket/*`. See `fsspec` documentation for list of possible options. |\n", + "| `model_name_formatter` | string | A format string to use when building the model name. The model name is constructed by calling `model_name_formatter.format(user_id=user_id)`. For example, with `model_name_formatter=\"my_model-{user_id}\"` and a user ID of `\"first:last\"` would result in the model name of `\"my_model-first:last\"`. This should match the value used in `DFPMLFlowModelWriterStage`. Available keyword arguments: `user_id`, `user_md5`. |\n", + "| `experiment_name_formatter` | string | A format string (without the `f`) that will be used when creating an experiment in ML Flow. Available keyword arguments: `user_id`, `user_md5`, `reg_model_name`. |\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ee00703-75c5-46fc-890c-86733da906c4", + "metadata": {}, + "outputs": [], + "source": [ + "# Global options\n", + "train_users = \"none\"\n", + "\n", + "# Enter any users to skip here\n", + "skip_users: typing.List[str] = []\n", + "\n", + "# Location where cache objects will be saved\n", + "cache_dir = \"./.cache/dfp\"\n", + "\n", + "# Input files to read from\n", + "input_files = [\n", + " \"/workspace/examples/data/dfp/duo/public_preview_final/s3/DUO_2022-08-3*.json\",\n", + "]\n", + "\n", + "# The format to use for models\n", + "model_name_formatter = \"DFP-duo-{user_id}\"\n", + "\n", + "# === Derived Options ===\n", + "# To include the generic, we must be training all or generic\n", + "include_generic = train_users == \"all\" or train_users == \"generic\"\n", + "\n", + "# To include individual, we must be either training or inferring\n", + "include_individual = train_users != \"generic\"\n", + "\n", + "# None indicates we arent training anything\n", + "is_training = train_users != \"none\"" + ] + }, + { + "cell_type": "markdown", + "id": "1cfc24c9-c85e-4977-a348-692c8f0aceaa", + "metadata": {}, + "source": [ + "### Global Config Object\n", + "Before creating the pipeline, we need to setup logging and set the parameters for the Morpheus config object. This config object is responsible for the following:\n", + " - Indicating whether to use C++ or Python stages\n", + " - C++ stages are not supported for the DFP pipeline. This should always be `False`\n", + " - Setting the number of threads to use in the pipeline. 
Defaults to the thread count of the OS.\n", + " - Sets the feature column names that will be used in model training\n", + " - This option allows extra columns to be used in the pipeline that will not be part of the training algorithm.\n", + " - The final features that the model will be trained on will be an intersection of this list with the log columns.\n", + " - The column name that indicates the user's unique identifier\n", + " - It is required for DFP to have a user ID column\n", + " - The column name that indicates the timestamp for the log\n", + " - It is required for DFP to know when each log occurred" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "01abd537-9162-49dc-8e83-d9465592f1d5", + "metadata": {}, + "outputs": [], + "source": [ + "# Enable the Morpheus logger\n", + "configure_logging(log_level=logging.DEBUG)\n", + "\n", + "config = Config()\n", + "\n", + "CppConfig.set_should_use_cpp(False)\n", + "\n", + "config.num_threads = os.cpu_count()\n", + "\n", + "config.ae = ConfigAutoEncoder()\n", + "\n", + "config.ae.feature_columns = [\n", + " 'accessdevicebrowser', 'accessdeviceos', 'authdevicename', 'result', 'reason', 'logcount', \"locincrement\"\n", + "]\n", + "config.ae.userid_column_name = \"username\"\n", + "config.ae.timestamp_column_name = \"timestamp\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a73a4d53-32b6-4ab8-a5d7-c0104b31c69b", + "metadata": {}, + "outputs": [], + "source": [ + "# Specify the column names to ensure all data is uniform\n", + "source_column_info = [\n", + " DateTimeColumn(name=config.ae.timestamp_column_name, dtype=datetime, input_name=\"timestamp\"),\n", + " RenameColumn(name=config.ae.userid_column_name, dtype=str, input_name=\"user.name\"),\n", + " RenameColumn(name=\"accessdevicebrowser\", dtype=str, input_name=\"access_device.browser\"),\n", + " RenameColumn(name=\"accessdeviceos\", dtype=str, input_name=\"access_device.os\"),\n", + " StringCatColumn(name=\"location\",\n", + " dtype=str,\n", + " input_columns=[\n", + " \"access_device.location.city\",\n", + " \"access_device.location.state\",\n", + " \"access_device.location.country\"\n", + " ],\n", + " sep=\", \"),\n", + " RenameColumn(name=\"authdevicename\", dtype=str, input_name=\"auth_device.name\"),\n", + " BoolColumn(name=\"result\",\n", + " dtype=bool,\n", + " input_name=\"result\",\n", + " true_values=[\"success\", \"SUCCESS\"],\n", + " false_values=[\"denied\", \"DENIED\", \"FRAUD\"]),\n", + " ColumnInfo(name=\"reason\", dtype=str),\n", + " # CustomColumn(name=\"user.groups\", dtype=str, process_column_fn=partial(column_listjoin, col_name=\"user.groups\"))\n", + "]\n", + "\n", + "source_schema = DataFrameInputSchema(json_columns=[\"access_device\", \"application\", \"auth_device\", \"user\"],\n", + " column_info=source_column_info)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7a0cb0a-e65a-444a-a06c-a4525d543790", + "metadata": {}, + "outputs": [], + "source": [ + "# Preprocessing schema\n", + "preprocess_column_info = [\n", + " ColumnInfo(name=config.ae.timestamp_column_name, dtype=datetime),\n", + " ColumnInfo(name=config.ae.userid_column_name, dtype=str),\n", + " ColumnInfo(name=\"accessdevicebrowser\", dtype=str),\n", + " ColumnInfo(name=\"accessdeviceos\", dtype=str),\n", + " ColumnInfo(name=\"authdevicename\", dtype=str),\n", + " ColumnInfo(name=\"result\", dtype=bool),\n", + " ColumnInfo(name=\"reason\", dtype=str),\n", + " # Derived columns\n", + " IncrementColumn(name=\"logcount\",\n", + " 
dtype=int,\n", + " input_name=config.ae.timestamp_column_name,\n", + " groupby_column=config.ae.userid_column_name),\n", + " CustomColumn(name=\"locincrement\",\n", + " dtype=int,\n", + " process_column_fn=partial(create_increment_col, column_name=\"location\")),\n", + "]\n", + "\n", + "preprocess_schema = DataFrameInputSchema(column_info=preprocess_column_info, preserve_columns=[\"_batch_id\"])\n" + ] + }, + { + "cell_type": "markdown", + "id": "bdfc59de-dea8-4e5f-98e3-98eba1e4621d", + "metadata": {}, + "source": [ + "## Pipeline Construction\n", + "From this point on we begin constructing the stages that will make up the pipeline. To make testing easier, constructing the pipeline object, adding the stages, and running the pipeline, is provided as a single cell. The below cell can be rerun multiple times as needed for debugging.\n", + "\n", + "### Source Stage (`MultiFileSource`)\n", + "\n", + "This pipeline read input logs from one or more input files. This source stage will read all specified log files, combine them into a single `DataFrame`, and pass it into the pipeline. Once all of the logs have been read, the source completes. \n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `filenames` | List of strings | | Any files to read into the pipeline. All files will be combined into a single `DataFrame` |\n", + "\n", + "### File Batcher Stage (`DFPFileBatcherStage`)\n", + "\n", + "To improve performance, multiple small input files can be batched together into a single DataFrame for processing. This stage is responsible for determining the timestamp of input files, grouping input files into batches by time, and sending the batches to be processed into a single DataFrame. Batches of files that have been seen before will be loaded from cache resulting in increased performance. For example, when performaing a 60 day training run, 59 days can be cached with a period of `\"D\"` and retraining once per day.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `period` | `str` | `\"D\"` | The period to create batches. See `pandas` windowing frequency documentation for available options. |\n", + "| `date_conversion_func` | Function of `typing.Callable[[fsspec.core.OpenFile], datetime]` | | A callback which is responsible for determining the date for a specified file. |\n", + "\n", + "### File to DataFrame Stage (`DFPFileToDataFrameStage`)\n", + "\n", + "After files have been batched into groups, this stage is responsible for reading the files and converting into a DataFrame. The specified input schema converts the raw DataFrame into one suitable for caching and processing. Any columns that are not needed should be excluded from the schema.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `schema` | `DataFrameInputSchema` | | After the raw `DataFrame` is read from each file, this schema will be applied to ensure a consisten output from the source. |\n", + "| `file_type` | `FileTypes` | `FileTypes.Auto` | Allows overriding the file type. When set to `Auto`, the file extension will be used. Options are `CSV`, `JSON`, `Auto`. |\n", + "| `parser_kwargs` | `dict` | `{}` | This dictionary will be passed to the `DataFrame` parser class. Allows for customization of log parsing. |\n", + "| `cache_dir` | `str` | `./.cache/dfp` | The location to write cached input files to. 
|\n", + "\n", + "### Split Users Stage (`DFPSplitUsersStage`)\n", + "\n", + "Once the input logs have been read into a `DataFrame`, this stage is responsible for breaking that single `DataFrame` with many users into multiple `DataFrame`s for each user. This is also where the pipeline chooses whether to train individual users or the generic user (or both).\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `include_generic` | `bool` | | Whether or not to combine all user logs into a single `DataFrame` with the username 'generic_user' |\n", + "| `include_individual` | `bool` | | Whether or not to output individual `DataFrame` objects for each user |\n", + "| `skip_users` | List of `str` | `[]` | Any users to remove from the `DataFrame`. Useful for debugging to remove automated accounts with many logs. Mutually exclusive with `only_users`. |\n", + "| `only_users` | List of `str` | `[]` | Only allow these users in the final `DataFrame`. Useful for debugging to focus on specific users. Mutually exclusive with `skip_users`. |\n", + "\n", + "### Rolling Window Stage (`DFPRollingWindowStage`)\n", + "\n", + "The Rolling Window Stage performs several key pieces of functionality for DFP.\n", + "1. This stage keeps a moving window of logs on a per user basis\n", + " 1. These logs are saved to disk to reduce memory requirements between logs from the same user\n", + "1. It only emits logs when the window history requirements are met\n", + " 1. Until all of the window history requirements are met, no messages will be sent to the rest of the pipeline.\n", + " 1. See the below options for configuring the window history requirements\n", + "1. It repeats the necessary logs to properly calculate log dependent features.\n", + " 1. To support all column feature types, incoming log messages can be combined with existing history and sent to downstream stages.\n", + " 1. For example, to calculate a feature that increments a counter for the number of logs a particular user has generated in a single day, we must have the user's log history for the past 24 hours. To support this, this stage will combine new logs with existing history into a single `DataFrame`.\n", + " 1. It is the responsibility of downstream stages to distinguish between new logs and existing history.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `min_history` | `int` | `1` | The minimum number of logs a user must have before emitting any messages. Logs below this threshold will be saved to disk. |\n", + "| `min_increment` | `int` or `str` | `0` | Once the min history requirement is met, this stage must receive `min_increment` *new* logs before emmitting another message. Logs received before this threshold is met will be saved to disk. Can be specified as an integer count or a string duration. |\n", + "| `max_history` | `int` or `str` | `\"1d\"` | Once `min_history` and `min_increment` requirements have been met, this puts an upper bound on the maximum number of messages to forward into the pipeline and also the maximum amount of messages to retain in the history. Can be specified as an integer count or a string duration. |\n", + "| `cache_dir` | `str` | `./.cache/dfp` | The location to write log history to disk. |\n", + "\n", + "### Preprocessing Stage (`DFPPreprocessingStage`)\n", + "\n", + "This stage performs the final, row dependent, feature calculations as specified by the input schema object. 
Once calculated, this stage can forward on all received logs, or optionally can only forward on new logs, removing any history information.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `input_schema` | `DataFrameInputSchema` | | The final, row dependent, schema to apply to the incoming columns |\n", + "| `only_new_batches` | `bool` | | Whether or not to foward on all received logs, or just new logs. |\n", + "\n", + "### Inference Stage (`DFPInference`)\n", + "\n", + "This stage performs several tasks to aid in performing inference. This stage will:\n", + "1. Download models as needed from MLFlow\n", + "1. Cache previously downloaded models to improve performance\n", + " 1. Models in the cache will be periodically refreshed from MLFlow at a configured rate\n", + "1. Perform inference using the downloaded model\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `model_name_formatter` | `str` | `\"\"` | A format string to use when building the model name. The model name is constructed by calling `model_name_formatter.format(user_id=user_id)`. For example, with `model_name_formatter=\"my_model-{user_id}\"` and a user ID of `\"first:last\"` would result in the model name of `\"my_model-first:last\"`. This should match the value used in `DFPMLFlowModelWriterStage` |\n", + "\n", + "### Post Processing Stage (`DFPPostprocessingStage`)\n", + "\n", + "This stage filters the output from the inference stage for any anomalous messages. Logs which exceed the specified Z-Score will be passed onto the next stage. All remaining logs which are below the threshold will be dropped.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `z_score_threshold` | `float` | `2.0` | The Z-Score used to separate anomalous logs from normal logs. All normal logs will be filterd out and anomalous logs will be passed on. |\n", + "\n", + "### Write to File Stage (`WriteToFileStage`)\n", + "\n", + "This final stage will write all received messages to a single output file in either CSV or JSON format.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `filename` | `str` | | The file to write anomalous log messages to. |\n", + "| `overwrite` | `bool` | `False` | If the file specified in `filename` already exists, it will be overwritten if this option is set to `True` |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "825390ad-ce64-4949-b324-33039ffdf264", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a linear pipeline object\n", + "pipeline = LinearPipeline(config)\n", + "\n", + "# Source stage\n", + "pipeline.set_source(MultiFileSource(config, filenames=input_files))\n", + "\n", + "# Batch files into buckets by time. Use the default ISO date extractor from the filename\n", + "pipeline.add_stage(\n", + " DFPFileBatcherStage(config,\n", + " period=\"D\",\n", + " date_conversion_func=functools.partial(date_extractor, filename_regex=iso_date_regex)))\n", + "\n", + "# Output is S3 Buckets. Convert to DataFrames. 
This caches downloaded S3 data\n", + "pipeline.add_stage(\n", + " DFPFileToDataFrameStage(config,\n", + " schema=source_schema,\n", + " file_type=FileTypes.JSON,\n", + " parser_kwargs={\n", + " \"lines\": False, \"orient\": \"records\"\n", + " },\n", + " cache_dir=cache_dir))\n", + "\n", + "\n", + "# This will split users or just use one single user\n", + "pipeline.add_stage(\n", + " DFPSplitUsersStage(config,\n", + " include_generic=include_generic,\n", + " include_individual=include_individual,\n", + " skip_users=skip_users))\n", + "\n", + "# Next, have a stage that will create rolling windows\n", + "pipeline.add_stage(\n", + " DFPRollingWindowStage(\n", + " config,\n", + " min_history=300 if is_training else 1,\n", + " min_increment=300 if is_training else 0,\n", + " # For inference, we only ever want 1 day max\n", + " max_history=\"60d\" if is_training else \"1d\",\n", + " cache_dir=cache_dir))\n", + "\n", + "# Output is UserMessageMeta -- Cached frame set\n", + "pipeline.add_stage(DFPPreprocessingStage(config, input_schema=preprocess_schema, only_new_batches=not is_training))\n", + "\n", + "# Perform inference on the preprocessed data\n", + "pipeline.add_stage(DFPInferenceStage(config, model_name_formatter=model_name_formatter))\n", + "\n", + "# Filter for only the anomalous logs\n", + "pipeline.add_stage(DFPPostprocessingStage(config, z_score_threshold=2.0))\n", + "\n", + "# Write all anomalies to a CSV file\n", + "pipeline.add_stage(WriteToFileStage(config, filename=\"dfp_detections_duo.csv\", overwrite=True))\n", + "\n", + "# Run the pipeline\n", + "await pipeline._do_run()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75c0cf6b-8255-4d90-b67c-151518c7423b", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:morpheus] *", + "language": "python", + "name": "conda-env-morpheus-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + }, + "vscode": { + "interpreter": { + "hash": "e26783b24f020aa0bcaa00e6ba122db5d0e3da2d892d80be664969895e06a7e1" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/digital_fingerprinting/production/morpheus/dfp_duo_pipeline.py b/examples/digital_fingerprinting/production/morpheus/dfp_duo_pipeline.py new file mode 100644 index 0000000000..52cff2fa53 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp_duo_pipeline.py @@ -0,0 +1,288 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
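+
+# Example invocations (illustrative only; the input globs follow the companion
+# notebooks and may need to be adjusted for your environment):
+#
+#   # Train a single generic model from Duo logs
+#   python dfp_duo_pipeline.py --train_users generic --duration 60d \
+#       --input_file "/workspace/examples/data/dfp/duo/public_preview_final/s3/DUO_2022-08-*.json"
+#
+#   # Run inference only, scoring new logs against models previously registered in MLFlow
+#   python dfp_duo_pipeline.py --train_users none --tracking_uri http://localhost:5000 \
+#       --input_file "/workspace/examples/data/dfp/duo/public_preview_final/s3/DUO_2022-08-3*.json"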
+ +import functools +import logging +import os +import typing +from datetime import datetime +from functools import partial + +import click +import mlflow +from dfp.stages.dfp_file_batcher_stage import DFPFileBatcherStage +from dfp.stages.dfp_file_to_df import DFPFileToDataFrameStage +from dfp.stages.dfp_inference_stage import DFPInferenceStage +from dfp.stages.dfp_mlflow_model_writer import DFPMLFlowModelWriterStage +from dfp.stages.dfp_postprocessing_stage import DFPPostprocessingStage +from dfp.stages.dfp_preprocessing_stage import DFPPreprocessingStage +from dfp.stages.dfp_rolling_window_stage import DFPRollingWindowStage +from dfp.stages.dfp_split_users_stage import DFPSplitUsersStage +from dfp.stages.dfp_training import DFPTraining +from dfp.stages.multi_file_source import MultiFileSource +from dfp.utils.column_info import BoolColumn +from dfp.utils.column_info import ColumnInfo +from dfp.utils.column_info import CustomColumn +from dfp.utils.column_info import DataFrameInputSchema +from dfp.utils.column_info import DateTimeColumn +from dfp.utils.column_info import IncrementColumn +from dfp.utils.column_info import RenameColumn +from dfp.utils.column_info import StringCatColumn +from dfp.utils.column_info import create_increment_col +from dfp.utils.file_utils import date_extractor +from dfp.utils.file_utils import iso_date_regex + +from morpheus._lib.file_types import FileTypes +from morpheus.cli.utils import get_package_relative_file +from morpheus.cli.utils import load_labels_file +from morpheus.config import Config +from morpheus.config import ConfigAutoEncoder +from morpheus.config import CppConfig +from morpheus.pipeline import LinearPipeline +from morpheus.stages.general.monitor_stage import MonitorStage +from morpheus.stages.output.write_to_file_stage import WriteToFileStage +from morpheus.utils.logger import configure_logging +from morpheus.utils.logger import get_log_levels +from morpheus.utils.logger import parse_log_level + + +@click.command() +@click.option( + "--train_users", + type=click.Choice(["all", "generic", "individual", "none"], case_sensitive=False), + help="Indicates whether or not to train per user or a generic model for all users", +) +@click.option( + "--skip_user", + multiple=True, + type=str, + help="User IDs to skip. Mutually exclusive with only_user", +) +@click.option( + "--only_user", + multiple=True, + type=str, + help="Only users specified by this option will be included. Mutually exclusive with skip_user", +) +@click.option( + "--duration", + type=str, + default="60d", + help="The duration to run starting from now", +) +@click.option( + "--cache_dir", + type=str, + default="./.cache/dfp", + show_envvar=True, + help="The location to cache data such as S3 downloads and pre-processed data", +) +@click.option("--log_level", + default=logging.getLevelName(Config().log_level), + type=click.Choice(get_log_levels(), case_sensitive=False), + callback=parse_log_level, + help="Specify the logging level to use.") +@click.option("--sample_rate_s", + type=int, + default=0, + show_envvar=True, + help="Minimum time step, in milliseconds, between object logs.") +@click.option( + "--input_file", + "-f", + type=str, + multiple=True, + help=("List of files to process. Can specificy multiple arguments for multiple files. " + "Also accepts glob (*) wildcards and schema prefixes such as `s3://`. " + "For example, to make a local cache of an s3 bucket, use `filecache::s3://mybucket/*`. 
" + "See fsspec documentation for list of possible options."), +) +@click.option('--tracking_uri', + type=str, + default="http://localhost:5000", + help=("The ML Flow tracking URI to connect to the tracking backend. If not speficied, MF Flow will use " + "'file:///mlruns' relative to the current directory")) +def run_pipeline(train_users, + skip_user: typing.Tuple[str], + only_user: typing.Tuple[str], + duration, + cache_dir, + log_level, + sample_rate_s, + **kwargs): + # To include the generic, we must be training all or generic + include_generic = train_users == "all" or train_users == "generic" + + # To include individual, we must be either training or inferring + include_individual = train_users != "generic" + + # None indicates we arent training anything + is_training = train_users != "none" + + skip_users = list(skip_user) + only_users = list(only_user) + + # Enable the Morpheus logger + configure_logging(log_level=log_level) + + if (len(skip_users) > 0 and len(only_users) > 0): + logging.error("Option --skip_user and --only_user are mutually exclusive. Exiting") + + logger = logging.getLogger("morpheus.{}".format(__name__)) + + logger.info("Running training pipeline with the following options: ") + logger.info("Train generic_user: %s", include_generic) + logger.info("Skipping users: %s", skip_users) + logger.info("Duration: %s", duration) + logger.info("Cache Dir: %s", cache_dir) + + if ("tracking_uri" in kwargs): + # Initialize ML Flow + mlflow.set_tracking_uri(kwargs["tracking_uri"]) + logger.info("Tracking URI: %s", mlflow.get_tracking_uri()) + + config = Config() + + CppConfig.set_should_use_cpp(False) + + config.num_threads = os.cpu_count() + + config.ae = ConfigAutoEncoder() + + config.ae.feature_columns = load_labels_file(get_package_relative_file("data/columns_ae_duo.txt")) + config.ae.userid_column_name = "username" + config.ae.timestamp_column_name = "timestamp" + + # Specify the column names to ensure all data is uniform + source_column_info = [ + DateTimeColumn(name=config.ae.timestamp_column_name, dtype=datetime, input_name="timestamp"), + RenameColumn(name=config.ae.userid_column_name, dtype=str, input_name="user.name"), + RenameColumn(name="accessdevicebrowser", dtype=str, input_name="access_device.browser"), + RenameColumn(name="accessdeviceos", dtype=str, input_name="access_device.os"), + StringCatColumn(name="location", + dtype=str, + input_columns=[ + "access_device.location.city", + "access_device.location.state", + "access_device.location.country" + ], + sep=", "), + RenameColumn(name="authdevicename", dtype=str, input_name="auth_device.name"), + BoolColumn(name="result", + dtype=bool, + input_name="result", + true_values=["success", "SUCCESS"], + false_values=["denied", "DENIED", "FRAUD"]), + ColumnInfo(name="reason", dtype=str), + ] + + source_schema = DataFrameInputSchema(json_columns=["access_device", "application", "auth_device", "user"], + column_info=source_column_info) + + # Preprocessing schema + preprocess_column_info = [ + ColumnInfo(name=config.ae.timestamp_column_name, dtype=datetime), + ColumnInfo(name=config.ae.userid_column_name, dtype=str), + ColumnInfo(name="accessdevicebrowser", dtype=str), + ColumnInfo(name="accessdeviceos", dtype=str), + ColumnInfo(name="authdevicename", dtype=str), + ColumnInfo(name="result", dtype=bool), + ColumnInfo(name="reason", dtype=str), + # Derived columns + IncrementColumn(name="logcount", + dtype=int, + input_name=config.ae.timestamp_column_name, + groupby_column=config.ae.userid_column_name), + 
CustomColumn(name="locincrement", + dtype=int, + process_column_fn=partial(create_increment_col, column_name="location")), + ] + + preprocess_schema = DataFrameInputSchema(column_info=preprocess_column_info, preserve_columns=["_batch_id"]) + + # Create a linear pipeline object + pipeline = LinearPipeline(config) + + pipeline.set_source(MultiFileSource(config, filenames=list(kwargs["input_file"]))) + + # Batch files into buckets by time. Use the default ISO date extractor from the filename + pipeline.add_stage( + DFPFileBatcherStage(config, + period="D", + sampling_rate_s=sample_rate_s, + date_conversion_func=functools.partial(date_extractor, filename_regex=iso_date_regex))) + + # Output is S3 Buckets. Convert to DataFrames. This caches downloaded S3 data + pipeline.add_stage( + DFPFileToDataFrameStage(config, + schema=source_schema, + file_type=FileTypes.JSON, + parser_kwargs={ + "lines": False, "orient": "records" + }, + cache_dir=cache_dir)) + + pipeline.add_stage(MonitorStage(config, description="Input data rate")) + + # This will split users or just use one single user + pipeline.add_stage( + DFPSplitUsersStage(config, + include_generic=include_generic, + include_individual=include_individual, + skip_users=skip_users, + only_users=only_users)) + + # Next, have a stage that will create rolling windows + pipeline.add_stage( + DFPRollingWindowStage( + config, + min_history=300 if is_training else 1, + min_increment=300 if is_training else 0, + # For inference, we only ever want 1 day max + max_history="60d" if is_training else "1d", + cache_dir=cache_dir)) + + # Output is UserMessageMeta -- Cached frame set + pipeline.add_stage(DFPPreprocessingStage(config, input_schema=preprocess_schema, only_new_batches=not is_training)) + + model_name_formatter = "DFP-duo-{user_id}" + experiment_name_formatter = "dfp/duo/training/{reg_model_name}" + + if (is_training): + + # Finally, perform training which will output a model + pipeline.add_stage(DFPTraining(config)) + + pipeline.add_stage(MonitorStage(config, description="Training rate", smoothing=0.001)) + + # Write that model to MLFlow + pipeline.add_stage( + DFPMLFlowModelWriterStage(config, + model_name_formatter=model_name_formatter, + experiment_name_formatter=experiment_name_formatter)) + else: + pipeline.add_stage(DFPInferenceStage(config, model_name_formatter=model_name_formatter)) + + pipeline.add_stage(MonitorStage(config, description="Inference rate", smoothing=0.001)) + + pipeline.add_stage(DFPPostprocessingStage(config, z_score_threshold=2.0)) + + pipeline.add_stage(WriteToFileStage(config, filename="dfp_detections_duo.csv", overwrite=True)) + + # Run the pipeline + pipeline.run() + + +if __name__ == "__main__": + run_pipeline(obj={}, auto_envvar_prefix='DFP', show_default=True, prog_name="dfp") diff --git a/examples/digital_fingerprinting/production/morpheus/dfp_duo_training.ipynb b/examples/digital_fingerprinting/production/morpheus/dfp_duo_training.ipynb new file mode 100644 index 0000000000..744c2f19b4 --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/dfp_duo_training.ipynb @@ -0,0 +1,452 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2941e94f-db20-44a5-ab87-2cab499825f7", + "metadata": {}, + "source": [ + "# Digital Finger Printing (DFP) with Morpheus - DUO Training\n", + "## Introduction\n", + "\n", + "In this notebook, we will be building and running a DFP pipeline that performs training on Duo authentication logs. 
The goal is to train an autoencoder PyTorch model to recognize the patterns of users in the sample data. The model will then be used by a second Morpheus pipeline to generate anomaly scores for each individual log. These anomaly scores can be used by security teams to detect abnormal behavior when it happens so the proper action can be taken.\n",
+    "\n",
+    "
\n", + "Note: For more information on DFP, the Morpheus pipeline, and setup steps to run this notebook, please see the coresponding DFP training materials.\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6c1cb50-74f2-445d-b865-8c22c3b3798b", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "# Ensure that the morpheus directory is in the python path. This may not need to be run depending on the environment setup\n", + "import sys\n", + "import os\n", + "sys.path.insert(0, os.path.abspath(\"./morpheus\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "102ce011-3ca3-4f96-a72d-de28fad32003", + "metadata": {}, + "outputs": [], + "source": [ + "import functools\n", + "import logging\n", + "import os\n", + "import typing\n", + "from datetime import datetime\n", + "from functools import partial\n", + "\n", + "import click\n", + "import mlflow\n", + "from dfp.stages.dfp_file_batcher_stage import DFPFileBatcherStage\n", + "from dfp.stages.dfp_file_to_df import DFPFileToDataFrameStage\n", + "from dfp.stages.dfp_inference_stage import DFPInferenceStage\n", + "from dfp.stages.dfp_mlflow_model_writer import DFPMLFlowModelWriterStage\n", + "from dfp.stages.dfp_postprocessing_stage import DFPPostprocessingStage\n", + "from dfp.stages.dfp_preprocessing_stage import DFPPreprocessingStage\n", + "from dfp.stages.dfp_rolling_window_stage import DFPRollingWindowStage\n", + "from dfp.stages.dfp_split_users_stage import DFPSplitUsersStage\n", + "from dfp.stages.dfp_training import DFPTraining\n", + "from dfp.stages.multi_file_source import MultiFileSource\n", + "from dfp.utils.column_info import BoolColumn\n", + "from dfp.utils.column_info import ColumnInfo\n", + "from dfp.utils.column_info import CustomColumn\n", + "from dfp.utils.column_info import DataFrameInputSchema\n", + "from dfp.utils.column_info import DateTimeColumn\n", + "from dfp.utils.column_info import IncrementColumn\n", + "from dfp.utils.column_info import RenameColumn\n", + "from dfp.utils.column_info import StringCatColumn\n", + "from dfp.utils.column_info import create_increment_col\n", + "from dfp.utils.file_utils import date_extractor\n", + "from dfp.utils.file_utils import iso_date_regex\n", + "\n", + "from morpheus._lib.file_types import FileTypes\n", + "from morpheus.cli.utils import get_package_relative_file\n", + "from morpheus.cli.utils import load_labels_file\n", + "from morpheus.config import Config\n", + "from morpheus.config import ConfigAutoEncoder\n", + "from morpheus.config import CppConfig\n", + "from morpheus.pipeline import LinearPipeline\n", + "from morpheus.stages.general.monitor_stage import MonitorStage\n", + "from morpheus.stages.output.write_to_file_stage import WriteToFileStage\n", + "from morpheus.utils.logger import configure_logging\n", + "from morpheus.utils.logger import get_log_levels\n", + "from morpheus.utils.logger import parse_log_level\n", + "\n", + "# Left align all tables\n", + "from IPython.core.display import HTML\n", + "table_css = 'table {align:left;display:block}'\n", + "HTML(''.format(table_css))" + ] + }, + { + "cell_type": "markdown", + "id": "ca38c1b7-ce84-43e0-ac53-280562dc1642", + "metadata": {}, + "source": [ + "## High Level Configuration\n", + "\n", + "The following options significantly alter the functionality of the pipeline. These options are separated from the individual stage options since they may effect more than one stage. 
Additionally, the matching python script to this notebook, `dfp_duo_pipeline.py`, configures these options via command line arguments.\n",
+    "\n",
+    "### Options\n",
+    "\n",
+    "| Name | Type | Description |\n",
+    "| --- | --- | :-- |\n",
+    "| `train_users` | One of `[\"all\", \"generic\", \"individual\"]` | This indicates which users to train for this pipeline:
  • `\"generic\"`: Combine all users into a single model with the username 'generic_user'. Skips individual users.
  • `\"individual\"`: Trains a separate model for each individual user. Skips 'generic_user'.
  • `\"all\"`: Combination of `\"generic\"` and `\"individual\"`. Both the 'generic_user' and individual users are trained in the same pipeline.
|\n", + "| `skip_users` | List of strings | Any user in this list will be dropped from the pipeline. Useful for debugging to remove automated accounts with many logs. |\n", + "| `cache_dir` | string | The location to store cached files. To aid with development and reduce bandwidth, the Morpheus pipeline will cache data from several stages of the pipeline. This option configures the location for those caches. |\n", + "| `input_files` | List of strings | List of files to process. Can specificy multiple arguments for multiple files. Also accepts glob (\\*) wildcards and schema prefixes such as `s3://`. For example, to make a local cache of an s3 bucket, use `filecache::s3://mybucket/*`. See `fsspec` documentation for list of possible options. |\n", + "| `model_name_formatter` | string | A format string to use when building the model name. The model name is constructed by calling `model_name_formatter.format(user_id=user_id)`. For example, with `model_name_formatter=\"my_model-{user_id}\"` and a user ID of `\"first:last\"` would result in the model name of `\"my_model-first:last\"`. This should match the value used in `DFPMLFlowModelWriterStage`. Available keyword arguments: `user_id`, `user_md5`. |\n", + "| `experiment_name_formatter` | string | A format string (without the `f`) that will be used when creating an experiment in ML Flow. Available keyword arguments: `user_id`, `user_md5`, `reg_model_name`. |\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ee00703-75c5-46fc-890c-86733da906c4", + "metadata": {}, + "outputs": [], + "source": [ + "# Global options\n", + "train_users = \"all\"\n", + "\n", + "# Enter any users to skip here\n", + "skip_users: typing.List[str] = []\n", + "\n", + "# Location where cache objects will be saved\n", + "cache_dir = \"/workspace/.cache/dfp\"\n", + "\n", + "# Input files to read from\n", + "input_files = [\n", + " \"/workspace/examples/data/dfp/duo/public_preview_final/s3/DUO_2022-08-0*.json\",\n", + " \"/workspace/examples/data/dfp/duo/public_preview_final/s3/DUO_2022-08-1*.json\",\n", + " \"/workspace/examples/data/dfp/duo/public_preview_final/s3/DUO_2022-08-2*.json\",\n", + "]\n", + "\n", + "# The format to use for models\n", + "model_name_formatter = \"DFP-duo-{user_id}\"\n", + "\n", + "# The format to use for experiment names\n", + "experiment_name_formatter = \"dfp/duo/training/{reg_model_name}\"\n", + "\n", + "# === Derived Options ===\n", + "# To include the generic, we must be training all or generic\n", + "include_generic = train_users == \"all\" or train_users == \"generic\"\n", + "\n", + "# To include individual, we must be either training or inferring\n", + "include_individual = train_users != \"generic\"\n", + "\n", + "# None indicates we arent training anything\n", + "is_training = train_users != \"none\"" + ] + }, + { + "cell_type": "markdown", + "id": "1cfc24c9-c85e-4977-a348-692c8f0aceaa", + "metadata": {}, + "source": [ + "### Global Config Object\n", + "Before creating the pipeline, we need to setup logging and set the parameters for the Morpheus config object. This config object is responsible for the following:\n", + " - Indicating whether to use C++ or Python stages\n", + " - C++ stages are not supported for the DFP pipeline. This should always be `False`\n", + " - Setting the number of threads to use in the pipeline. 
Defaults to the thread count of the OS.\n", + " - Sets the feature column names that will be used in model training\n", + " - This option allows extra columns to be used in the pipeline that will not be part of the training algorithm.\n", + " - The final features that the model will be trained on will be an intersection of this list with the log columns.\n", + " - The column name that indicates the user's unique identifier\n", + " - It is required for DFP to have a user ID column\n", + " - The column name that indicates the timestamp for the log\n", + " - It is required for DFP to know when each log occurred" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "01abd537-9162-49dc-8e83-d9465592f1d5", + "metadata": {}, + "outputs": [], + "source": [ + "# Enable the Morpheus logger\n", + "configure_logging(log_level=logging.DEBUG)\n", + "\n", + "config = Config()\n", + "\n", + "CppConfig.set_should_use_cpp(False)\n", + "\n", + "config.num_threads = os.cpu_count()\n", + "\n", + "config.ae = ConfigAutoEncoder()\n", + "\n", + "config.ae.feature_columns = [\n", + " 'accessdevicebrowser', 'accessdeviceos', 'authdevicename', 'result', 'reason', 'logcount', \"locincrement\"\n", + "]\n", + "config.ae.userid_column_name = \"username\"\n", + "config.ae.timestamp_column_name = \"timestamp\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a73a4d53-32b6-4ab8-a5d7-c0104b31c69b", + "metadata": {}, + "outputs": [], + "source": [ + "# Specify the column names to ensure all data is uniform\n", + "source_column_info = [\n", + " DateTimeColumn(name=config.ae.timestamp_column_name, dtype=datetime, input_name=\"timestamp\"),\n", + " RenameColumn(name=config.ae.userid_column_name, dtype=str, input_name=\"user.name\"),\n", + " RenameColumn(name=\"accessdevicebrowser\", dtype=str, input_name=\"access_device.browser\"),\n", + " RenameColumn(name=\"accessdeviceos\", dtype=str, input_name=\"access_device.os\"),\n", + " StringCatColumn(name=\"location\",\n", + " dtype=str,\n", + " input_columns=[\n", + " \"access_device.location.city\",\n", + " \"access_device.location.state\",\n", + " \"access_device.location.country\"\n", + " ],\n", + " sep=\", \"),\n", + " RenameColumn(name=\"authdevicename\", dtype=str, input_name=\"auth_device.name\"),\n", + " BoolColumn(name=\"result\",\n", + " dtype=bool,\n", + " input_name=\"result\",\n", + " true_values=[\"success\", \"SUCCESS\"],\n", + " false_values=[\"denied\", \"DENIED\", \"FRAUD\"]),\n", + " ColumnInfo(name=\"reason\", dtype=str),\n", + " # CustomColumn(name=\"user.groups\", dtype=str, process_column_fn=partial(column_listjoin, col_name=\"user.groups\"))\n", + "]\n", + "\n", + "source_schema = DataFrameInputSchema(json_columns=[\"access_device\", \"application\", \"auth_device\", \"user\"],\n", + " column_info=source_column_info)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7a0cb0a-e65a-444a-a06c-a4525d543790", + "metadata": {}, + "outputs": [], + "source": [ + "# Preprocessing schema\n", + "preprocess_column_info = [\n", + " ColumnInfo(name=config.ae.timestamp_column_name, dtype=datetime),\n", + " ColumnInfo(name=config.ae.userid_column_name, dtype=str),\n", + " ColumnInfo(name=\"accessdevicebrowser\", dtype=str),\n", + " ColumnInfo(name=\"accessdeviceos\", dtype=str),\n", + " ColumnInfo(name=\"authdevicename\", dtype=str),\n", + " ColumnInfo(name=\"result\", dtype=bool),\n", + " ColumnInfo(name=\"reason\", dtype=str),\n", + " # Derived columns\n", + " IncrementColumn(name=\"logcount\",\n", + " 
dtype=int,\n", + " input_name=config.ae.timestamp_column_name,\n", + " groupby_column=config.ae.userid_column_name),\n", + " CustomColumn(name=\"locincrement\",\n", + " dtype=int,\n", + " process_column_fn=partial(create_increment_col, column_name=\"location\")),\n", + "]\n", + "\n", + "preprocess_schema = DataFrameInputSchema(column_info=preprocess_column_info, preserve_columns=[\"_batch_id\"])\n" + ] + }, + { + "cell_type": "markdown", + "id": "bdfc59de-dea8-4e5f-98e3-98eba1e4621d", + "metadata": {}, + "source": [ + "## Pipeline Construction\n", + "From this point on we begin constructing the stages that will make up the pipeline. To make testing easier, constructing the pipeline object, adding the stages, and running the pipeline, is provided as a single cell. The below cell can be rerun multiple times as needed for debugging.\n", + "\n", + "### Source Stage (`MultiFileSource`)\n", + "\n", + "This pipeline read input logs from one or more input files. This source stage will construct a list of files to be processed and pass to downstream stages. It is capable of reading files from many different source types, both local and remote. This is possible by utilizing the `fsspec` library (similar to `pandas`). See the [`fsspec`](https://filesystem-spec.readthedocs.io/) documentation for more information on the supported file types. Once all of the logs have been read, the source completes. \n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `filenames` | List of strings | | Any files to read into the pipeline. All files will be combined into a single `DataFrame` |\n", + "\n", + "### File Batcher Stage (`DFPFileBatcherStage`)\n", + "\n", + "To improve performance, multiple small input files can be batched together into a single DataFrame for processing. This stage is responsible for determining the timestamp of input files, grouping input files into batches by time, and sending the batches to be processed into a single DataFrame. Batches of files that have been seen before will be loaded from cache resulting in increased performance. For example, when performaing a 60 day training run, 59 days can be cached with a period of `\"D\"` and retraining once per day.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `period` | `str` | `\"D\"` | The period to create batches. See `pandas` windowing frequency documentation for available options. |\n", + "| `date_conversion_func` | Function of `typing.Callable[[fsspec.core.OpenFile], datetime]` | | A callback which is responsible for determining the date for a specified file. |\n", + "\n", + "### File to DataFrame Stage (`DFPFileToDataFrameStage`)\n", + "\n", + "After files have been batched into groups, this stage is responsible for reading the files and converting into a DataFrame. The specified input schema converts the raw DataFrame into one suitable for caching and processing. Any columns that are not needed should be excluded from the schema.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `schema` | `DataFrameInputSchema` | | After the raw `DataFrame` is read from each file, this schema will be applied to ensure a consisten output from the source. |\n", + "| `file_type` | `FileTypes` | `FileTypes.Auto` | Allows overriding the file type. When set to `Auto`, the file extension will be used. Options are `CSV`, `JSON`, `Auto`. 
|\n", + "| `parser_kwargs` | `dict` | `{}` | This dictionary will be passed to the `DataFrame` parser class. Allows for customization of log parsing. |\n", + "| `cache_dir` | `str` | `./.cache/dfp` | The location to write cached input files to. |\n", + "\n", + "### Split Users Stage (`DFPSplitUsersStage`)\n", + "\n", + "Once the input logs have been read into a `DataFrame`, this stage is responsible for breaking that single `DataFrame` with many users into multiple `DataFrame`s for each user. This is also where the pipeline chooses whether to train individual users or the generic user (or both).\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `include_generic` | `bool` | | Whether or not to combine all user logs into a single `DataFrame` with the username 'generic_user' |\n", + "| `include_individual` | `bool` | | Whether or not to output individual `DataFrame` objects for each user |\n", + "| `skip_users` | List of `str` | `[]` | Any users to remove from the `DataFrame`. Useful for debugging to remove automated accounts with many logs. Mutually exclusive with `only_users`. |\n", + "| `only_users` | List of `str` | `[]` | Only allow these users in the final `DataFrame`. Useful for debugging to focus on specific users. Mutually exclusive with `skip_users`. |\n", + "\n", + "### Rolling Window Stage (`DFPRollingWindowStage`)\n", + "\n", + "The Rolling Window Stage performs several key pieces of functionality for DFP.\n", + "1. This stage keeps a moving window of logs on a per user basis\n", + " 1. These logs are saved to disk to reduce memory requirements between logs from the same user\n", + "1. It only emits logs when the window history requirements are met\n", + " 1. Until all of the window history requirements are met, no messages will be sent to the rest of the pipeline.\n", + " 1. See the below options for configuring the window history requirements\n", + "1. It repeats the necessary logs to properly calculate log dependent features.\n", + " 1. To support all column feature types, incoming log messages can be combined with existing history and sent to downstream stages.\n", + " 1. For example, to calculate a feature that increments a counter for the number of logs a particular user has generated in a single day, we must have the user's log history for the past 24 hours. To support this, this stage will combine new logs with existing history into a single `DataFrame`.\n", + " 1. It is the responsibility of downstream stages to distinguish between new logs and existing history.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `min_history` | `int` | `300` | The minimum number of logs a user must have before emitting any messages. Logs below this threshold will be saved to disk. |\n", + "| `min_increment` | `int` or `str` | `300` | Once the min history requirement is met, this stage must receive `min_increment` *new* logs before emmitting another message. Logs received before this threshold is met will be saved to disk. Can be specified as an integer count or a string duration. |\n", + "| `max_history` | `int` or `str` | `\"60d\"` | Once `min_history` and `min_increment` requirements have been met, this puts an upper bound on the maximum number of messages to forward into the pipeline and also the maximum amount of messages to retain in the history. Can be specified as an integer count or a string duration. 
|\n", + "| `cache_dir` | `str` | `./.cache/dfp` | The location to write log history to disk. |\n", + "\n", + "### Preprocessing Stage (`DFPPreprocessingStage`)\n", + "\n", + "This stage performs the final, row dependent, feature calculations as specified by the input schema object. Once calculated, this stage can forward on all received logs, or optionally can only forward on new logs, removing any history information.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `input_schema` | `DataFrameInputSchema` | | The final, row dependent, schema to apply to the incoming columns |\n", + "| `only_new_batches` | `bool` | | Whether or not to foward on all received logs, or just new logs. |\n", + "\n", + "### Training Stage (`DFPTraining`)\n", + "\n", + "This stage is responsible for performing the actual training calculations. Training will be performed on all received data. Resulting message will contain the input data paired with the trained model.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `model_kwargs` | `dict` | `{}` | The options to use when creating a new model instance. See `DFPAutoEncoder` for information on the available options. |\n", + "\n", + "### MLFlow Model Writer Stage (`DFPMLFlowModelWriterStage`)\n", + "\n", + "This stage is the last step in training. It will upload the trained model from the previous stage to MLFlow. The tracking URI for which MLFlow instance to use is configured using the static method `mlflow.set_tracking_uri()`.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `model_name_formatter` | `str` | `\"\"` | A format string to use when building the model name. The model name is constructed by calling `model_name_formatter.format(user_id=user_id)`. For example, with `model_name_formatter=\"my_model-{user_id}\"` and a user ID of `\"first:last\"` would result in the model name of `\"my_model-first:last\"` |\n", + "| `experiment_name` | `str` | | All models are created inside of an experiment to allow metrics to be saved with each model. This option specifies the experiment name. The final experiment name for each model will be in the form of `{experiment_name}/{model_name}` |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "825390ad-ce64-4949-b324-33039ffdf264", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a linear pipeline object\n", + "pipeline = LinearPipeline(config)\n", + "\n", + "# Source stage\n", + "pipeline.set_source(MultiFileSource(config, filenames=input_files))\n", + "\n", + "# Batch files into buckets by time. Use the default ISO date extractor from the filename\n", + "pipeline.add_stage(\n", + " DFPFileBatcherStage(config,\n", + " period=\"D\",\n", + " date_conversion_func=functools.partial(date_extractor, filename_regex=iso_date_regex)))\n", + "\n", + "# Output is S3 Buckets. Convert to DataFrames. 
This caches downloaded S3 data\n", + "pipeline.add_stage(\n", + " DFPFileToDataFrameStage(config,\n", + " schema=source_schema,\n", + " file_type=FileTypes.JSON,\n", + " parser_kwargs={\n", + " \"lines\": False, \"orient\": \"records\"\n", + " },\n", + " cache_dir=cache_dir))\n", + "\n", + "\n", + "# This will split users or just use one single user\n", + "pipeline.add_stage(\n", + " DFPSplitUsersStage(config,\n", + " include_generic=include_generic,\n", + " include_individual=include_individual,\n", + " skip_users=skip_users))\n", + "\n", + "# Next, have a stage that will create rolling windows\n", + "pipeline.add_stage(\n", + " DFPRollingWindowStage(\n", + " config,\n", + " min_history=300 if is_training else 1,\n", + " min_increment=300 if is_training else 0,\n", + " # For inference, we only ever want 1 day max\n", + " max_history=\"60d\" if is_training else \"1d\",\n", + " cache_dir=cache_dir))\n", + "\n", + "# Output is UserMessageMeta -- Cached frame set\n", + "pipeline.add_stage(DFPPreprocessingStage(config, input_schema=preprocess_schema, only_new_batches=not is_training))\n", + "\n", + "# Finally, perform training which will output a model\n", + "pipeline.add_stage(DFPTraining(config))\n", + "\n", + "# Write that model to MLFlow\n", + "pipeline.add_stage(\n", + " DFPMLFlowModelWriterStage(config,\n", + " model_name_formatter=model_name_formatter,\n", + " experiment_name_formatter=experiment_name_formatter))\n", + "\n", + "# Run the pipeline\n", + "await pipeline._do_run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6eed0657-6f4b-4f21-97fa-051eeb7f4fee", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:morpheus] *", + "language": "python", + "name": "conda-env-morpheus-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + }, + "vscode": { + "interpreter": { + "hash": "e26783b24f020aa0bcaa00e6ba122db5d0e3da2d892d80be664969895e06a7e1" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/digital_fingerprinting/production/morpheus/launch.sh b/examples/digital_fingerprinting/production/morpheus/launch.sh new file mode 100755 index 0000000000..da1e8ee77f --- /dev/null +++ b/examples/digital_fingerprinting/production/morpheus/launch.sh @@ -0,0 +1,17 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
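+
+# Example invocation (illustrative only; the exact flags accepted by
+# dfp_pipeline_duo.py are not shown here, so treat these as placeholders).
+# Any arguments given to this script are forwarded unchanged via "$@":
+#
+#   ./launch.sh --help
+#   ./launch.sh <dfp_pipeline_duo.py options>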
+ +# Run the training forwarding any args +python dfp_pipeline_duo.py "$@" diff --git a/examples/digital_fingerprinting/starter/README.md b/examples/digital_fingerprinting/starter/README.md new file mode 100644 index 0000000000..f5c1da88e3 --- /dev/null +++ b/examples/digital_fingerprinting/starter/README.md @@ -0,0 +1,291 @@ + + +# "Starter" Digital Fingerprinting Pipeline + +We show here how to set up and run the DFP pipeline for three log types: CloudTrail, Duo and Azure. Each of these log types uses a built-in source stage that handles that specific data format. New source stages can be added to allow the DFP pipeline to process different log types. All stages after the source stages are identical across all log types but can be configured differently via pipeline or stage configuration options. + +## Environment Setup + +Follow the instructions [here](https://github.com/nv-morpheus/Morpheus/blob/branch-22.09/CONTRIBUTING.md) to set up your development environment in either a Docker container or conda environment. + +## Morpheus CLI + +DFP pipelines can be constructed and run using the Morpheus CLI command `morpheus run pipeline-ae ...` + +Use `--help` to display information about the autoencoder pipeline command line options: + +``` +morpheus run pipeline-ae --help + +Usage: morpheus run pipeline-ae [OPTIONS] COMMAND1 [ARGS]... [COMMAND2 + [ARGS]...]... + + Configure and run the pipeline. To configure the pipeline, list the stages + in the order that data should flow. The output of each stage will become the + input for the next stage. For example, to read, classify and write to a + file, the following stages could be used + + pipeline from-file --filename=my_dataset.json deserialize preprocess inf-triton --model_name=my_model + --server_url=localhost:8001 filter --threshold=0.5 to-file --filename=classifications.json + + Pipelines must follow a few rules: + 1. Data must originate in a source stage. Current options are `from-file` or `from-kafka` + 2. A `deserialize` stage must be placed between the source stages and the rest of the pipeline + 3. Only one inference stage can be used. Zero is also fine + 4. The following stages must come after an inference stage: `add-class`, `filter`, `gen-viz` + +Options: + --columns_file FILE [default: ./morpheus/data/columns_ae_cloudtrail.txt] + --labels_file FILE Specifies a file to read labels from in + order to convert class IDs into labels. A + label file is a simple text file where each + line corresponds to a label. If unspecified, + only a single output label is created for + FIL + --userid_column_name TEXT Which column to use as the User ID. + [default: userIdentityaccountId; required] + --userid_filter TEXT Specifying this value will filter all + incoming data to only use rows with matching + User IDs. Which column is used for the User + ID is specified by `userid_column_name` + --feature_scaler TEXT Autoencoder feature scaler [default: + standard] + --use_generic_model BOOLEAN Whether to use a generic model when user does + not have minimum number of training rows + [default: False] + --viz_file FILE Save a visualization of the pipeline at the + specified location + --help Show this message and exit. 
+ +Commands: + add-class Add detected classifications to each message + add-scores Add probability scores to each message + buffer (Deprecated) Buffer results + delay (Deprecated) Delay results for a certain duration + filter Filter message by a classification threshold + from-azure Load messages from an Azure directory + from-cloudtrail Load messages from a Cloudtrail directory + from-duo Load messages from a Duo directory + gen-viz (Deprecated) Write out visualization data frames + inf-pytorch Perform inference with PyTorch + inf-triton Perform inference with Triton + monitor Display throughput numbers at a specific point in the pipeline + preprocess Convert messages to tokens + serialize Include & exclude columns from messages + timeseries Perform time series anomaly detection and add prediction. + to-file Write all messages to a file + to-kafka Write all messages to a Kafka cluster + train-ae Deserialize source data from JSON + validate Validates pipeline output against an expected output +``` +The commands above correspond to the Morpheus stages that can be used to construct your DFP pipeline. Options are available to configure the pipeline and stages. +The following table shows the mapping between the main Morpheus CLI commands and the underlying Morpheus Python stage classes: + +| CLI Command | Stage Class | Python File | +| ---------------| -------------------------| --------------------------------------------------------- +| from-azure | AzureSourceStage | morpheus/stages/input/azure_source_stage.py +| from-cloudtrail| CloudTrailSourceStage | morpheus/stages/input/cloud_trail_source_stage.py +| from-duo | DuoSourceStage | morpheus/stages/input/duo_source_stage.py +| train-ae | TrainAEStage | morpheus/stages/preprocess/train_ae_stage.py +| preprocess | PreprocessAEStage | morpheus/stages/preprocess/preprocess_ae_stage.py +| inf-pytorch | AutoEncoderInferenceStage| morpheus/stages/inference/auto_encoder_inference_stage.py +| add-scores | AddScoresStage | morpheus/stages/postprocess/add_scores_stage.py +| serialize | SerializeStage | morpheus/stages/postprocess/serialize_stage.py +| to-file | WriteToFileStage | morpheus/stages/output/write_to_file_stage.py + + +## Morpheus DFP Stages + +**Source stages** - These include `AzureSourceStage`, `CloudTrailSourceStage` and `DuoSourceStage`. They are responsible for reading log file(s) that match the provided `--input_glob` (e.g. `/duo_logs/*.json`). Data is grouped by user so that each batch processed by the pipeline will only contain rows corresponding to a single user. Feature engineering also happens in this stage. All DFP source stages must extend `AutoencoderSourceStage` and implement the `files_to_dfs_per_user` abstract method. Feature columns can be managed by overriding the `derive_features` method. Otherwise, all columns from input data pass through to the next stage. A minimal sketch of a custom source stage is shown below. + +**Preprocessing stages** + +`TrainAEStage` can either train user models using data matching a provided `--train_data_glob` or load pre-trained models from file using `--pretrained_filename`. When using `--train_data_glob`, user models can be saved using the `--models_output_filename` option. The `--source_stage_class` must also be used with `--train_data_glob` so that the training stage knows how to read the training data. The autoencoder implementation from this [fork](https://github.com/efajardo-nv/dfencoder/tree/morpheus-22.08) is used for user model training.
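+
+As a rough sketch of the source-stage extension points described above, the example below shows the general shape of a custom DFP source stage. It is illustrative only: the method names come from the description above, but the exact signatures, decorators and import path are assumptions and should be verified against `AutoencoderSourceStage` in the Morpheus source.
+
+```python
+import typing
+
+import pandas as pd
+
+# Assumed import path for the base class.
+from morpheus.stages.input.autoencoder_source_stage import AutoencoderSourceStage
+
+
+class MyLogSourceStage(AutoencoderSourceStage):
+    """Hypothetical source stage for a custom JSON-lines log format."""
+
+    @staticmethod
+    def files_to_dfs_per_user(x: typing.List[str],
+                              userid_column_name: str,
+                              feature_columns: typing.List[str],
+                              userid_filter: str = None) -> typing.Dict[str, pd.DataFrame]:
+        # Read every matched file and group the rows by user ID so that each
+        # batch processed by the pipeline only contains a single user's logs.
+        df = pd.concat([pd.read_json(f, lines=True) for f in x])
+
+        if (userid_filter is not None):
+            df = df[df[userid_column_name] == userid_filter]
+
+        return {user_id: user_df for user_id, user_df in df.groupby(userid_column_name)}
+
+    @staticmethod
+    def derive_features(df: pd.DataFrame, feature_columns: typing.List[str]) -> pd.DataFrame:
+        # Feature engineering goes here; without an override, all input columns
+        # pass through to the next stage unchanged.
+        return df
+```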
+The following are the available CLI options for the `TrainAEStage` (train-ae): + +| Option | Description +| ----------------------| --------------------------------------------------------- +| pretrained_filename | File path to pickled user models saved from a previous training run using `--models_output_filename`. +| train_data_glob | Glob path to training data. +| source_stage_class | Source stage class so that the training stage knows how to read/parse the training data. +| train_epochs | Number of training epochs. Default is 25. +| min_train_rows | Minimum number of training rows required to train a user model. Default is 300. +| train_max_history | Maximum number of training rows per user. Default is 1000. +| seed | When not None, ensure random number generators are seeded with `seed` to control reproducibility of the user model. +| sort_glob | If true, the list of files matching `input_glob` will be processed in sorted order. Default is False. +| models_output_filename| Can be used with `--train_data_glob` to save trained user models to file using the provided file path. Models can be loaded later using `--pretrained_filename`. + +The `PreprocessAEStage` is responsible for creating a Morpheus message that contains everything needed by the inference stage. For DFP inference, this stage must pass a `MultiInferenceAEMessage` to the inference stage. Each message will correspond to a single user and include the input feature columns, the user's model and training data anomaly scores. + +**Inference stage** - `AutoEncoderInferenceStage` calculates anomaly scores (i.e. reconstruction loss) and z-scores for each user input dataset. + +**Postprocessing stage** - The DFP pipeline uses the `AddScoresStage` for postprocessing to add the anomaly scores and z-scores from the previous inference stage with matching labels. + +**Serialize stage** - `SerializeStage` is used to convert the `MultiResponseProbsMessage` from the previous stage to a `MessageMeta` to make it suitable for output (i.e. writing to file or Kafka). + +**Write stage** - `WriteToFileStage` writes the input data with inference results to an output file path. + + +## CloudTrail DFP Pipeline + +Run the following in your Morpheus container to start the CloudTrail DFP pipeline: + +``` +morpheus --log_level=DEBUG \ +run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=False \ +pipeline-ae \ +--columns_file=morpheus/data/columns_ae_cloudtrail.txt \ +--userid_column_name=userIdentitysessionContextsessionIssueruserName \ +--userid_filter=user123 \ +--feature_scaler=standard \ +from-cloudtrail \ +--input_glob=models/datasets/validation-data/dfp-cloudtrail-*-input.csv \ +--max_files=200 \ +train-ae \ +--train_data_glob=models/datasets/training-data/dfp-cloudtrail-*.csv \ +--source_stage_class=morpheus.stages.input.cloud_trail_source_stage.CloudTrailSourceStage \ +--seed=42 \ +preprocess \ +inf-pytorch \ +add-scores \ +serialize \ +to-file --filename=./cloudtrail-dfp-detections.csv --overwrite +``` + +## Duo DFP Pipeline + +This pipeline first trains user models from files in `models/datasets/training-data/duo` and saves them to a file. The pipeline then uses these models to run inference +on validation data in `models/datasets/validation-data/duo`. Inference results are written to `duo-detections.csv`. 
+``` +morpheus --log_level=DEBUG \ +run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=False \ +pipeline-ae \ +--columns_file=morpheus/data/columns_ae_duo.txt \ +--userid_column_name=username \ +--feature_scaler=standard \ +from-duo \ +--input_glob=models/datasets/validation-data/duo/*.json \ +--max_files=200 \ +monitor --description='Input rate' \ +train-ae \ +--train_data_glob=models/datasets/training-data/duo/*.json \ +--source_stage_class=morpheus.stages.input.duo_source_stage.DuoSourceStage \ +--seed=42 \ +--train_epochs=1 \ +--models_output_filename=models/dfp-models/duo_ae_user_models.pkl \ +preprocess \ +inf-pytorch \ +monitor --description='Inference rate' --unit inf \ +add-scores \ +serialize \ +to-file --filename=./duo-detections.csv --overwrite +``` + +The following example shows how we can load pre-trained user models from the file (`models/dfp-models/duo_ae_user_models.pkl`) we created in the previous example. Pipeline then uses these models to run inference on validation data in `models/datasets/validation-data/duo`. Inference results are written to `duo-detections.csv`. +``` +morpheus --log_level=DEBUG \ +run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=False \ +pipeline-ae \ +--columns_file=morpheus/data/columns_ae_duo.txt \ +--userid_column_name=username \ +--feature_scaler=standard \ +from-duo \ +--input_glob=models/datasets/validation-data/duo/*.json \ +--max_files=200 \ +monitor --description='Input rate' \ +train-ae \ +--pretrained_filename=models/dfp-models/duo_ae_user_models.pkl \ +preprocess \ +inf-pytorch \ +monitor --description='Inference rate' --unit inf \ +add-scores \ +serialize \ +to-file --filename=./duo-detections.csv --overwrite +``` + +## Azure DFP Pipeline + +First, trains user models from files in `models/datasets/training-data/azure` and saves user models to file. Pipeline then uses these models to run inference +on validation data in `models/datasets/validation-data/azure`. Inference results are written to `azure-detections.csv`. +``` +morpheus --log_level=DEBUG \ +run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=False \ +pipeline-ae \ +--columns_file=morpheus/data/columns_ae_azure.txt \ +--userid_column_name=userPrincipalName \ +--feature_scaler=standard \ +from-azure \ +--input_glob=models/datasets/validation-data/azure/*.json \ +--max_files=200 \ +train-ae \ +--train_data_glob=models/datasets/training-data/azure/*.json \ +--source_stage_class=morpheus.stages.input.azure_source_stage.AzureSourceStage \ +--seed=42 \ +--models_output_filename=models/dfp-models/azure_ae_user_models.pkl \ +preprocess \ +inf-pytorch \ +monitor --description='Inference rate' --unit inf \ +add-scores \ +serialize \ +to-file --filename=./azure-detections.csv --overwrite +``` + +The following example shows how we can load pre-trained user models from the file (`models/dfp-models/azure_ae_user_models.pkl`) we created in the previous example. Pipeline then uses these models to run inference on validation data in `models/datasets/validation-data/azure`. Inference results are written to `azure-detections.csv`. 
+``` +morpheus --log_level=DEBUG \ +run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=False \ +pipeline-ae \ +--columns_file=morpheus/data/columns_ae_azure.txt \ +--userid_column_name=userPrincipalName \ +--feature_scaler=standard \ +from-azure \ +--input_glob=models/datasets/validation-data/azure/*.json \ +--max_files=200 \ +train-ae \ +--pretrained_filename=models/dfp-models/azure_ae_user_models.pkl \ +preprocess \ +inf-pytorch \ +monitor --description='Inference rate' --unit inf \ +add-scores \ +serialize \ +to-file --filename=./azure-detections.csv --overwrite +``` + + +## Using Morpheus Python API + +The DFP pipelines can also be constructed and run via the Morpheus Python API. An [example](./run_cloudtrail_dfp.py) is included for the Cloudtrail DFP pipeline. The following are some commands to +run the example. + +Train user models from files in `models/datasets/training-data/dfp-cloudtrail-*.csv` and saves user models to file. Pipeline then uses these models to run inference on Cloudtrail validation data in `models/datasets/validation-data/dfp-cloudtrail-*-input.csv`. Inference results are written to `cloudtrail-dfp-results.csv`. +``` +python ./examples/digital_fingerprinting/run_cloudtrail_dfp.py \ + --columns_file=morpheus/data/columns_ae_cloudtrail.txt \ + --input_glob=models/datasets/validation-data/dfp-cloudtrail-*-input.csv \ + --train_data_glob=models/datasets/training-data/dfp-*.csv \ + --models_output_filename=models/dfp-models/cloudtrail_ae_user_models.pkl \ + --output_file ./cloudtrail-dfp-results.csv +``` + +Here we load pre-trained user models from the file (`models/dfp-models/cloudtrail_ae_user_models.pkl`) we created in the previous example. Pipeline then uses these models to run inference on validation data in `models/datasets/validation-data/dfp-cloudtrail-*-input.csv`. Inference results are written to `cloudtrail-dfp-results.csv`. +``` +python ./examples/digital_fingerprinting/run_cloudtrail_dfp.py \ + --columns_file=morpheus/data/columns_ae_cloudtrail.txt \ + --input_glob=models/datasets/validation-data/dfp-cloudtrail-*-input.csv \ + --pretrained_filename=models/dfp-models/cloudtrail_ae_user_models.pkl \ + --output_file=./cloudtrail-dfp-results.csv +``` diff --git a/examples/digital_fingerprinting/run_cloudtrail_dfp.py b/examples/digital_fingerprinting/starter/run_cloudtrail_dfp.py similarity index 100% rename from examples/digital_fingerprinting/run_cloudtrail_dfp.py rename to examples/digital_fingerprinting/starter/run_cloudtrail_dfp.py diff --git a/morpheus/cli/commands.py b/morpheus/cli/commands.py index c3050c2e7c..79f5b72dac 100644 --- a/morpheus/cli/commands.py +++ b/morpheus/cli/commands.py @@ -27,6 +27,7 @@ from morpheus.cli.utils import get_enum_values from morpheus.cli.utils import get_log_levels from morpheus.cli.utils import get_pipeline_from_ctx +from morpheus.cli.utils import load_labels_file from morpheus.cli.utils import parse_enum from morpheus.cli.utils import parse_log_level from morpheus.cli.utils import prepare_command @@ -340,9 +341,8 @@ def pipeline_nlp(ctx: click.Context, **kwargs): if len(labels): config.class_labels = list(labels) else: - with open(kwargs["labels_file"], "r") as lf: - config.class_labels = [x.strip() for x in lf.readlines()] - logger.debug("Loaded labels file. Current labels: [%s]", str(config.class_labels)) + config.class_labels = load_labels_file(kwargs["labels_file"]) + logger.debug("Loaded labels file. 
Current labels: [%s]", str(config.class_labels)) from morpheus.pipeline import LinearPipeline @@ -409,16 +409,14 @@ def pipeline_fil(ctx: click.Context, **kwargs): labels_file = kwargs.get("labels_file") if (labels_file is not None): - with open(labels_file, "r") as lf: - config.class_labels = [x.strip() for x in lf.readlines()] - logger.debug("Loaded labels file. Current labels: [%s]", str(config.class_labels)) + config.class_labels = load_labels_file(labels_file) + logger.debug("Loaded labels file. Current labels: [%s]", str(config.class_labels)) else: config.class_labels = list(kwargs['label']) if ("columns_file" in kwargs and kwargs["columns_file"] is not None): - with open(kwargs["columns_file"], "r") as lf: - config.fil.feature_columns = [x.strip() for x in lf.readlines()] - logger.debug("Loaded columns. Current columns: [%s]", str(config.fil.feature_columns)) + config.fil.feature_columns = load_labels_file(kwargs["columns_file"]) + logger.debug("Loaded columns. Current columns: [%s]", str(config.fil.feature_columns)) else: raise ValueError('Unable to find columns file') @@ -502,17 +500,15 @@ def pipeline_ae(ctx: click.Context, **kwargs): config.ae.use_generic_model = kwargs["use_generic_model"] if ("columns_file" in kwargs and kwargs["columns_file"] is not None): - with open(kwargs["columns_file"], "r") as lf: - config.ae.feature_columns = [x.strip() for x in lf.readlines()] - logger.debug("Loaded columns. Current columns: [%s]", str(config.ae.feature_columns)) + config.ae.feature_columns = load_labels_file(kwargs["columns_file"]) + logger.debug("Loaded columns. Current columns: [%s]", str(config.ae.feature_columns)) else: # Use a default single label config.class_labels = ["reconstruct_loss", "zscore"] if ("labels_file" in kwargs and kwargs["labels_file"] is not None): - with open(kwargs["labels_file"], "r") as lf: - config.class_labels = [x.strip() for x in lf.readlines()] - logger.debug("Loaded labels file. Current labels: [%s]", str(config.class_labels)) + config.class_labels = load_labels_file(kwargs["labels_file"]) + logger.debug("Loaded labels file. Current labels: [%s]", str(config.class_labels)) else: # Use a default single label config.class_labels = ["reconstruct_loss", "zscore"] @@ -582,9 +578,8 @@ def pipeline_other(ctx: click.Context, **kwargs): labels_file = kwargs.get("labels_file") if (labels_file is not None): - with open(labels_file, "r") as lf: - config.class_labels = [x.strip() for x in lf.readlines()] - logger.debug("Loaded labels file. Current labels: [%s]", str(config.class_labels)) + config.class_labels = load_labels_file(labels_file) + logger.debug("Loaded labels file. 
Current labels: [%s]", str(config.class_labels)) else: labels = kwargs["label"] if len(labels): diff --git a/morpheus/cli/register_stage.py b/morpheus/cli/register_stage.py index 7d273a9ef7..45e3821c61 100644 --- a/morpheus/cli/register_stage.py +++ b/morpheus/cli/register_stage.py @@ -381,14 +381,6 @@ def command_callback(ctx: click.Context, **kwargs): # Not registered, add to global registry GlobalStageRegistry.get().add_stage_info(stage_info) - import sys - import weakref - - def unregister_command(): - GlobalStageRegistry.get().remove_stage_info(stage_info) - - weakref.finalize(sys.modules[stage_class.__module__], unregister_command) - return stage_class return register_stage_inner diff --git a/morpheus/cli/utils.py b/morpheus/cli/utils.py index 889f7524bb..d36654ae7f 100644 --- a/morpheus/cli/utils.py +++ b/morpheus/cli/utils.py @@ -184,6 +184,32 @@ def parse_enum(_: click.Context, _2: click.Parameter, value: str, enum_class: ty return result +def load_labels_file(labels_file: str) -> typing.List[str]: + with open(labels_file, "r") as lf: + return [x.strip() for x in lf.readlines()] + + +def get_package_relative_file(filename: str): + # First check if the path is relative + if (not os.path.isabs(filename)): + + # See if the file exists. + does_exist = os.path.exists(filename) + + if (not does_exist): + # If it doesnt exist, then try to make it relative to the morpheus library root + morpheus_root = os.path.dirname(morpheus.__file__) + + value_abs_to_root = os.path.join(morpheus_root, filename) + + # If the file relative to our package exists, use that instead + if (os.path.exists(value_abs_to_root)): + + return value_abs_to_root + + return filename + + class MorpheusRelativePath(click.Path): """ A specialization of the `click.Path` class that falls back to using package relative paths if the file cannot be @@ -201,26 +227,13 @@ def convert(self, param: typing.Optional["click.Parameter"], ctx: typing.Optional["click.Context"]) -> typing.Any: - # First check if the path is relative - if (not os.path.isabs(value)): - - # See if the file exists. - does_exist = os.path.exists(value) - - if (not does_exist): - # If it doesnt exist, then try to make it relative to the morpheus library root - morpheus_root = os.path.dirname(morpheus.__file__) - - value_abs_to_root = os.path.join(morpheus_root, value) - - # If the file relative to our package exists, use that instead - if (os.path.exists(value_abs_to_root)): - logger.debug(("Parameter, '%s', with relative path, '%s', does not exist. " - "Using package relative location: '%s'"), - param.name, - value, - value_abs_to_root) + package_relative = get_package_relative_file(value) - return super().convert(value_abs_to_root, param, ctx) + if (package_relative != value): + logger.debug(("Parameter, '%s', with relative path, '%s', does not exist. 
" + "Using package relative location: '%s'"), + param.name, + value, + package_relative) - return super().convert(value, param, ctx) + return super().convert(package_relative, param, ctx) diff --git a/morpheus/config.py b/morpheus/config.py index 431e172f3e..54dc6ff00f 100644 --- a/morpheus/config.py +++ b/morpheus/config.py @@ -121,9 +121,11 @@ class ConfigAutoEncoder(ConfigBase): """ feature_columns: typing.List[str] = None userid_column_name: str = "userIdentityaccountId" + timestamp_column_name: str = "timestamp" userid_filter: str = None feature_scaler: AEFeatureScalar = AEFeatureScalar.STANDARD use_generic_model: bool = False + fallback_username: str = "generic_user" @dataclasses.dataclass diff --git a/morpheus/data/columns_ae_azure.txt b/morpheus/data/columns_ae_azure.txt index 071c9d2169..79120a5f2e 100644 --- a/morpheus/data/columns_ae_azure.txt +++ b/morpheus/data/columns_ae_azure.txt @@ -1,11 +1,9 @@ -locationcountryOrRegion appDisplayName -locationcity clientAppUsed -deviceDetaildisplayName deviceDetailbrowser +deviceDetaildisplayName deviceDetailoperatingSystem statusfailureReason -locincrement appincrement -logcount \ No newline at end of file +locincrement +logcount diff --git a/morpheus/data/columns_ae_duo.txt b/morpheus/data/columns_ae_duo.txt index 091e8d36a5..b653de69ba 100644 --- a/morpheus/data/columns_ae_duo.txt +++ b/morpheus/data/columns_ae_duo.txt @@ -1,8 +1,7 @@ accessdevicebrowser accessdeviceos -accessdevicelocationcity authdevicename -result reason +result locincrement logcount diff --git a/morpheus/messages/multi_ae_message.py b/morpheus/messages/multi_ae_message.py index 41f233d963..4b7068ed7d 100644 --- a/morpheus/messages/multi_ae_message.py +++ b/morpheus/messages/multi_ae_message.py @@ -27,8 +27,8 @@ class MultiAEMessage(MultiMessage): model: AutoEncoder # train_loss_scores: cp.ndarray - train_scores_mean: float - train_scores_std: float + train_scores_mean: float = 0.0 + train_scores_std: float = 1.0 def get_slice(self, start, stop): """ diff --git a/morpheus/utils/logger.py b/morpheus/utils/logger.py index c4ae9fa8b0..de8532aff1 100644 --- a/morpheus/utils/logger.py +++ b/morpheus/utils/logger.py @@ -180,3 +180,19 @@ def deprecated_stage_warning(logger, cls, name): "It has no effect and acts as a pass through stage."), cls.__name__, name) + + +def get_log_levels(): + log_levels = list(logging._nameToLevel.keys()) + + if ("NOTSET" in log_levels): + log_levels.remove("NOTSET") + + return log_levels + + +def parse_log_level(ctx, param, value): + x = logging._nameToLevel.get(value.upper(), None) + if x is None: + raise click.BadParameter('Must be one of {}. Passed: {}'.format(", ".join(logging._nameToLevel.keys()), value)) + return x diff --git a/scripts/fetch_data.py b/scripts/fetch_data.py index 917c1c29c2..693c5ed964 100755 --- a/scripts/fetch_data.py +++ b/scripts/fetch_data.py @@ -23,6 +23,7 @@ LFS_DATASETS = { 'all': '**', + 'docs': 'docs/**', 'examples': 'examples/**', 'models': 'models/**', 'tests': 'tests/**',