
DFP: Exclude unwanted columns #583

Merged
71 commits merged into branch-23.01 from david-dfp-serialize-stage
Jan 21, 2023
Changes from 67 commits
Commits
71 commits
Select commit Hold shift + click to select a range
53b1ffa
Update container version to 23.01
dagardner-nv Dec 20, 2022
afcabd3
DFP examples require dask
dagardner-nv Dec 20, 2022
0a27eec
Slice the output rather than performing a copy, and return sliced mul…
dagardner-nv Dec 21, 2022
81acb8a
Add a serialize-stage, TODO: update docs & diagrams
dagardner-nv Dec 21, 2022
0f9e791
Merge branch 'david-dfp-23.01' into david-dfp-serialize-stage
dagardner-nv Dec 21, 2022
0a90ec9
Add serializestage to notebooks, and comment the stage
dagardner-nv Dec 21, 2022
4115112
First pass at filtering on a dataframe column or a tensor
dagardner-nv Dec 21, 2022
9eff636
First pass at moving the FilterSource enum to C++
dagardner-nv Dec 21, 2022
a1ef1d2
Switch to the C++ impl of the enum
dagardner-nv Dec 21, 2022
fda041a
New constructor args
dagardner-nv Dec 21, 2022
c49cdca
Remove unused imports, add new args for FilterDetectionsStage
dagardner-nv Dec 21, 2022
796c2b0
Use field_name var
dagardner-nv Dec 21, 2022
d12d28b
wip operate on a dataframe
dagardner-nv Dec 22, 2022
4aa7db8
Give Dtype a default value allowing it to be default contrstructable
dagardner-nv Dec 27, 2022
8ec54d4
Move df column & tensor specific code to their own methods
dagardner-nv Dec 27, 2022
560ee9c
Pass new constructor arguments to CPP impl
dagardner-nv Dec 27, 2022
fdf7a15
tests
dagardner-nv Dec 27, 2022
f08c689
Parameterize copy vs slice
dagardner-nv Dec 27, 2022
567b0a1
more tests
dagardner-nv Dec 27, 2022
7ffa253
Fix cli help string, and always declare probs as a 2d array when empt…
dagardner-nv Dec 27, 2022
1903b8d
wip
dagardner-nv Dec 27, 2022
81ba67e
wip
dagardner-nv Dec 27, 2022
e8bbf29
override get_slice method
dagardner-nv Dec 27, 2022
3a10d18
Accepted type changes to MultiMessage when source is a df column, use…
dagardner-nv Dec 27, 2022
a411642
Filter stage can accept a MultiMessage when the source is a df column
dagardner-nv Dec 27, 2022
ccb646f
Use np.zeros when the dataframe is pandas
dagardner-nv Dec 27, 2022
d3b1e09
Update type hints
dagardner-nv Dec 27, 2022
319bf40
Rename test, and also test for pandas when in python mode
dagardner-nv Dec 27, 2022
cf36b0b
Ensure we are testing against both MultiMessages and MultiResponseProbs
dagardner-nv Dec 27, 2022
212316d
Use the user_id attr from the message itself
dagardner-nv Dec 28, 2022
9c63e61
check for None values
dagardner-nv Dec 28, 2022
7254e2d
Override copy_ranges
dagardner-nv Dec 28, 2022
a090196
Revert "Use the user_id attr from the message itself"
dagardner-nv Dec 28, 2022
4037394
Override copy_ranges to ensure we receive a MultiAEMessage instance w…
dagardner-nv Dec 28, 2022
9718ee0
Add filter stage to notebooks
dagardner-nv Dec 28, 2022
4aef317
Update embedded docs in notebooks
dagardner-nv Dec 28, 2022
dbfd037
Merge branch 'branch-23.01' into david-dfp-serialize-stage
dagardner-nv Dec 28, 2022
802c2a6
Update docs to include info about filter stage
dagardner-nv Dec 28, 2022
601de4f
Fix type-o
dagardner-nv Dec 28, 2022
30938c2
Document serialize stage
dagardner-nv Dec 28, 2022
a67e951
Update DFP diagrams to include filter and serialize stages
dagardner-nv Dec 28, 2022
318023b
Crop the diagram
dagardner-nv Dec 28, 2022
0797d86
Remove unused import
dagardner-nv Dec 28, 2022
cd3688d
Formatting fix
dagardner-nv Dec 28, 2022
2be23ea
Merge branch 'branch-23.01' into david-dfp-serialize-stage
dagardner-nv Dec 28, 2022
846b66c
Merge branch 'branch-23.01' into david-dfp-serialize-stage
dagardner-nv Jan 4, 2023
7a97931
Update CR year
dagardner-nv Jan 4, 2023
98e30a5
Merge branch 'branch-23.01' into david-dfp-serialize-stage
dagardner-nv Jan 6, 2023
8aeda20
Merge branch 'branch-23.01' into david-dfp-serialize-stage
dagardner-nv Jan 17, 2023
054abe3
Rename operate_on to data_source
dagardner-nv Jan 17, 2023
d2c6e01
Change casing of FilterSource::AUTO to match that of the FileTypes enum
dagardner-nv Jan 17, 2023
97391a8
Rename data_source to filter_source matching the enum type
dagardner-nv Jan 17, 2023
5037640
Move bindings for FilterSource into the common module
dagardner-nv Jan 17, 2023
0202519
Merge the file_types python module into common
dagardner-nv Jan 17, 2023
180abca
Cast to MultiTensorMessage
dagardner-nv Jan 17, 2023
f7cac94
Set accepted types to MultiResponseMessage since MultiResponseProbsMe…
dagardner-nv Jan 17, 2023
18c685a
Auto determines the filter source based on the message type
dagardner-nv Jan 18, 2023
edf7aef
fix
dagardner-nv Jan 18, 2023
5dcef3b
Move the check of m_filter_source outside of the hot path
dagardner-nv Jan 18, 2023
1a60846
Fix type-o
dagardner-nv Jan 18, 2023
c74339a
First pass at a refactored DevMemInfo
dagardner-nv Jan 18, 2023
4727191
replace the memory_resource and stream methods with a make_new_buffer…
dagardner-nv Jan 18, 2023
065c69e
wip
dagardner-nv Jan 18, 2023
f80ab01
wip
dagardner-nv Jan 18, 2023
0dfbe40
Replace usage of BufferInfo with DevMemInfo
dagardner-nv Jan 18, 2023
c830c21
Move the get methods into private class methods
dagardner-nv Jan 18, 2023
27a8f6d
Make private attrs const
dagardner-nv Jan 18, 2023
ce587d5
Remove unused variable
dagardner-nv Jan 18, 2023
2af0bd4
Remove default values for filter_source & copy
dagardner-nv Jan 18, 2023
c3732e0
Misc cleanups
dagardner-nv Jan 18, 2023
532c6d8
Fix missed arguments after data_source->filter_source rename
dagardner-nv Jan 19, 2023
@@ -390,7 +390,7 @@ pipeline.add_stage(WriteToFileStage(config, filename=results_file, overwrite=Tru

Note that we didn't specify the output format. In our example, the result file contains the extension `.jsonlines`. Morpheus will infer the output format based on the extension. At time of writing the extensions that Morpheus will infer are: `.csv`, `.json` & `.jsonlines`

To explicitly set the output format we could specify the `file_type` argument to the `WriteToFileStage` which is an enumeration defined in `morpheus._lib.file_types.FileTypes`. Current values defined are:
To explicitly set the output format we could specify the `file_type` argument to the `WriteToFileStage` which is an enumeration defined in `morpheus._lib.common.FileTypes`. Current values defined are:
* `FileTypes.Auto`
* `FileTypes.JSON`
* `FileTypes.CSV`
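Since `FileTypes.Auto` infers the format from the file extension, a minimal stdlib-only sketch of that inference may help; note that `infer_file_type` and `EXTENSION_MAP` are illustrative names only, not part of the Morpheus API (the real logic lives in the C++ layer):

```python
import os

# Hypothetical mapping of extensions to output formats; mirrors the
# extensions the docs say Morpheus can infer at time of writing.
EXTENSION_MAP = {".csv": "CSV", ".json": "JSON", ".jsonlines": "JSON"}


def infer_file_type(filename: str) -> str:
    """Return the inferred format for a filename, raising on unknown extensions."""
    ext = os.path.splitext(filename)[1].lower()
    try:
        return EXTENSION_MAP[ext]
    except KeyError:
        raise ValueError(f"Unable to infer file type for: {filename}")
```

With a custom extension the lookup fails, which is why the docs recommend setting `file_type` explicitly in that case.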
17 changes: 10 additions & 7 deletions docs/source/developer_guide/guides/5_digital_fingerprinting.md
@@ -507,7 +507,7 @@ The `DFPFileToDataFrameStage` ([examples/digital_fingerprinting/production/morph
| `c` | `morpheus.config.Config` | Morpheus config object |
| `schema` | `DataFrameInputSchema` | Schema specifying columns to load, along with any necessary renames and data type conversions |
| `filter_null` | `bool` | Optional: Whether to filter null rows after loading, by default True. |
| `file_type` | `morpheus._lib.file_types.FileTypes` (enum) | Optional: Indicates file type to be loaded. Currently supported values at time of writing are: `FileTypes.Auto`, `FileTypes.CSV`, and `FileTypes.JSON`. Default value is `FileTypes.Auto` which will infer the type based on the file extension, set this value if using a custom extension |
| `file_type` | `morpheus._lib.common.FileTypes` (enum) | Optional: Indicates file type to be loaded. Currently supported values at time of writing are: `FileTypes.Auto`, `FileTypes.CSV`, and `FileTypes.JSON`. Default value is `FileTypes.Auto` which will infer the type based on the file extension, set this value if using a custom extension |
| `parser_kwargs` | `dict` or `None` | Optional: additional keyword arguments to be passed into the `DataFrame` parser, currently this is going to be either [`pandas.read_csv`](https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html) or [`pandas.read_json`](https://pandas.pydata.org/docs/reference/api/pandas.read_json.html) |
| `cache_dir` | `str` | Optional: path to cache location, defaults to `./.cache/dfp` |

@@ -634,12 +634,15 @@ For any user without an associated model in MLflow, the model for the generic us
| `c` | `morpheus.config.Config` | Morpheus config object |
| `model_name_formatter` | `str` | Format string to control the name of models fetched from MLflow. Currently available field names are: `user_id` and `user_md5` which is an md5 hexadecimal digest as returned by [`hash.hexdigest`](https://docs.python.org/3.8/library/hashlib.html?highlight=hexdigest#hashlib.hash.hexdigest). |

#### Filter Detection Stage (`FilterDetectionsStage`)
This stage filters the output from the inference stage for any anomalous messages. Logs which exceed the specified Z-Score will be passed onto the next stage. All remaining logs which are below the threshold will be dropped. For the purposes of the DFP pipeline, this stage is configured to use the `mean_abs_z` column of the DataFrame as the filter criteria.

| Name | Type | Default | Description |
| --- | --- | --- | :-- |
| `threshold` | `float` | `0.5` | The threshold value above which logs are considered to be anomalous. The default is `0.5`, however the DFP pipeline uses a value of `2.0`. All normal logs will be filtered out and anomalous logs will be passed on. |
| `copy` | `bool` | `True` | When the `copy` argument is `True` (default), rows that meet the filter criteria are copied into a new dataframe. When `False` sliced views are used instead. This is a performance optimization, and has no functional impact. |
| `data_source` | `FilterSource` | `FilterSource.Auto` | Indicates if the filter criteria exists in an output tensor (`FilterSource.TENSOR`) or a column in a DataFrame (`FilterSource.DATAFRAME`). |
| `field_name` | `str` | `probs` | Name of the tensor (`data_source=FilterSource.TENSOR`) or DataFrame column (`data_source=FilterSource.DATAFRAME`) to use as the filter criteria. |
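The table above can be summed up as a simple per-row threshold test. A rough stdlib-only stand-in for what the stage does with `data_source=FilterSource.DATAFRAME` (plain dicts replace the real cuDF DataFrame, and `filter_rows` is a hypothetical helper, not a Morpheus API):

```python
def filter_rows(rows: list, field_name: str = "mean_abs_z", threshold: float = 2.0) -> list:
    """Keep only rows whose filter field exceeds the threshold."""
    return [row for row in rows if row.get(field_name, 0.0) > threshold]


logs = [
    {"user": "alice", "mean_abs_z": 0.7},  # below threshold, dropped
    {"user": "bob", "mean_abs_z": 3.1},    # above threshold, kept
]
anomalies = filter_rows(logs)  # -> [{"user": "bob", "mean_abs_z": 3.1}]
```

Whether the surviving rows are copied into a new DataFrame or returned as sliced views is controlled by the `copy` argument and does not change this result.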

#### Post Processing Stage (`DFPPostprocessingStage`)
The `DFPPostprocessingStage` ([examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_postprocessing_stage.py](/examples/digital_fingerprinting/production/morpheus/dfp/stages/dfp_postprocessing_stage.py)) stage filters the output from the inference stage for any anomalous messages. Logs which exceed the specified Z-Score will be passed onto the next stage. All remaining logs which are below the threshold will be dropped.

| Argument | Type | Description |
| -------- | ---- | ----------- |
| `c` | `morpheus.config.Config` | Morpheus config object |
| `z_score_threshold` | `float` | Optional, sets the threshold value above which values of `mean_abs_z` must be above in order to be considered an anomaly, default is 2.0 |
This stage adds a new `event_time` column to the DataFrame indicating the time at which Morpheus detected the anomalous messages, and replaces any `NaN` values with a string value of `'NaN'`.
(Three binary files changed — likely the updated DFP diagrams — cannot be displayed in the diff view.)
2 changes: 1 addition & 1 deletion examples/developer_guide/2_2_rabbitmq/read_simple.py
@@ -18,7 +18,7 @@

from rabbitmq_source_stage import RabbitMQSourceStage

from morpheus._lib.file_types import FileTypes
from morpheus._lib.common import FileTypes
from morpheus.config import Config
from morpheus.pipeline import LinearPipeline
from morpheus.stages.general.monitor_stage import MonitorStage
@@ -19,7 +19,7 @@
import click
from rabbitmq_source_stage import RabbitMQSourceStage

from morpheus._lib.file_types import FileTypes
from morpheus._lib.common import FileTypes
from morpheus.config import Config
from morpheus.config import CppConfig
from morpheus.pipeline import LinearPipeline
@@ -32,7 +32,7 @@

import cudf

from morpheus._lib.file_types import FileTypes
from morpheus._lib.common import FileTypes
from morpheus.config import Config
from morpheus.io.deserializers import read_file_to_df
from morpheus.pipeline.single_port_stage import SinglePortStage
@@ -26,18 +26,14 @@
from morpheus.pipeline.single_port_stage import SinglePortStage
from morpheus.pipeline.stream_pair import StreamPair

from ..messages.multi_dfp_message import DFPMessageMeta

logger = logging.getLogger("morpheus.{}".format(__name__))


class DFPPostprocessingStage(SinglePortStage):

def __init__(self, c: Config, z_score_threshold=2.0):
def __init__(self, c: Config):
super().__init__(c)

self._z_score_threshold = z_score_threshold

@property
def name(self) -> str:
return "dfp-postproc"
@@ -48,42 +44,32 @@ def supports_cpp_node(self):
def accepted_types(self) -> typing.Tuple:
return (MultiAEMessage, )

def _extract_events(self, message: MultiAEMessage):

z_scores = message.get_meta("mean_abs_z")

above_threshold_df = message.get_meta()[z_scores > self._z_score_threshold]

if (not above_threshold_df.empty):
above_threshold_df['event_time'] = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ')
above_threshold_df = above_threshold_df.replace(np.nan, 'NaN', regex=True)

return above_threshold_df

return None
def _process_events(self, message: MultiAEMessage):
# Assume that a filter stage precedes this stage
df = message.get_meta()
df['event_time'] = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ')
df.replace(np.nan, 'NaN', regex=True, inplace=True)
message.set_meta(None, df)

def on_data(self, message: MultiAEMessage):
if (not message):
if (not message or message.mess_count == 0):
return None

start_time = time.time()

extracted_events = self._extract_events(message)
self._process_events(message)

duration = (time.time() - start_time) * 1000.0

if logger.isEnabledFor(logging.DEBUG):
logger.debug("Completed postprocessing for user %s in %s ms. Event count: %s. Start: %s, End: %s",
message.meta.user_id,
duration,
0 if extracted_events is None else len(extracted_events),
message.mess_count,
message.get_meta(self._config.ae.timestamp_column_name).min(),
message.get_meta(self._config.ae.timestamp_column_name).max())

if (extracted_events is None):
return None

return DFPMessageMeta(extracted_events, user_id=message.meta.user_id)
return message

def _build_single(self, builder: mrc.Builder, input_stream: StreamPair) -> StreamPair:

@@ -93,4 +79,4 @@ def node_fn(obs: mrc.Observable, sub: mrc.Subscriber):
stream = builder.make_node_full(self.unique_name, node_fn)
builder.make_edge(input_stream[0], stream)

return stream, DFPMessageMeta
return stream, input_stream[1]
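The refactored `_process_events` above mutates the message's DataFrame in place. A rough stdlib-only equivalent of its two transformations, using a list of dicts as a stand-in for the DataFrame (the `process_events` helper is illustrative, not part of the stage):

```python
import math
from datetime import datetime


def process_events(records: list) -> list:
    """Stamp each record with an event_time and replace NaN floats with the
    string 'NaN', mirroring what _process_events does on the DataFrame."""
    event_time = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ')
    for record in records:
        record['event_time'] = event_time
        for key, value in record.items():
            if isinstance(value, float) and math.isnan(value):
                record[key] = 'NaN'
    return records
```

Because the z-score filtering now happens in the upstream `FilterDetectionsStage`, this stage only annotates whatever rows reach it.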
@@ -69,7 +69,8 @@
"from dfp.utils.file_utils import date_extractor\n",
"from dfp.utils.file_utils import iso_date_regex\n",
"\n",
"from morpheus._lib.file_types import FileTypes\n",
"from morpheus._lib.common import FileTypes\n",
"from morpheus._lib.common import FilterSource\n",
"from morpheus.cli.utils import get_package_relative_file\n",
"from morpheus.cli.utils import load_labels_file\n",
"from morpheus.config import Config\n",
@@ -78,6 +79,8 @@
"from morpheus.pipeline import LinearPipeline\n",
"from morpheus.stages.general.monitor_stage import MonitorStage\n",
"from morpheus.stages.output.write_to_file_stage import WriteToFileStage\n",
"from morpheus.stages.postprocess.filter_detections_stage import FilterDetectionsStage\n",
"from morpheus.stages.postprocess.serialize_stage import SerializeStage\n",
"from morpheus.utils.logger import configure_logging\n",
"from morpheus.utils.logger import get_log_levels\n",
"from morpheus.utils.logger import parse_log_level\n",
@@ -255,6 +258,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "bdfc59de-dea8-4e5f-98e3-98eba1e4621d",
"metadata": {},
@@ -341,16 +345,29 @@
"| --- | --- | --- | :-- |\n",
"| `model_name_formatter` | `str` | `\"\"` | A format string to use when building the model name. The model name is constructed by calling `model_name_formatter.format(user_id=user_id)`. For example, with `model_name_formatter=\"my_model-{user_id}\"` and a user ID of `\"first:last\"` would result in the model name of `\"my_model-first:last\"`. This should match the value used in `DFPMLFlowModelWriterStage` |\n",
"\n",
"### Filter Detection Stage (`FilterDetectionsStage`)\n",
"This stage filters the output from the inference stage for any anomalous messages. Logs which exceed the specified Z-Score will be passed onto the next stage. All remaining logs which are below the threshold will be dropped. For the purposes of the DFP pipeline, this stage is configured to use the `mean_abs_z` column of the DataFrame as the filter criteria.\n",
"\n",
"| Name | Type | Default | Description |\n",
"| --- | --- | --- | :-- |\n",
"| `threshold` | `float` | `0.5` | The threshold value above which logs are considered to be anomalous. The default is `0.5`, however the DFP pipeline uses a value of `2.0`. All normal logs will be filtered out and anomalous logs will be passed on. |\n",
"| `copy` | `bool` | `True` | When the `copy` argument is `True` (default), rows that meet the filter criteria are copied into a new dataframe. When `False` sliced views are used instead. This is a performance optimization, and has no functional impact. |\n",
"| `data_source` | `FilterSource` | `FilterSource.Auto` | Indicates if the filter criteria exists in an output tensor (`FilterSource.TENSOR`) or a column in a DataFrame (`FilterSource.DATAFRAME`). |\n",
"| `field_name` | `str` | `probs` | Name of the tensor (`data_source=FilterSource.TENSOR`) or DataFrame column (`data_source=FilterSource.DATAFRAME`) to use as the filter criteria. |\n",
"\n",
"### Post Processing Stage (`DFPPostprocessingStage`)\n",
"This stage adds a new `event_time` column to the DataFrame indicating the time which Morpheus detected the anomalous messages, and replaces any `NAN` values with the a string value of `'NaN'`.\n",
"\n",
"This stage filters the output from the inference stage for any anomalous messages. Logs which exceed the specified Z-Score will be passed onto the next stage. All remaining logs which are below the threshold will be dropped.\n",
"### Serialize Stage (`SerializeStage`)\n",
"This stage controls which columns in the DataFrame will be included in the output. For the purposes of the DFP pipeline, we will exclude columns that are used internally by the pipeline which are not of interest to the end-user.\n",
"\n",
"| Name | Type | Default | Description |\n",
"| --- | --- | --- | :-- |\n",
"| `z_score_threshold` | `float` | `2.0` | The Z-Score used to separate anomalous logs from normal logs. All normal logs will be filterd out and anomalous logs will be passed on. |\n",
"| `include` | List of `str` | `[]` | List of regular expression patterns matching columns to include in the output. Specifying an empty list causes all columns to be included not explicitly excluded. |\n",
"| `exclude` | List of `str` | `[r'^ID$', r'^_ts_']` | List of regular expression patterns matching columns to exclude from the output. |\n",
"| `fixed_columns` | `bool` | `True` | When `True` it is assumed that the Dataframe in all messages contain the same columns as the first message received. |\n",
"\n",
"### Write to File Stage (`WriteToFileStage`)\n",
"\n",
"This final stage will write all received messages to a single output file in either CSV or JSON format.\n",
"\n",
"| Name | Type | Default | Description |\n",
@@ -413,7 +430,12 @@
"pipeline.add_stage(DFPInferenceStage(config, model_name_formatter=model_name_formatter))\n",
"\n",
"# Filter for only the anomalous logs\n",
"pipeline.add_stage(DFPPostprocessingStage(config, z_score_threshold=2.0))\n",
"pipeline.add_stage(\n",
" FilterDetectionsStage(config, threshold=2.0, data_source=FilterSource.DATAFRAME, field_name='mean_abs_z'))\n",
"pipeline.add_stage(DFPPostprocessingStage(config))\n",
"\n",
"# Exclude the columns we don't want in our output\n",
"pipeline.add_stage(SerializeStage(config, exclude=['batch_count', 'origin_hash', '_row_hash', '_batch_id']))\n",
"\n",
"# Write all anomalies to a CSV file\n",
"pipeline.add_stage(WriteToFileStage(config, filename=\"dfp_detections_azure.csv\", overwrite=True))\n",
@@ -433,7 +455,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.2 64-bit",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -447,7 +469,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.2"
"version": "3.9.2 (default, Feb 28 2021, 17:03:44) \n[GCC 10.2.1 20210110]"
},
"vscode": {
"interpreter": {
@@ -45,7 +45,8 @@
from dfp.utils.file_utils import date_extractor
from dfp.utils.file_utils import iso_date_regex

from morpheus._lib.file_types import FileTypes
from morpheus._lib.common import FileTypes
from morpheus._lib.common import FilterSource
from morpheus.cli.utils import get_package_relative_file
from morpheus.cli.utils import load_labels_file
from morpheus.config import Config
@@ -54,6 +55,8 @@
from morpheus.pipeline import LinearPipeline
from morpheus.stages.general.monitor_stage import MonitorStage
from morpheus.stages.output.write_to_file_stage import WriteToFileStage
from morpheus.stages.postprocess.filter_detections_stage import FilterDetectionsStage
from morpheus.stages.postprocess.serialize_stage import SerializeStage
from morpheus.utils.logger import configure_logging
from morpheus.utils.logger import get_log_levels
from morpheus.utils.logger import parse_log_level
@@ -305,7 +308,12 @@ def run_pipeline(train_users,
pipeline.add_stage(MonitorStage(config, description="Inference rate", smoothing=0.001))

# Filter for only the anomalous logs
pipeline.add_stage(DFPPostprocessingStage(config, z_score_threshold=2.0))
pipeline.add_stage(
FilterDetectionsStage(config, threshold=2.0, data_source=FilterSource.DATAFRAME, field_name='mean_abs_z'))
pipeline.add_stage(DFPPostprocessingStage(config))

# Exclude the columns we don't want in our output
pipeline.add_stage(SerializeStage(config, exclude=['batch_count', 'origin_hash', '_row_hash', '_batch_id']))

# Write all anomalies to a CSV file
pipeline.add_stage(WriteToFileStage(config, filename="dfp_detections_azure.csv", overwrite=True))
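The `SerializeStage` call above drops internal bookkeeping columns via regular-expression patterns. A rough sketch of that include/exclude column selection (the `select_columns` helper is hypothetical, not the actual Morpheus implementation):

```python
import re


def select_columns(columns: list, include: list = None,
                   exclude: tuple = (r'^ID$', r'^_ts_')) -> list:
    """Return the columns to serialize: if include patterns are given, a
    column must match one; any column matching an exclude pattern is dropped."""
    include_res = [re.compile(p) for p in (include or [])]
    exclude_res = [re.compile(p) for p in exclude]
    kept = []
    for col in columns:
        if include_res and not any(r.search(col) for r in include_res):
            continue
        if any(r.search(col) for r in exclude_res):
            continue
        kept.append(col)
    return kept


kept = select_columns(["user", "mean_abs_z", "_batch_id", "origin_hash"],
                      exclude=[r"^_", r"^origin_hash$"])
# kept == ["user", "mean_abs_z"]
```

With an empty `include` list, every column not explicitly excluded passes through, which matches the documented default behavior.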
@@ -69,7 +69,7 @@
"from dfp.utils.file_utils import date_extractor\n",
"from dfp.utils.file_utils import iso_date_regex\n",
"\n",
"from morpheus._lib.file_types import FileTypes\n",
"from morpheus._lib.common import FileTypes\n",
"from morpheus.cli.utils import get_package_relative_file\n",
"from morpheus.cli.utils import load_labels_file\n",
"from morpheus.config import Config\n",