Create tests for examples with custom stages (#885)
This PR creates at least one test for each example containing custom stages. It currently covers only those examples that do not require additional packages.
Part of #849.

* Moves the BERT vocabulary files to the `morpheus/data` dir, so they no longer need to be fetched from LFS and are available to unit tests.
* Fixes type hints and removes a redundant method in `examples/log_parsing/inference.py`.
* Removes redundant copies of the `bert-base-cased-hash.txt` and `bert-base-uncased-hash.txt` files, replacing them with symlinks to the files in the `morpheus/data` dir. Fixes #850.
* Explicitly sets `encoding='UTF-8'` in `examples/log_parsing/postprocessing.py` as a workaround for issue #859 (a sketch of the failure mode follows the `postprocessing.py` diff below).
* Adds `py::kw_only` to the Python bindings for `TensorMemory` and its subclasses to ensure parity with the Python implementations (first sketch after this list).
* Sets `repr=False` for the `tensors` field of `TensorMemory`, avoiding a bug when printing caused by the fact that the value is assigned to `self._tensors` (second sketch after this list).
* Seeds CuPy's random number generator in the `manual_seed` method (third sketch after this list).
* Fixes usage of the `reload_modules` fixture; requesting a reload of multiple modules should be done with `@pytest.mark.reload_modules([mod1, mod2])`, not by calling `reload_modules` twice (fourth sketch after this list).
* New test data in `tests/tests_data/log_parsing` is based on the first 5 rows of `models/datasets/validation-data/log-parsing-validation-data-input.csv`.
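A minimal sketch of the keyword-only parity the `py::kw_only` change enforces, assuming the `TensorMemory` interface as of this PR (the tensor name and shapes are illustrative):

```python
# Sketch only: shows the calling convention py::kw_only enforces on the C++
# bindings so they match the Python TensorMemory implementation.
import cupy as cp

from morpheus.messages import TensorMemory

tensors = {"input_ids": cp.zeros((5, 256), dtype=cp.int32)}

mem = TensorMemory(count=5, tensors=tensors)  # OK: keyword arguments
# TensorMemory(5, tensors)  # TypeError once the bindings are kw_only, matching Python
```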
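Why `repr=False` matters, as a standalone illustration rather than the Morpheus implementation: the dataclass declares a `tensors` field, but the real value lives on `self._tensors`, so excluding the field from the generated `__repr__` sidesteps the mismatch:

```python
import dataclasses

@dataclasses.dataclass
class MemoryLike:  # illustrative stand-in for TensorMemory
    count: int
    # repr=False keeps dataclasses' generated __repr__ from rendering this
    # field; in the real class the value is stored on self._tensors behind a
    # property, which is what made printing misbehave.
    tensors: dict = dataclasses.field(default=None, repr=False)

print(MemoryLike(count=5, tensors={}))  # -> MemoryLike(count=5)
```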
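The RNG seeding change, sketched as a `manual_seed`-style helper; the non-CuPy seeds are assumptions about what such a helper typically covers, while the `cp.random.seed` line is the call this PR adds:

```python
import random

import cupy as cp
import numpy as np

def manual_seed(seed: int):
    """Seed every RNG the tests may touch so runs are reproducible."""
    random.seed(seed)
    np.random.seed(seed)
    cp.random.seed(seed)  # the CuPy seeding added by this PR
```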
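And the corrected `reload_modules` usage: a single marker carrying both modules rather than two separate reload requests (the module names and the `usefixtures` pairing are illustrative of how the fixture is typically requested):

```python
import pytest

# Illustrative modules; substitute whatever a given test needs reloaded.
from examples.log_parsing import inference
from examples.log_parsing import postprocessing

@pytest.mark.reload_modules([inference, postprocessing])  # one marker, both modules
@pytest.mark.usefixtures("reload_modules")
def test_log_parsing_pipeline():
    ...
```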

Authors:
  - David Gardner (https://github.com/dagardner-nv)

Approvers:
  - Michael Demoret (https://github.com/mdemoret-nv)

URL: #885
dagardner-nv authored Apr 28, 2023
1 parent 339a71f commit 18fcbce
Showing 41 changed files with 31,280 additions and 246,568 deletions.
**`ci/scripts/github/build.sh`** — 2 additions, 1 deletion

```diff
@@ -46,7 +46,8 @@ sccache --show-stats
 rapids-logger "Archiving results"
 tar cfj "${WORKSPACE_TMP}/wheel.tar.bz" build/dist
 
-MORPHEUS_LIBS=($(find ${MORPHEUS_ROOT}/build/morpheus/_lib -name "*.so" -exec realpath --relative-to ${MORPHEUS_ROOT} {} \;))
+MORPHEUS_LIBS=($(find ${MORPHEUS_ROOT}/build/morpheus/_lib -name "*.so" -exec realpath --relative-to ${MORPHEUS_ROOT} {} \;) \
+               $(find ${MORPHEUS_ROOT}/examples -name "*.so" -exec realpath --relative-to ${MORPHEUS_ROOT} {} \;))
 tar cfj "${WORKSPACE_TMP}/morhpeus_libs.tar.bz" "${MORPHEUS_LIBS[@]}"
 
 CPP_TESTS=($(find ${MORPHEUS_ROOT}/build/morpheus/_lib/tests -name "*.x" -exec realpath --relative-to ${MORPHEUS_ROOT} {} \;))
```
**`examples/log_parsing/README.md`** — 2 additions, 2 deletions

```diff
@@ -63,7 +63,7 @@ python run.py \
     --num_threads 1 \
     --input_file ${MORPHEUS_ROOT}/models/datasets/validation-data/log-parsing-validation-data-input.csv \
     --output_file ./log-parsing-output.jsonlines \
-    --model_vocab_hash_file=${MORPHEUS_ROOT}/models/training-tuning-scripts/sid-models/resources/bert-base-cased-hash.txt \
+    --model_vocab_hash_file=${MORPHEUS_ROOT}/morpheus/data/bert-base-cased-hash.txt \
     --model_vocab_file=${MORPHEUS_ROOT}/models/training-tuning-scripts/sid-models/resources/bert-base-cased-vocab.txt \
     --model_seq_length=256 \
     --model_name log-parsing-onnx \
@@ -114,7 +114,7 @@ morpheus --log_level INFO \
   pipeline-nlp \
   from-file --filename ./models/datasets/validation-data/log-parsing-validation-data-input.csv \
   deserialize \
-  preprocess --vocab_hash_file ./models/training-tuning-scripts/sid-models/resources/bert-base-cased-hash.txt --stride 64 --column=raw \
+  preprocess --vocab_hash_file ${MORPHEUS_ROOT}/morpheus/data/bert-base-cased-hash.txt --stride 64 --column=raw \
   monitor --description "Preprocessing rate" \
   inf-logparsing --model_name log-parsing-onnx --server_url localhost:8001 --force_convert_inputs=True \
   monitor --description "Inference rate" --unit inf \
```
**`examples/log_parsing/inference.py`** — 6 additions, 30 deletions

```diff
@@ -30,12 +30,10 @@
 from morpheus.cli.register_stage import register_stage
 from morpheus.config import Config
 from morpheus.config import PipelineModes
-from morpheus.messages import InferenceMemory
 from morpheus.messages import MultiInferenceMessage
 from morpheus.pipeline.stream_pair import StreamPair
 from morpheus.stages.inference.inference_stage import InferenceStage
 from morpheus.stages.inference.inference_stage import InferenceWorker
-from morpheus.stages.inference.triton_inference_stage import InputWrapper
 from morpheus.stages.inference.triton_inference_stage import _TritonInferenceWorker
 from morpheus.utils.producer_consumer_queue import ProducerConsumerQueue
 
@@ -97,7 +95,7 @@ def default_inout_mapping(cls) -> typing.Dict[str, str]:
         # Some models use different names for the same thing. Set that here but allow user customization
         return {"attention_mask": "input_mask"}
 
-    def build_output_message(self, x: MultiInferenceMessage) -> MultiResponseLogParsingMessage:
+    def build_output_message(self, x: MultiInferenceMessage) -> MultiPostprocLogParsingMessage:
 
         memory = PostprocMemoryLogParsing(
             count=x.count,
@@ -111,7 +109,7 @@ def build_output_message(self, x: MultiInferenceMessage) -> MultiResponseLogParsingMessage:
             mess_offset=x.mess_offset,
             mess_count=x.mess_count,
             memory=memory,
-            offset=x.offset,
+            offset=0,
             count=x.count)
         return output_message
 
@@ -131,25 +129,6 @@ def _build_response(self, batch: MultiInferenceMessage,
 
         return mem
 
-    def _infer_callback(self,
-                        cb: typing.Callable[[ResponseMemoryLogParsing], None],
-                        m: InputWrapper,
-                        b: MultiInferenceMessage,
-                        result: tritonclient.InferResult,
-                        error: tritonclient.InferenceServerException):
-
-        # If its an error, return that here
-        if (error is not None):
-            raise error
-
-        # Build response
-        response_mem = self._build_response(b, result)
-
-        # Call the callback with the memory
-        cb(response_mem)
-
-        self._mem_pool.return_obj(m)
-
 
 @register_stage("inf-logparsing", modes=[PipelineModes.NLP])
 class LogParsingInferenceStage(InferenceStage):
@@ -261,7 +240,9 @@ def set_output_fut(resp: ResponseMemoryLogParsing, b, f: mrc.Future):
         return stream, out_type
 
     @staticmethod
-    def _convert_one_response(memory: InferenceMemory, inf: MultiInferenceMessage, res: ResponseMemoryLogParsing):
+    def _convert_one_response(memory: PostprocMemoryLogParsing,
+                              inf: MultiInferenceMessage,
+                              res: ResponseMemoryLogParsing):
 
         memory.input_ids[inf.offset:inf.count + inf.offset, :] = inf.input_ids
         memory.seq_ids[inf.offset:inf.count + inf.offset, :] = inf.seq_ids
@@ -280,12 +261,7 @@ def _convert_one_response(memory: InferenceMemory, inf: MultiInferenceMessage, res: ResponseMemoryLogParsing):
             memory.confidences[idx, :] = cp.maximum(memory.confidences[idx, :], res.confidences[i, :])
             memory.labels[idx, :] = cp.maximum(memory.labels[idx, :], res.labels[i, :])
 
-        return MultiPostprocLogParsingMessage(meta=inf.meta,
-                                              mess_offset=inf.mess_offset,
-                                              mess_count=inf.mess_count,
-                                              memory=memory,
-                                              offset=inf.offset,
-                                              count=inf.count)
+        return MultiPostprocLogParsingMessage.from_message(inf, memory=memory, offset=inf.offset, count=inf.mess_count)
 
     def _get_inference_worker(self, inf_queue: ProducerConsumerQueue) -> InferenceWorker:
```
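The `from_message` call in the final hunk above replaces a six-argument constructor; below is a toy mirror of its copy-with-overrides semantics, not the Morpheus implementation:

```python
import dataclasses

@dataclasses.dataclass
class Msg:  # stand-in for the Multi*Message classes in this example
    meta: object
    mess_offset: int
    mess_count: int
    offset: int = 0
    count: int = 0

    @classmethod
    def from_message(cls, src: "Msg", **overrides):
        # Copy every field from the source message, then apply overrides, so
        # only the fields that differ need to be spelled out at the call site.
        fields = {f.name: getattr(src, f.name) for f in dataclasses.fields(src)}
        fields.update(overrides)
        return cls(**fields)

src = Msg(meta="df", mess_offset=0, mess_count=5, count=5)
out = Msg.from_message(src, offset=0, count=src.mess_count)  # meta et al. copied
```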
**`examples/log_parsing/postprocessing.py`** — 5 additions, 2 deletions

```diff
@@ -54,11 +54,14 @@ def __init__(self, c: Config, vocab_path: pathlib.Path, model_config_path: pathlib.Path):
         self._model_config_path = model_config_path
 
         self._vocab_lookup = {}
-        with open(vocab_path) as f:
+
+        # Explicitly setting the encoding, we know we have unicode chars in this file and we need to avoid issue:
+        # https://github.com/nv-morpheus/Morpheus/issues/859
+        with open(vocab_path, encoding='UTF-8') as f:
             for index, line in enumerate(f):
                 self._vocab_lookup[index] = line.split()[0]
 
-        with open(model_config_path) as f:
+        with open(model_config_path, encoding='UTF-8') as f:
             config = json.load(f)
 
         self._label_map = {int(k): v for k, v in config["id2label"].items()}
```