diff --git a/module-5/.gitignore b/module-5/.gitignore
new file mode 100644
index 0000000..a21fd91
--- /dev/null
+++ b/module-5/.gitignore
@@ -0,0 +1 @@
+.lock-file
diff --git a/module-5/README.md b/module-5/README.md
index 401b3d4..3500079 100644
--- a/module-5/README.md
+++ b/module-5/README.md
@@ -33,7 +33,7 @@ k9s -A
 ```
-export WANDB_API_KEY='put your key'
+export WANDB_API_KEY='your key here'
 ```
@@ -86,23 +86,11 @@ pytest -ss ./tests
 # Triton Inference Server
-
-## PyTriton
-
 ```
-docker run -v $PWD:/dev_data --shm-size=1g --ulimit memlock=-1 --net=host --ulimit stack=67108864 -ti nvcr.io/nvidia/tritonserver:23.11-vllm-python-py3 /bin/bash
-
-pip install -r /dev_data/requirements.txt
-export WANDB_API_KEY=cb86168a2e8db7edb905da69307450f5e7867d66
-
-tritonserver --http-port 5000 --model-repository /dev_data/triton-python-example/
-
+make run_pytriton
 ```
-
-
-
 # LLMs
diff --git a/module-5/serving/pytriton_client.py b/module-5/serving/pytriton_client.py
index c591836..c01524e 100644
--- a/module-5/serving/pytriton_client.py
+++ b/module-5/serving/pytriton_client.py
@@ -4,16 +4,12 @@
 # https://triton-inference-server.github.io/pytriton/latest/clients/
 def main():
-    sequence = np.array([
-        ["one day I will see the world"],
-    ])
-    sequence = np.char.encode(sequence, "utf-8")
+    text = np.array([["one day I will see the world"],])
+    text = np.char.encode(text, "utf-8")
-    with ModelClient("0.0.0.0", "BART") as client:
-        result_dict = client.infer_batch(sequence)
-        for output_name, output_data in result_dict.items():
-            output_data = np.array2string(output_data, threshold=np.inf, max_line_width=np.inf, separator=",").replace("\n", "")
-            print(f"{output_name}: {output_data}.")
+    with ModelClient("0.0.0.0", "predictor_a") as client:
+        result_dict = client.infer_batch(text=text)
+        print(result_dict['probs'])
 if __name__ == "__main__":
diff --git a/module-5/serving/pytriton_serving.py b/module-5/serving/pytriton_serving.py
index f49b62f..4f23a7a 100644
--- a/module-5/serving/pytriton_serving.py
+++ b/module-5/serving/pytriton_serving.py
@@ -1,7 +1,6 @@
 import logging
 import numpy as np
-from transformers import pipeline
 from pytriton.decorators import batch
 from pytriton.model_config import ModelConfig, Tensor
@@ -14,31 +13,16 @@
 predictor = Predictor.default_from_model_registry()
-# Labels pre-cached on server side
-LABELS = [
-    "travel",
-    "cooking",
-    "dancing",
-    "sport",
-    "music",
-    "entertainment",
-    "festival",
-    "movie",
-    "literature",
-]
-
 @batch
-def _infer_fn(sequence: np.ndarray):
-    sequence = np.char.decode(sequence.astype("bytes"), "utf-8")
-    sequence = sequence.tolist()[0]
+def _infer_fn(text: np.ndarray):
+    text = np.char.decode(text.astype("bytes"), "utf-8")
+    text = text.tolist()[0]
-    logger.info(f"sequence = {sequence}")
-    results = predictor.predict(text=sequence)
+    logger.info(f"sequence = {text}")
+    results = predictor.predict(text=text)
     logger.info(f"results = {results}")
-
-    result_labels = ['travel' for _ in range(len(sequence))]
-    return {"label": np.char.encode(result_labels, "utf-8")}
+    return [results]
 def main():
@@ -46,10 +30,10 @@ def main():
     with Triton() as triton:
         logger.info("Loading BART model.")
         triton.bind(
-            model_name="BART",
+            model_name="predictor_a",
             infer_func=_infer_fn,
-            inputs=[Tensor(name="sequence", dtype=bytes, shape=(-1,)),],
-            outputs=[Tensor(name="label", dtype=bytes, shape=(1,)),],
+            inputs=[Tensor(name="text", dtype=bytes, shape=(-1,)),],
+            outputs=[Tensor(name="probs", dtype=np.float32, shape=(-1,)),],
             config=ModelConfig(max_batch_size=1),
         )
         logger.info("Serving inference")
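For reference, the rebound `_infer_fn` above depends on a `Predictor` object pulled from the model registry. The sketch below is only an illustration of the interface the serving code appears to assume (a `default_from_model_registry()` constructor and a `predict(text=...)` call returning a float32 probability array that lines up with the `probs` output tensor); it is not the repository's actual `Predictor` implementation.

```python
# Illustrative sketch only: the interface pytriton_serving.py appears to
# assume from Predictor; the real class lives elsewhere in this repo.
import numpy as np


class Predictor:
    @classmethod
    def default_from_model_registry(cls) -> "Predictor":
        # Assumed to load the registered model weights (e.g. from W&B)
        # and return a ready-to-use predictor instance.
        raise NotImplementedError

    def predict(self, text: list[str]) -> np.ndarray:
        # Assumed to return a float32 array of class probabilities, one row
        # per input string, matching the "probs" output tensor bound above.
        raise NotImplementedError
```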